github.com/torch/torch7.git
author    Soumith Chintala <soumith@gmail.com>  2017-01-01 20:34:46 +0300
committer GitHub <noreply@github.com>  2017-01-01 20:34:46 +0300
commit    306984ff623886b8c07c8934623bd94fb52ed983 (patch)
tree      75b824745614ef5a84d684dd9321981914e4930b
parent    d41580eccefcbc1d11d404e0c4ae522560f8e263 (diff)
parent    be986fb5d8611e9cb158b69d0977564e41eb93f2 (diff)

Merge branch 'master' into contiguous-cat-1d
-rw-r--r--  FFI.lua                           | 145
-rw-r--r--  Storage.c                         |   3
-rw-r--r--  Tensor.c                          |   3
-rw-r--r--  Tensor.lua                        |  18
-rw-r--r--  TensorMath.lua                    |  65
-rw-r--r--  Tester.lua                        |   1
-rwxr-xr-x  doc/maths.md                      |  37
-rw-r--r--  generic/Storage.c                 |   4
-rw-r--r--  generic/Tensor.c                  |  49
-rw-r--r--  generic/TensorOperator.c          |  10
-rw-r--r--  generic/luaG.h                    |  30
-rw-r--r--  init.c                            |   7
-rw-r--r--  lib/TH/CMakeLists.txt             |  31
-rw-r--r--  lib/TH/THDiskFile.c               |  12
-rw-r--r--  lib/TH/THFile.c                   |   3
-rw-r--r--  lib/TH/THFile.h                   |   7
-rw-r--r--  lib/TH/THFilePrivate.h            |   7
-rw-r--r--  lib/TH/THGenerateAllTypes.h       |   1
-rw-r--r--  lib/TH/THGenerateHalfType.h       |  19
-rw-r--r--  lib/TH/THHalf.c                   | 138
-rw-r--r--  lib/TH/THHalf.h                   |  40
-rw-r--r--  lib/TH/THMemoryFile.c             |  14
-rw-r--r--  lib/TH/THStorage.c                |   6
-rw-r--r--  lib/TH/THStorage.h                |   6
-rw-r--r--  lib/TH/THTensor.c                 |   6
-rw-r--r--  lib/TH/THTensor.h                 |   6
-rw-r--r--  lib/TH/THVector.c                 |   5
-rw-r--r--  lib/TH/generic/THBlas.c           |  11
-rw-r--r--  lib/TH/generic/THStorageCopy.c    |  47
-rw-r--r--  lib/TH/generic/THStorageCopy.h    |   1
-rw-r--r--  lib/TH/generic/THTensorConv.c     |   6
-rw-r--r--  lib/TH/generic/THTensorConv.h     |   3
-rw-r--r--  lib/TH/generic/THTensorCopy.c     |  26
-rw-r--r--  lib/TH/generic/THTensorCopy.h     |   1
-rw-r--r--  lib/TH/generic/THTensorMath.c     |  50
-rw-r--r--  lib/TH/generic/THTensorMath.h     |   4
-rw-r--r--  lib/TH/generic/THVectorDispatch.c |  31
-rw-r--r--  lib/TH/generic/simd/simd.h        |  57
-rw-r--r--  lib/TH/vector/VSX.c               | 1915
-rw-r--r--  test/test.lua                     | 173
-rw-r--r--  test/test_half.lua                |  55
-rw-r--r--  test/test_writeObject.lua         |  11
-rw-r--r--  torchcwrap.lua                    |  55
43 files changed, 2899 insertions(+), 220 deletions(-)
diff --git a/FFI.lua b/FFI.lua
index 3cc0b21..333e7b5 100644
--- a/FFI.lua
+++ b/FFI.lua
@@ -15,6 +15,7 @@ local function checkArgumentType(expected, actual, fn, ud, level)
end
if ok then
+
local Real2real = {
Byte='unsigned char',
Char='char',
@@ -22,7 +23,8 @@ if ok then
Int='int',
Long='long',
Float='float',
- Double='double'
+ Double='double',
+ Half='THHalf'
}
-- Allocator
@@ -34,6 +36,14 @@ typedef struct THAllocator {
} THAllocator;
]]
+ -- Half
+ ffi.cdef[[
+typedef struct {
+ unsigned short x;
+} __THHalf;
+typedef __THHalf THHalf;
+]]
+
-- Storage
for Real, real in pairs(Real2real) do
@@ -76,7 +86,7 @@ typedef struct THRealTensor
long *size;
long *stride;
int nDimension;
-
+
THRealStorage *storage;
ptrdiff_t storageOffset;
int refcount;
@@ -88,7 +98,8 @@ typedef struct THRealTensor
cdefs = cdefs:gsub('Real', Real):gsub('real', real)
ffi.cdef(cdefs)
- local Tensor = torch.getmetatable(string.format('torch.%sTensor', Real))
+ local Tensor_type = string.format('torch.%sTensor', Real)
+ local Tensor = torch.getmetatable(Tensor_type)
local Tensor_tt = ffi.typeof('TH' .. Real .. 'Tensor**')
rawset(Tensor,
@@ -107,75 +118,77 @@ typedef struct THRealTensor
end)
-- faster apply (contiguous case)
- local apply = Tensor.apply
- rawset(Tensor,
- "apply",
- function(self, func)
- if self:isContiguous() and self.data then
- local self_d = self:data()
- for i=0,self:nElement()-1 do
- local res = func(tonumber(self_d[i])) -- tonumber() required for long...
- if res then
- self_d[i] = res
+ if Tensor_type ~= 'torch.HalfTensor' then
+ local apply = Tensor.apply
+ rawset(Tensor,
+ "apply",
+ function(self, func)
+ if self:isContiguous() and self.data then
+ local self_d = self:data()
+ for i=0,self:nElement()-1 do
+ local res = func(tonumber(self_d[i])) -- tonumber() required for long...
+ if res then
+ self_d[i] = res
+ end
end
+ return self
+ else
+ return apply(self, func)
end
- return self
- else
- return apply(self, func)
- end
- end)
-
- -- faster map (contiguous case)
- local map = Tensor.map
- rawset(Tensor,
- "map",
- function(self, src, func)
- checkArgument(torch.isTensor(src), "map", 1, "tensor expected")
- checkArgumentType(self:type(), src:type(), "map", 1)
-
- if self:isContiguous() and src:isContiguous() and self.data and src.data then
- local self_d = self:data()
- local src_d = src:data()
- assert(src:nElement() == self:nElement(), 'size mismatch')
- for i=0,self:nElement()-1 do
- local res = func(tonumber(self_d[i]), tonumber(src_d[i])) -- tonumber() required for long...
- if res then
- self_d[i] = res
+ end)
+
+ -- faster map (contiguous case)
+ local map = Tensor.map
+ rawset(Tensor,
+ "map",
+ function(self, src, func)
+ checkArgument(torch.isTensor(src), "map", 1, "tensor expected")
+ checkArgumentType(self:type(), src:type(), "map", 1)
+
+ if self:isContiguous() and src:isContiguous() and self.data and src.data then
+ local self_d = self:data()
+ local src_d = src:data()
+ assert(src:nElement() == self:nElement(), 'size mismatch')
+ for i=0,self:nElement()-1 do
+ local res = func(tonumber(self_d[i]), tonumber(src_d[i])) -- tonumber() required for long...
+ if res then
+ self_d[i] = res
+ end
end
+ return self
+ else
+ return map(self, src, func)
end
- return self
- else
- return map(self, src, func)
- end
- end)
-
- -- faster map2 (contiguous case)
- local map2 = Tensor.map2
- rawset(Tensor,
- "map2",
- function(self, src1, src2, func)
- checkArgument(torch.isTensor(src1), "map", 1, "tensor expected")
- checkArgument(torch.isTensor(src2), "map", 2, "tensor expected")
- checkArgumentType(self:type(), src1:type(), "map", 1)
- checkArgumentType(self:type(), src2:type(), "map", 2)
-
- if self:isContiguous() and src1:isContiguous() and src2:isContiguous() and self.data and src1.data and src2.data then
- local self_d = self:data()
- local src1_d = src1:data()
- local src2_d = src2:data()
- assert(src1:nElement() == self:nElement(), 'size mismatch')
- assert(src2:nElement() == self:nElement(), 'size mismatch')
- for i=0,self:nElement()-1 do
- local res = func(tonumber(self_d[i]), tonumber(src1_d[i]), tonumber(src2_d[i])) -- tonumber() required for long...
- if res then
- self_d[i] = res
+ end)
+
+ -- faster map2 (contiguous case)
+ local map2 = Tensor.map2
+ rawset(Tensor,
+ "map2",
+ function(self, src1, src2, func)
+ checkArgument(torch.isTensor(src1), "map", 1, "tensor expected")
+ checkArgument(torch.isTensor(src2), "map", 2, "tensor expected")
+ checkArgumentType(self:type(), src1:type(), "map", 1)
+ checkArgumentType(self:type(), src2:type(), "map", 2)
+
+ if self:isContiguous() and src1:isContiguous() and src2:isContiguous() and self.data and src1.data and src2.data then
+ local self_d = self:data()
+ local src1_d = src1:data()
+ local src2_d = src2:data()
+ assert(src1:nElement() == self:nElement(), 'size mismatch')
+ assert(src2:nElement() == self:nElement(), 'size mismatch')
+ for i=0,self:nElement()-1 do
+ local res = func(tonumber(self_d[i]), tonumber(src1_d[i]), tonumber(src2_d[i])) -- tonumber() required for long...
+ if res then
+ self_d[i] = res
+ end
end
+ return self
+ else
+ return map2(self, src1, src2, func)
end
- return self
- else
- return map2(self, src1, src2, func)
- end
- end)
+ end)
+ end
end
-- torch.data
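
The FFI override above replaces `apply`, `map`, and `map2` with versions that walk the raw storage buffer whenever the tensors involved are contiguous, falling back to the original stride-aware implementations otherwise; `torch.HalfTensor` is skipped because its elements are `THHalf` structs that `tonumber()` cannot convert. A minimal C sketch of the contiguous fast path (all names illustrative, not TH API):

```c
/* When a tensor's storage is contiguous, apply() needs no stride
 * bookkeeping and reduces to a flat loop over the raw buffer. */
#include <stdio.h>

static void apply_contiguous(double *data, long n, double (*f)(double)) {
  for (long i = 0; i < n; i++)
    data[i] = f(data[i]);
}

static double square(double x) { return x * x; }

int main(void) {
  double storage[6] = {1, 2, 3, 4, 5, 6}; /* a contiguous 2x3 tensor */
  apply_contiguous(storage, 6, square);   /* no per-dimension indexing */
  printf("%g %g\n", storage[0], storage[5]); /* prints: 1 36 */
  return 0;
}
```
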
diff --git a/Storage.c b/Storage.c
index 28c4e87..874838e 100644
--- a/Storage.c
+++ b/Storage.c
@@ -7,3 +7,6 @@
#include "generic/Storage.c"
#include "THGenerateAllTypes.h"
+
+#include "generic/Storage.c"
+#include "THGenerateHalfType.h"
diff --git a/Tensor.c b/Tensor.c
index 4bfbc6a..bf78d1a 100644
--- a/Tensor.c
+++ b/Tensor.c
@@ -7,3 +7,6 @@
#include "generic/Tensor.c"
#include "THGenerateAllTypes.h"
+
+#include "generic/Tensor.c"
+#include "THGenerateHalfType.h"
diff --git a/Tensor.lua b/Tensor.lua
index b4b3e95..9a8215b 100644
--- a/Tensor.lua
+++ b/Tensor.lua
@@ -5,14 +5,14 @@ local Storage = {}
local Tensor = {}
-- types
-local types = {'Byte', 'Char', 'Short', 'Int', 'Long', 'Float', 'Double'}
+local types = {'Byte', 'Char', 'Short', 'Int', 'Long', 'Float', 'Half', 'Double'}
-- Lua 5.2 compatibility
local log10 = math.log10 or function(x) return math.log(x, 10) end
-- tostring() functions for Tensor and Storage
local function Storage__printformat(self)
- if self:size() == 0 then
+ if self:size() == 0 then
return "", nil, 0
end
local intMode = true
@@ -277,6 +277,10 @@ function Tensor.double(self)
return self:type('torch.DoubleTensor')
end
+function Tensor.half(self)
+ return self:type('torch.HalfTensor')
+end
+
function Tensor.real(self)
return self:type(torch.getdefaulttensortype())
end
@@ -556,6 +560,14 @@ torch.permute = Tensor.permute
for _,type in ipairs(types) do
local metatable = torch.getmetatable('torch.' .. type .. 'Tensor')
for funcname, func in pairs(Tensor) do
- rawset(metatable, funcname, func)
+      if funcname ~= 'totable' or type ~= 'Half' then
+ rawset(metatable, funcname, func)
+ else
+ local function Tensor__totable(self)
+ local host_tensor = self:float()
+         return host_tensor:totable()
+ end
+ rawset(torch.getmetatable('torch.HalfTensor'), 'totable', Tensor__totable)
+ end
end
end
diff --git a/TensorMath.lua b/TensorMath.lua
index 5971a7b..d816740 100644
--- a/TensorMath.lua
+++ b/TensorMath.lua
@@ -6,56 +6,7 @@ local interface = wrap.CInterface.new()
local method = wrap.CInterface.new()
local argtypes = wrap.CInterface.argtypes
-argtypes['ptrdiff_t'] = {
-
- helpname = function(arg)
- return 'ptrdiff_t'
- end,
-
- declare = function(arg)
- -- if it is a number we initialize here
- local default = tonumber(tostring(arg.default)) or 0
- return string.format("%s arg%d = %g;", 'ptrdiff_t', arg.i, default)
- end,
-
- check = function(arg, idx)
- return string.format("lua_isnumber(L, %d)", idx)
- end,
-
- read = function(arg, idx)
- return string.format("arg%d = (%s)lua_tonumber(L, %d);", arg.i, 'ptrdiff_t', idx)
- end,
-
- init = function(arg)
- -- otherwise do it here
- if arg.default then
- local default = tostring(arg.default)
- if not tonumber(default) then
- return string.format("arg%d = %s;", arg.i, default)
- end
- end
- end,
-
- carg = function(arg)
- return string.format('arg%d', arg.i)
- end,
-
- creturn = function(arg)
- return string.format('arg%d', arg.i)
- end,
-
- precall = function(arg)
- if arg.returned then
- return string.format('lua_pushnumber(L, (lua_Number)arg%d);', arg.i)
- end
- end,
-
- postcall = function(arg)
- if arg.creturned then
- return string.format('lua_pushnumber(L, (lua_Number)arg%d);', arg.i)
- end
- end
-}
+argtypes['ptrdiff_t'] = wrap.types.ptrdiff_t
interface:print([[
#include "TH.h"
@@ -216,6 +167,7 @@ local reals = {ByteTensor='unsigned char',
IntTensor='int',
LongTensor='long',
FloatTensor='float',
+ HalfTensor='half',
DoubleTensor='double'}
local accreals = {ByteTensor='long',
@@ -224,11 +176,12 @@ local accreals = {ByteTensor='long',
IntTensor='long',
LongTensor='long',
FloatTensor='double',
+ HalfTensor='float',
DoubleTensor='double'}
for _,Tensor in ipairs({"ByteTensor", "CharTensor",
"ShortTensor", "IntTensor", "LongTensor",
- "FloatTensor", "DoubleTensor"}) do
+ "FloatTensor", "HalfTensor", "DoubleTensor"}) do
local real = reals[Tensor]
local accreal = accreals[Tensor]
@@ -257,6 +210,7 @@ for _,Tensor in ipairs({"ByteTensor", "CharTensor",
end
end
+ if Tensor ~= 'HalfTensor' then
wrap("zero",
cname("zero"),
{{name=Tensor, returned=true}})
@@ -1030,6 +984,7 @@ static void THTensor_random1__(THTensor *self, THGenerator *gen, long b)
cname("nonzero"),
{{name="IndexTensor", default=true, returned=true},
{name=Tensor}})
+ end -- ~= HalfTensor
if Tensor == 'ByteTensor' then
-- Logical accumulators only apply to ByteTensor
@@ -1089,6 +1044,14 @@ static void THTensor_random1__(THTensor *self, THGenerator *gen, long b)
{name="double",default=0},
{name="double",default=0}})
+ wrap("bhistc",
+ cname("bhistc"),
+ {{name=Tensor, default=true, returned=true},
+ {name=Tensor},
+ {name="long",default=100},
+ {name="double",default=0},
+ {name="double",default=0}})
+
wrap("norm",
cname("normall"),
{{name=Tensor},
diff --git a/Tester.lua b/Tester.lua
index f512edb..6509413 100644
--- a/Tester.lua
+++ b/Tester.lua
@@ -687,6 +687,7 @@ local typesMatching = {
['torch.LongStorage'] = torch.LongTensor,
['torch.FloatStorage'] = torch.FloatTensor,
['torch.DoubleStorage'] = torch.DoubleTensor,
+ ['torch.HalfStorage'] = torch.HalfTensor,
}
--[[ Tests for storage equality.
diff --git a/doc/maths.md b/doc/maths.md
index 44e5ea6..b4f1592 100755
--- a/doc/maths.md
+++ b/doc/maths.md
@@ -158,6 +158,43 @@ By default the elements are sorted into 100 equally spaced bins between the mini
`y = torch.histc(x, n, min, max)` same as above with `n` bins and `[min, max]` as elements range.
+<a name="torch.bhistc"></a>
+### [res] torch.bhistc([res,] x [,nbins, min_value, max_value]) ###
+<a name="torch.bhistc"></a>
+
+`y = torch.bhistc(x)` returns the histogram of the elements in the 2D tensor `x`, computed row by row along the last dimension.
+By default the elements are sorted into 100 equally spaced bins between the minimum and maximum values of `x`.
+
+`y = torch.bhistc(x, n)` same as above with `n` bins.
+
+`y = torch.bhistc(x, n, min, max)` same as above with `n` bins and `[min, max]` as the element range.
+
+```lua
+x = torch.Tensor(3, 6)
+
+> x[1] = torch.Tensor{ 2, 4, 2, 2, 5, 4 }
+> x[2] = torch.Tensor{ 3, 5, 1, 5, 3, 5 }
+> x[3] = torch.Tensor{ 3, 4, 2, 5, 5, 1 }
+
+> x
+ 2 4 2 2 5 4
+ 3 5 1 5 3 5
+ 3 4 2 5 5 1
+[torch.DoubleTensor of size 3x6]
+
+> torch.bhistc(x, 5, 1, 5)
+ 0 3 0 2 1
+ 1 0 2 0 3
+ 1 1 1 1 2
+[torch.DoubleTensor of size 3x5]
+
+> y = torch.Tensor(1, 6):copy(x[1])
+
+> torch.bhistc(y, 5)
+ 3 0 2 0 1
+[torch.DoubleTensor of size 1x5]
+```
+
<a name="torch.linspace"></a>
### [res] torch.linspace([res,] x1, x2, [,n]) ###
<a name="torch.linspace"></a>
diff --git a/generic/Storage.c b/generic/Storage.c
index 134dc63..a6652a5 100644
--- a/generic/Storage.c
+++ b/generic/Storage.c
@@ -41,7 +41,7 @@ static int torch_Storage_(new)(lua_State *L)
THStorage_(free)(storage);
luaL_error(L, "element at index %d is not a number", i);
}
- THStorage_(set)(storage, i-1, (real)lua_tonumber(L, -1));
+ THStorage_(set)(storage, i-1, LUA_NUMBER_TO_REAL(lua_tonumber(L, -1)));
lua_pop(L, 1);
}
}
@@ -131,6 +131,8 @@ static int torch_Storage_(copy)(lua_State *L)
THStorage_(copyFloat)(storage, src);
else if( (src = luaT_toudata(L, 2, "torch.DoubleStorage")) )
THStorage_(copyDouble)(storage, src);
+ else if( (src = luaT_toudata(L, 2, "torch.HalfStorage")) )
+ THStorage_(copyHalf)(storage, src);
else
luaL_typerror(L, 2, "torch.*Storage");
lua_settop(L, 1);
diff --git a/generic/Tensor.c b/generic/Tensor.c
index abb7819..c2417fe 100644
--- a/generic/Tensor.c
+++ b/generic/Tensor.c
@@ -142,7 +142,7 @@ static int torch_Tensor_(new)(lua_State *L)
THTensor_(free)(tensor);
THError("invalid element (not a number)");
}
- THStorage_(set)(THTensor_(storage)(tensor), si++, (real)lua_tonumber(L, -1));
+ THStorage_(set)(THTensor_(storage)(tensor), si++, LUA_NUMBER_TO_REAL(lua_tonumber(L, -1)));
lua_pop(L, 1);
}
@@ -384,6 +384,7 @@ static int torch_Tensor_(select)(lua_State *L)
return 1;
}
+#ifndef TH_REAL_IS_HALF
static int torch_Tensor_(indexSelect)(lua_State *L)
{
int narg = lua_gettop(L);
@@ -567,6 +568,7 @@ static int torch_Tensor_(maskedFill)(lua_State *L)
return 1;
}
+#endif
static int torch_Tensor_(transpose)(lua_State *L)
{
@@ -675,6 +677,8 @@ static int torch_Tensor_(copy)(lua_State *L)
THTensor_(copyFloat)(tensor, src);
else if( (src = luaT_toudata(L, 2, "torch.DoubleTensor")) )
THTensor_(copyDouble)(tensor, src);
+ else if( (src = luaT_toudata(L, 2, "torch.HalfTensor")) )
+ THTensor_(copyHalf)(tensor, src);
else
luaL_typerror(L, 2, "torch.*Tensor");
lua_settop(L, 1);
@@ -700,10 +704,14 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
THArgCheck(index >= 0 && index < tensor->size[0], 2, "out of range");
THStorage_(set)(tensor->storage, tensor->storageOffset+index*tensor->stride[0], value);
} else {
+#ifndef TH_REAL_IS_HALF
tensor = THTensor_(newWithTensor)(tensor);
THTensor_(narrow)(tensor, NULL, 0, index, 1);
THTensor_(fill)(tensor, value);
THTensor_(free)(tensor);
+#else
+ THError("fill on torch.HalfTensor not yet supported");
+#endif
}
} else if( (src = luaT_toudata(L, 3, torch_Tensor)) ) {
tensor = THTensor_(newWithTensor)(tensor);
@@ -745,6 +753,11 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
THTensor_(narrow)(tensor, NULL, 0, index, 1);
THTensor_(copyDouble)(tensor, src);
THTensor_(free)(tensor);
+ } else if( (src = luaT_toudata(L, 3, "torch.HalfTensor")) ) {
+ tensor = THTensor_(newWithTensor)(tensor);
+ THTensor_(narrow)(tensor, NULL, 0, index, 1);
+ THTensor_(copyHalf)(tensor, src);
+ THTensor_(free)(tensor);
} else {
luaL_typerror(L, 3, "torch.*Tensor");
}
@@ -829,7 +842,11 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
/* doing a copy */
void *src;
if (lua_isnumber(L,3)) {
- THTensor_(fill)(tensor, lua_tonumber(L,3));
+#ifndef TH_REAL_IS_HALF
+ THTensor_(fill)(tensor, LUA_NUMBER_TO_REAL(lua_tonumber(L,3)));
+#else
+ THError("fill on torch.HalfTensor not yet supported");
+#endif
} else if( (src = luaT_toudata(L, 3, torch_Tensor)) ) {
THTensor_(copy)(tensor, src);
} else if( (src = luaT_toudata(L, 3, "torch.ByteTensor")) ) {
@@ -846,6 +863,8 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
THTensor_(copyFloat)(tensor, src);
} else if( (src = luaT_toudata(L, 3, "torch.DoubleTensor")) ) {
THTensor_(copyDouble)(tensor, src);
+ } else if( (src = luaT_toudata(L, 3, "torch.HalfTensor")) ) {
+ THTensor_(copyHalf)(tensor, src);
} else {
luaL_typerror(L, 3, "torch.*Tensor");
}
@@ -855,6 +874,7 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
}
else if((mask = luaT_toudata(L, 2, "torch.ByteTensor")))
{
+#ifndef TH_REAL_IS_HALF
THTensor *vals;
if (lua_isnumber(L, 3))
{
@@ -868,6 +888,9 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
{
THError("number or " torch_Tensor " expected");
}
+#else
+ THError("ByteTensor indexing not yet supported with half types");
+#endif
}
else
lua_pushboolean(L, 0);
@@ -916,7 +939,7 @@ static int torch_Tensor_(__index__)(lua_State *L)
THArgCheck((z >= 0) && (z < tensor->size[dim]), 2, "index out of bound");
index += z*tensor->stride[dim];
}
- luaG_(pushreal)(L, (double)THStorage_(get)(THTensor_(storage)(tensor), index));
+ luaG_(pushreal)(L, THStorage_(get)(THTensor_(storage)(tensor), index));
lua_pushboolean(L, 1);
return 2;
}
@@ -987,11 +1010,16 @@ static int torch_Tensor_(__index__)(lua_State *L)
}
else if((mask = luaT_toudata(L, 2, "torch.ByteTensor")))
{
+#ifndef TH_REAL_IS_HALF
THTensor *vals = THTensor_(new)();
THTensor_(maskedSelect)(vals, tensor, mask);
luaT_pushudata(L, vals, torch_Tensor);
lua_pushboolean(L, 1);
return 2;
+#else
+    THError("ByteTensor based indexing not yet supported with half type");
+ return 0;
+#endif
}
else
{
@@ -1131,6 +1159,7 @@ static void torch_Tensor_(c_readTensorStorageSizeStride)(lua_State *L, int index
THArgCheck(0, index, "expecting number");
}
+#ifndef TH_REAL_IS_HALF
static int torch_Tensor_(apply)(lua_State *L)
{
THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
@@ -1143,7 +1172,7 @@ static int torch_Tensor_(apply)(lua_State *L)
lua_call(L, 1, 1);
if(lua_isnumber(L, 3))
{
- *tensor_data = (real)lua_tonumber(L, 3);
+ *tensor_data = LUA_NUMBER_TO_REAL(lua_tonumber(L, 3));
lua_pop(L, 1);
}
else if(lua_isnil(L, 3))
@@ -1169,7 +1198,7 @@ static int torch_Tensor_(map)(lua_State *L)
lua_call(L, 2, 1);
if(lua_isnumber(L, 4))
{
- *tensor_data = (real)lua_tonumber(L, 4);
+ *tensor_data = LUA_NUMBER_TO_REAL(lua_tonumber(L, 4));
lua_pop(L, 1);
}
else if(lua_isnil(L, 4))
@@ -1197,7 +1226,7 @@ static int torch_Tensor_(map2)(lua_State *L)
lua_call(L, 3, 1);
if(lua_isnumber(L, 5))
{
- *tensor_data = (real)lua_tonumber(L, 5);
+ *tensor_data = LUA_NUMBER_TO_REAL(lua_tonumber(L, 5));
lua_pop(L, 1);
}
else if(lua_isnil(L, 5))
@@ -1208,6 +1237,7 @@ static int torch_Tensor_(map2)(lua_State *L)
lua_settop(L, 1);
return 1;
}
+#endif
static int torch_Tensor_(factory)(lua_State *L)
{
@@ -1286,6 +1316,7 @@ static const struct luaL_Reg torch_Tensor_(_) [] = {
{"narrow", torch_Tensor_(narrow)},
{"sub", torch_Tensor_(sub)},
{"select", torch_Tensor_(select)},
+#ifndef TH_REAL_IS_HALF
{"index", torch_Tensor_(indexSelect)},
{"indexCopy", torch_Tensor_(indexCopy)},
{"indexAdd", torch_Tensor_(indexAdd)},
@@ -1293,6 +1324,7 @@ static const struct luaL_Reg torch_Tensor_(_) [] = {
{"maskedSelect", torch_Tensor_(maskedSelect)},
{"maskedCopy", torch_Tensor_(maskedCopy)},
{"maskedFill", torch_Tensor_(maskedFill)},
+#endif
{"transpose", torch_Tensor_(transpose)},
{"t", torch_Tensor_(t)},
{"unfold", torch_Tensor_(unfold)},
@@ -1302,9 +1334,11 @@ static const struct luaL_Reg torch_Tensor_(_) [] = {
{"isSize", torch_Tensor_(isSize)},
{"nElement", torch_Tensor_(nElement)},
{"copy", torch_Tensor_(copy)},
+#ifndef TH_REAL_IS_HALF
{"apply", torch_Tensor_(apply)},
{"map", torch_Tensor_(map)},
{"map2", torch_Tensor_(map2)},
+#endif
{"read", torch_Tensor_(read)},
{"write", torch_Tensor_(write)},
{"__index__", torch_Tensor_(__index__)},
@@ -1318,7 +1352,10 @@ void torch_Tensor_(init)(lua_State *L)
torch_Tensor_(new), torch_Tensor_(free), torch_Tensor_(factory));
luaT_setfuncs(L, torch_Tensor_(_), 0);
lua_pop(L, 1);
+#ifndef TH_REAL_IS_HALF
THVector_(vectorDispatchInit)();
+#endif
+
}
#endif
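
Throughout generic/Tensor.c the pattern is the same: anything requiring arithmetic on `real` (index and masked operations, `apply`/`map`/`map2`, fill, vector dispatch) is compiled out of the Half instantiation with `#ifndef TH_REAL_IS_HALF`, and paths reachable at runtime raise `THError` instead. A self-contained sketch of the guard, assuming the macro is set by the generator header as in this patch:

```c
/* Sketch: TH_REAL_IS_HALF is defined by THGenerateHalfType.h while the
 * generic file is being instantiated for the half type. */
#include <stdio.h>

#define TH_REAL_IS_HALF /* pretend we are in the Half pass */

static void tensor_fill(void) {
#ifndef TH_REAL_IS_HALF
  puts("fill: supported for this real type");
#else
  puts("fill on torch.HalfTensor not yet supported"); /* THError in TH */
#endif
}

int main(void) {
  tensor_fill();
  return 0;
}
```
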
diff --git a/generic/TensorOperator.c b/generic/TensorOperator.c
index eba6c81..e131c57 100644
--- a/generic/TensorOperator.c
+++ b/generic/TensorOperator.c
@@ -14,7 +14,7 @@ static int torch_TensorOperator_(__add__)(lua_State *L)
{
r = THTensor_(new)();
luaT_pushudata(L, r, torch_Tensor);
-
+
if(!tensor1 && tensor2)
{
THTensor_(resizeAs)(r, tensor2);
@@ -49,7 +49,7 @@ static int torch_TensorOperator_(__sub__)(lua_State *L)
{
r = THTensor_(new)();
luaT_pushudata(L, r, torch_Tensor);
-
+
if(!tensor1 && tensor2)
{
THTensor_(resizeAs)(r, tensor2);
@@ -98,7 +98,7 @@ static int torch_TensorOperator_(__mul__)(lua_State *L)
{
r = THTensor_(new)();
luaT_pushudata(L, r, torch_Tensor);
-
+
if(!tensor1 && tensor2)
{
THTensor_(resizeAs)(r, tensor2);
@@ -115,7 +115,7 @@ static int torch_TensorOperator_(__mul__)(lua_State *L)
{
int dimt = tensor1->nDimension;
int dims = tensor2->nDimension;
-
+
if(dimt == 1 && dims == 1)
lua_pushnumber(L, THTensor_(dot)(tensor1, tensor2)); /* ok, we wasted r, but who cares */
else if(dimt == 2 && dims == 1)
@@ -131,7 +131,7 @@ static int torch_TensorOperator_(__mul__)(lua_State *L)
THTensor_(addmm)(r, 1, r, 1, tensor1, tensor2);
}
else
- luaL_error(L, "multiplication between %dD and %dD tensors not yet supported", tensor1->nDimension, tensor2->nDimension);
+ luaL_error(L, "multiplication between %dD and %dD tensors not yet supported", tensor1->nDimension, tensor2->nDimension);
}
}
return 1;
diff --git a/generic/luaG.h b/generic/luaG.h
index 950eae9..f1ffce2 100644
--- a/generic/luaG.h
+++ b/generic/luaG.h
@@ -6,10 +6,24 @@
#define luaG_(NAME) TH_CONCAT_3(luaG_,Real,NAME)
#endif
-static void luaG_(pushreal)(lua_State *L, accreal n) {
-#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || LUA_VERSION_NUM < 503
- lua_pushnumber(L, (lua_Number)n);
-#elif defined(TH_REAL_IS_BYTE) || defined(TH_REAL_IS_CHAR) || defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG)
+#undef REAL_TO_LUA_NUMBER
+#undef LUA_NUMBER_TO_REAL
+
+#if defined(TH_REAL_IS_HALF)
+# define REAL_TO_LUA_NUMBER(n) (lua_Number)TH_half2float(n)
+# define LUA_NUMBER_TO_REAL(n) TH_float2half((lua_Number)n)
+#else
+# define REAL_TO_LUA_NUMBER(n) (lua_Number)(n)
+# define LUA_NUMBER_TO_REAL(n) (real)n
+#endif
+
+
+
+static void luaG_(pushreal)(lua_State *L, real n) {
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) || LUA_VERSION_NUM < 503
+ lua_pushnumber(L, REAL_TO_LUA_NUMBER(n));
+#elif defined(TH_REAL_IS_BYTE) || defined(TH_REAL_IS_CHAR) || defined(TH_REAL_IS_SHORT) \
+ || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG)
lua_pushinteger(L, (lua_Integer)n);
#else
#error "unhandled real type in luaG_pushreal"
@@ -17,8 +31,8 @@ static void luaG_(pushreal)(lua_State *L, accreal n) {
}
static real luaG_(checkreal)(lua_State *L, int idx) {
-#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
- return (lua_Number)luaL_checknumber(L, idx);
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
+ return LUA_NUMBER_TO_REAL(luaL_checknumber(L, idx));
#elif defined(TH_REAL_IS_BYTE) || defined(TH_REAL_IS_CHAR) || defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG)
int type = lua_type(L, idx);
if (type == LUA_TSTRING) {
@@ -38,8 +52,8 @@ static real luaG_(checkreal)(lua_State *L, int idx) {
}
static real luaG_(optreal)(lua_State *L, int idx, real n) {
-#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || LUA_VERSION_NUM < 503
- return (lua_Number)luaL_optnumber(L, idx, (lua_Number)n);
+#if defined(TH_REAL_IS_HALF) || defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || LUA_VERSION_NUM < 503
+ return LUA_NUMBER_TO_REAL(luaL_optnumber(L, idx, REAL_TO_LUA_NUMBER(n)));
#elif defined(TH_REAL_IS_BYTE) || defined(TH_REAL_IS_CHAR) || defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG)
return (lua_Integer)luaL_optinteger(L, idx, (lua_Integer)n);
#else
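
The new `REAL_TO_LUA_NUMBER`/`LUA_NUMBER_TO_REAL` pair centralizes scalar conversion at the Lua boundary: for every type except half they are plain casts, while the half instantiation routes through `TH_half2float`/`TH_float2half`. A stand-alone sketch of the half case (assumes the TH headers from this patch are on the include path and the library is linked):

```c
#include <stdio.h>
#include "THHalf.h"

typedef double lua_Number; /* stand-in for Lua's numeric type */

#define REAL_TO_LUA_NUMBER(n) (lua_Number)TH_half2float(n)
#define LUA_NUMBER_TO_REAL(n) TH_float2half((lua_Number)(n))

int main(void) {
  lua_Number from_script = 3.14159;                 /* value pushed from Lua */
  THHalf stored = LUA_NUMBER_TO_REAL(from_script);  /* script -> storage */
  lua_Number back = REAL_TO_LUA_NUMBER(stored);     /* storage -> script */
  /* prints: 3.14159 -> 0x4248 -> 3.14062 (half keeps ~3 decimal digits) */
  printf("%g -> 0x%04x -> %g\n", from_script, (unsigned)stored.x, back);
  return 0;
}
```
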
diff --git a/init.c b/init.c
index 08eedba..3bdac17 100644
--- a/init.c
+++ b/init.c
@@ -16,6 +16,7 @@ extern void torch_IntStorage_init(lua_State *L);
extern void torch_LongStorage_init(lua_State *L);
extern void torch_FloatStorage_init(lua_State *L);
extern void torch_DoubleStorage_init(lua_State *L);
+extern void torch_HalfStorage_init(lua_State *L);
extern void torch_ByteTensor_init(lua_State *L);
extern void torch_CharTensor_init(lua_State *L);
@@ -24,6 +25,7 @@ extern void torch_IntTensor_init(lua_State *L);
extern void torch_LongTensor_init(lua_State *L);
extern void torch_FloatTensor_init(lua_State *L);
extern void torch_DoubleTensor_init(lua_State *L);
+extern void torch_HalfTensor_init(lua_State *L);
extern void torch_ByteTensorOperator_init(lua_State *L);
extern void torch_CharTensorOperator_init(lua_State *L);
@@ -33,8 +35,10 @@ extern void torch_LongTensorOperator_init(lua_State *L);
extern void torch_FloatTensorOperator_init(lua_State *L);
extern void torch_DoubleTensorOperator_init(lua_State *L);
+
extern void torch_TensorMath_init(lua_State *L);
+
LUA_EXTERNC DLL_EXPORT int luaopen_libtorch(lua_State *L);
int luaopen_libtorch(lua_State *L)
@@ -45,7 +49,6 @@ int luaopen_libtorch(lua_State *L)
lua_setglobal(L, "torch");
torch_utils_init(L);
-
torch_File_init(L);
torch_ByteStorage_init(L);
@@ -55,6 +58,7 @@ int luaopen_libtorch(lua_State *L)
torch_LongStorage_init(L);
torch_FloatStorage_init(L);
torch_DoubleStorage_init(L);
+ torch_HalfStorage_init(L);
torch_ByteTensor_init(L);
torch_CharTensor_init(L);
@@ -63,6 +67,7 @@ int luaopen_libtorch(lua_State *L)
torch_LongTensor_init(L);
torch_FloatTensor_init(L);
torch_DoubleTensor_init(L);
+ torch_HalfTensor_init(L);
torch_ByteTensorOperator_init(L);
torch_CharTensorOperator_init(L);
diff --git a/lib/TH/CMakeLists.txt b/lib/TH/CMakeLists.txt
index e6cf91d..0b5f166 100644
--- a/lib/TH/CMakeLists.txt
+++ b/lib/TH/CMakeLists.txt
@@ -78,6 +78,21 @@ IF (CORTEXA9_FOUND)
SET(CMAKE_C_FLAGS "-mcpu=cortex-a9 ${CMAKE_C_FLAGS}")
ENDIF (CORTEXA9_FOUND)
+INCLUDE(CheckIncludeFile)
+CHECK_INCLUDE_FILE(cpuid.h HAVE_CPUID_H)
+# Check for a cpuid intrinsic
+IF(HAVE_CPUID_H)
+ CHECK_C_SOURCE_COMPILES("#include <cpuid.h>
+ int main()
+ {
+ unsigned int eax, ebx, ecx, edx;
+ return __get_cpuid(0, &eax, &ebx, &ecx, &edx);
+ }" HAVE_GCC_GET_CPUID)
+ENDIF()
+IF(HAVE_GCC_GET_CPUID)
+ SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DHAVE_GCC_GET_CPUID")
+ENDIF(HAVE_GCC_GET_CPUID)
+
FIND_PACKAGE(SSE)
IF(C_SSE2_FOUND)
SET(CMAKE_C_FLAGS "${C_SSE2_FLAGS} -DUSE_SSE2 ${CMAKE_C_FLAGS}")
@@ -122,11 +137,11 @@ IF(C_AVX_FOUND)
ENDIF(C_AVX_FOUND)
SET(hdr
- THGeneral.h THAllocator.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h
- THLapack.h THLogAdd.h THRandom.h THVector.h THAtomic.h)
+ THGeneral.h THHalf.h THAllocator.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h
+ THLapack.h THLogAdd.h THRandom.h THVector.h THAtomic.h )
SET(src
- THGeneral.c THAllocator.c THStorage.c THTensor.c THBlas.c THLapack.c
+ THGeneral.c THHalf.c THAllocator.c THStorage.c THTensor.c THBlas.c THLapack.c
THLogAdd.c THRandom.c THFile.c THDiskFile.c THMemoryFile.c THAtomic.c THVector.c)
SET(src ${src} ${hdr} ${simd})
@@ -135,9 +150,13 @@ if(BUILD_STATIC)
ADD_LIBRARY(TH_static STATIC ${src})
endif()
+IF(NOT TH_SO_VERSION)
+ SET(TH_SO_VERSION 0)
+ENDIF(NOT TH_SO_VERSION)
+MESSAGE(STATUS "TH_SO_VERSION: ${TH_SO_VERSION}")
SET_TARGET_PROPERTIES(TH PROPERTIES
- VERSION 0
- SOVERSION 0)
+ VERSION ${TH_SO_VERSION}
+ SOVERSION ${TH_SO_VERSION})
CHECK_C_SOURCE_RUNS("
#include <stdatomic.h>
@@ -320,6 +339,7 @@ INSTALL(FILES
THFilePrivate.h
${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h
THGenerateAllTypes.h
+ THGenerateHalfType.h
THGenerateFloatTypes.h
THGenerateIntTypes.h
THLapack.h
@@ -333,6 +353,7 @@ INSTALL(FILES
THTensorMacros.h
THVector.h
THAtomic.h
+ THHalf.h
DESTINATION "${TH_INSTALL_INCLUDE_SUBDIR}/TH")
INSTALL(FILES
diff --git a/lib/TH/THDiskFile.c b/lib/TH/THDiskFile.c
index 9d9cbae..01b1951 100644
--- a/lib/TH/THDiskFile.c
+++ b/lib/TH/THDiskFile.c
@@ -348,14 +348,14 @@ READ_WRITE_METHODS(int, Int,
int ret = fscanf(dfself->handle, "%d", &data[i]); if(ret <= 0) break; else nread++,
int ret = fprintf(dfself->handle, "%d", data[i]); if(ret <= 0) break; else nwrite++)
-/*READ_WRITE_METHODS(long, Long,
- int ret = fscanf(dfself->handle, "%ld", &data[i]); if(ret <= 0) break; else nread++,
- int ret = fprintf(dfself->handle, "%ld", data[i]); if(ret <= 0) break; else nwrite++)*/
-
READ_WRITE_METHODS(float, Float,
int ret = fscanf(dfself->handle, "%g", &data[i]); if(ret <= 0) break; else nread++,
int ret = fprintf(dfself->handle, "%.9g", data[i]); if(ret <= 0) break; else nwrite++)
+READ_WRITE_METHODS(THHalf, Half,
+ float buf; int ret = fscanf(dfself->handle, "%g", &buf); if(ret <= 0) break; else { data[i]= TH_float2half(buf); nread++; },
+ int ret = fprintf(dfself->handle, "%.9g", TH_half2float(data[i])); if(ret <= 0) break; else nwrite++)
+
READ_WRITE_METHODS(double, Double,
int ret = fscanf(dfself->handle, "%lg", &data[i]); if(ret <= 0) break; else nread++,
int ret = fprintf(dfself->handle, "%.17g", data[i]); if(ret <= 0) break; else nwrite++)
@@ -618,6 +618,7 @@ THFile *THDiskFile_new(const char *name, const char *mode, int isQuiet)
THDiskFile_readLong,
THDiskFile_readFloat,
THDiskFile_readDouble,
+ THDiskFile_readHalf,
THDiskFile_readString,
THDiskFile_writeByte,
@@ -627,6 +628,7 @@ THFile *THDiskFile_new(const char *name, const char *mode, int isQuiet)
THDiskFile_writeLong,
THDiskFile_writeFloat,
THDiskFile_writeDouble,
+ THDiskFile_writeHalf,
THDiskFile_writeString,
THDiskFile_synchronize,
@@ -730,6 +732,7 @@ THFile *THPipeFile_new(const char *name, const char *mode, int isQuiet)
THDiskFile_readLong,
THDiskFile_readFloat,
THDiskFile_readDouble,
+ THDiskFile_readHalf,
THDiskFile_readString,
THDiskFile_writeByte,
@@ -739,6 +742,7 @@ THFile *THPipeFile_new(const char *name, const char *mode, int isQuiet)
THDiskFile_writeLong,
THDiskFile_writeFloat,
THDiskFile_writeDouble,
+ THDiskFile_writeHalf,
THDiskFile_writeString,
THDiskFile_synchronize,
diff --git a/lib/TH/THFile.c b/lib/TH/THFile.c
index c8913af..3717b7b 100644
--- a/lib/TH/THFile.c
+++ b/lib/TH/THFile.c
@@ -19,6 +19,7 @@ IMPLEMENT_THFILE_RW(Int, int)
IMPLEMENT_THFILE_RW(Long, long)
IMPLEMENT_THFILE_RW(Float, float)
IMPLEMENT_THFILE_RW(Double, double)
+IMPLEMENT_THFILE_RW(Half, THHalf)
size_t THFile_readStringRaw(THFile *self, const char *format, char **str_)
{
@@ -133,6 +134,7 @@ IMPLEMENT_THFILE_SCALAR(Int, int)
IMPLEMENT_THFILE_SCALAR(Long, long)
IMPLEMENT_THFILE_SCALAR(Float, float)
IMPLEMENT_THFILE_SCALAR(Double, double)
+IMPLEMENT_THFILE_SCALAR(Half, THHalf)
#define IMPLEMENT_THFILE_STORAGE(TYPEC, TYPE) \
size_t THFile_read##TYPEC(THFile *self, TH##TYPEC##Storage *storage) \
@@ -152,3 +154,4 @@ IMPLEMENT_THFILE_STORAGE(Int, int)
IMPLEMENT_THFILE_STORAGE(Long, long)
IMPLEMENT_THFILE_STORAGE(Float, float)
IMPLEMENT_THFILE_STORAGE(Double, double)
+IMPLEMENT_THFILE_STORAGE(Half, THHalf)
diff --git a/lib/TH/THFile.h b/lib/TH/THFile.h
index 64dd2da..e097bdf 100644
--- a/lib/TH/THFile.h
+++ b/lib/TH/THFile.h
@@ -74,6 +74,13 @@ TH_API size_t THFile_writeFloatRaw(THFile *self, float *data, size_t n);
TH_API size_t THFile_writeDoubleRaw(THFile *self, double *data, size_t n);
TH_API size_t THFile_writeStringRaw(THFile *self, const char *str, size_t size);
+TH_API THHalf THFile_readHalfScalar(THFile *self);
+TH_API void THFile_writeHalfScalar(THFile *self, THHalf scalar);
+TH_API size_t THFile_readHalf(THFile *self, THHalfStorage *storage);
+TH_API size_t THFile_writeHalf(THFile *self, THHalfStorage *storage);
+TH_API size_t THFile_readHalfRaw(THFile *self, THHalf* data, size_t size);
+TH_API size_t THFile_writeHalfRaw(THFile *self, THHalf* data, size_t size);
+
TH_API void THFile_synchronize(THFile *self);
TH_API void THFile_seek(THFile *self, size_t position);
TH_API void THFile_seekEnd(THFile *self);
diff --git a/lib/TH/THFilePrivate.h b/lib/TH/THFilePrivate.h
index d268041..55169c3 100644
--- a/lib/TH/THFilePrivate.h
+++ b/lib/TH/THFilePrivate.h
@@ -1,3 +1,8 @@
+#include "THGeneral.h"
+
+#include "THHalf.h"
+
+
struct THFile__
{
struct THFileVTable *vtable;
@@ -23,6 +28,7 @@ struct THFileVTable
size_t (*readLong)(THFile *self, long *data, size_t n);
size_t (*readFloat)(THFile *self, float *data, size_t n);
size_t (*readDouble)(THFile *self, double *data, size_t n);
+ size_t (*readHalf)(THFile *self, THHalf *data, size_t n);
size_t (*readString)(THFile *self, const char *format, char **str_);
size_t (*writeByte)(THFile *self, unsigned char *data, size_t n);
@@ -32,6 +38,7 @@ struct THFileVTable
size_t (*writeLong)(THFile *self, long *data, size_t n);
size_t (*writeFloat)(THFile *self, float *data, size_t n);
size_t (*writeDouble)(THFile *self, double *data, size_t n);
+ size_t (*writeHalf)(THFile *self, THHalf *data, size_t n);
size_t (*writeString)(THFile *self, const char *str, size_t size);
void (*synchronize)(THFile *self);
diff --git a/lib/TH/THGenerateAllTypes.h b/lib/TH/THGenerateAllTypes.h
index 539629b..4a77081 100644
--- a/lib/TH/THGenerateAllTypes.h
+++ b/lib/TH/THGenerateAllTypes.h
@@ -8,7 +8,6 @@
#define THInf UCHAR_MAX
#define TH_REAL_IS_BYTE
#line 1 TH_GENERIC_FILE
-/*#line 1 "THByteStorage.h"*/
#include TH_GENERIC_FILE
#undef real
#undef accreal
diff --git a/lib/TH/THGenerateHalfType.h b/lib/TH/THGenerateHalfType.h
new file mode 100644
index 0000000..9acc534
--- /dev/null
+++ b/lib/TH/THGenerateHalfType.h
@@ -0,0 +1,19 @@
+#ifndef TH_GENERIC_FILE
+#error "You must define TH_GENERIC_FILE before including THGenerateHalfType.h"
+#endif
+
+#include "THHalf.h"
+#define real THHalf
+#define accreal float
+#define Real Half
+#define THInf TH_HALF_MAX
+#define TH_REAL_IS_HALF
+#line 1 TH_GENERIC_FILE
+#include TH_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_HALF
+
+#undef TH_GENERIC_FILE
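
The new generator header follows the same contract as THGenerateAllTypes.h: define `real`, `accreal`, `Real`, and `THInf`, re-include whatever file `TH_GENERIC_FILE` names, then undefine everything so the next pass starts clean. A self-contained stand-in for the mechanism, with a macro playing the role of the re-included generic file:

```c
/* In TH the "generic" body lives in its own file and is re-included once
 * per type; here a macro plays that role so the sketch compiles alone. */
#include <stdio.h>

#define GENERIC_SUM(Real, real)                    \
  static real THSum_##Real(const real *x, int n) { \
    real acc = 0;                                  \
    for (int i = 0; i < n; i++) acc += x[i];       \
    return acc;                                    \
  }

GENERIC_SUM(Float, float)   /* the Float pass of THGenerateAllTypes.h */
GENERIC_SUM(Double, double) /* the Double pass */

int main(void) {
  float xf[3] = {1.f, 2.f, 3.f};
  double xd[3] = {1.0, 2.0, 3.0};
  printf("%g %g\n", THSum_Float(xf, 3), THSum_Double(xd, 3)); /* 6 6 */
  return 0;
}
```
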
diff --git a/lib/TH/THHalf.c b/lib/TH/THHalf.c
new file mode 100644
index 0000000..b0b9bc5
--- /dev/null
+++ b/lib/TH/THHalf.c
@@ -0,0 +1,138 @@
+#include "THHalf.h"
+
+/*
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee. Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users. These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+// Host functions for converting between FP32 and FP16 formats
+// Paulius Micikevicius (pauliusm@nvidia.com)
+
+float TH_half2float(THHalf h)
+{
+ unsigned sign = ((h.x >> 15) & 1);
+ unsigned exponent = ((h.x >> 10) & 0x1f);
+ unsigned mantissa = ((h.x & 0x3ff) << 13);
+
+ if (exponent == 0x1f) { /* NaN or Inf */
+ mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
+ exponent = 0xff;
+ } else if (!exponent) { /* Denorm or Zero */
+ if (mantissa) {
+ unsigned int msb;
+ exponent = 0x71;
+ do {
+ msb = (mantissa & 0x400000);
+ mantissa <<= 1; /* normalize */
+ --exponent;
+ } while (!msb);
+ mantissa &= 0x7fffff; /* 1.mantissa is implicit */
+ }
+ } else {
+ exponent += 0x70;
+ }
+
+ int temp = ((sign << 31) | (exponent << 23) | mantissa);
+
+ return *((float*)((void*)&temp));
+}
+
+THHalf TH_float2half(float f)
+{
+ THHalf ret;
+
+ unsigned x = *((int*)(void*)(&f));
+ unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
+ unsigned sign, exponent, mantissa;
+
+ // Get rid of +NaN/-NaN case first.
+ if (u > 0x7f800000) {
+ ret.x = 0x7fffU;
+ return ret;
+ }
+
+ sign = ((x >> 16) & 0x8000);
+
+ // Get rid of +Inf/-Inf, +0/-0.
+ if (u > 0x477fefff) {
+ ret.x = sign | 0x7c00U;
+ return ret;
+ }
+ if (u < 0x33000001) {
+ ret.x = (sign | 0x0000);
+ return ret;
+ }
+
+ exponent = ((u >> 23) & 0xff);
+ mantissa = (u & 0x7fffff);
+
+ if (exponent > 0x70) {
+ shift = 13;
+ exponent -= 0x70;
+ } else {
+ shift = 0x7e - exponent;
+ exponent = 0;
+ mantissa |= 0x800000;
+ }
+ lsb = (1 << shift);
+ lsb_s1 = (lsb >> 1);
+ lsb_m1 = (lsb - 1);
+
+ // Round to nearest even.
+ remainder = (mantissa & lsb_m1);
+ mantissa >>= shift;
+ if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
+ ++mantissa;
+ if (!(mantissa & 0x3ff)) {
+ ++exponent;
+ mantissa = 0;
+ }
+ }
+
+ ret.x = (sign | (exponent << 10) | mantissa);
+ return ret;
+}
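
The two converters implement IEEE 754 binary16 semantics in portable C: `TH_half2float` widens exactly (renormalizing subnormals), while `TH_float2half` rounds to nearest even, maps NaN to 0x7fff, and saturates values beyond the half range. A small round-trip harness, assuming TH is built and linked:

```c
#include <stdio.h>
#include "THHalf.h"

int main(void) {
  /* 65504 is the largest finite half; 1e-8 underflows to signed zero */
  float vals[4] = {1.0f, 0.1f, 65504.0f, 1e-8f};
  for (int i = 0; i < 4; i++) {
    THHalf h = TH_float2half(vals[i]);
    printf("%g -> 0x%04x -> %g\n", vals[i], (unsigned)h.x, TH_half2float(h));
  }
  /* prints:
     1 -> 0x3c00 -> 1
     0.1 -> 0x2e66 -> 0.0999756
     65504 -> 0x7bff -> 65504
     1e-08 -> 0x0000 -> 0 */
  return 0;
}
```
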
diff --git a/lib/TH/THHalf.h b/lib/TH/THHalf.h
new file mode 100644
index 0000000..0549d21
--- /dev/null
+++ b/lib/TH/THHalf.h
@@ -0,0 +1,40 @@
+#ifndef TH_HALF_H
+#define TH_HALF_H
+
+#include "THGeneral.h"
+#include <stdint.h>
+
+/* Neither built in nor included from Cutorch; use our definition lifted from CUDA */
+#if defined(__GNUC__)
+#define __thalign__(n) __attribute__((aligned(n)))
+#elif defined(_WIN32)
+#define __thalign__(n) __declspec(align(n))
+#else
+#define __thalign__(n)
+#endif
+
+typedef struct __thalign__(2){
+ unsigned short x;
+} __THHalf;
+
+typedef struct __thalign__(4) {
+ unsigned int x;
+} __THHalf2;
+
+typedef __THHalf THHalf;
+typedef __THHalf2 THHalf2;
+
+/* numeric limits */
+
+
+TH_API THHalf TH_float2half(float a);
+TH_API float TH_half2float(THHalf a);
+
+#ifndef TH_HALF_BITS_TO_LITERAL
+# define TH_HALF_BITS_TO_LITERAL(n) { n }
+#endif
+
+#define TH_HALF_MAX TH_HALF_BITS_TO_LITERAL(0x7BFF)
+
+#undef __thalign__
+#endif
diff --git a/lib/TH/THMemoryFile.c b/lib/TH/THMemoryFile.c
index 8d97621..ecce6e1 100644
--- a/lib/TH/THMemoryFile.c
+++ b/lib/TH/THMemoryFile.c
@@ -332,16 +332,18 @@ READ_WRITE_METHODS(int, Int,
nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%d", data[i]),
1)
-/*READ_WRITE_METHODS(long, Long,
- int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%ld%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++,
- nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%ld", data[i]),
- 1)*/
-
READ_WRITE_METHODS(float, Float,
int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%g%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++,
nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%.9g", data[i]),
1)
+READ_WRITE_METHODS(THHalf, Half,
+ int nByteRead_; float buf; \
+ int ret = sscanf(mfself->storage->data+mfself->position, "%g%n", &buf, &nByteRead_); \
+ data[i] = TH_float2half(buf); nByteRead = nByteRead_; if(ret <= 0) break; else nread++,
+ nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%.9g", TH_half2float(data[i])),
+ 1)
+
READ_WRITE_METHODS(double, Double,
int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%lg%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++,
nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%.17g", data[i]),
@@ -621,6 +623,7 @@ THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode)
THMemoryFile_readLong,
THMemoryFile_readFloat,
THMemoryFile_readDouble,
+ THMemoryFile_readHalf,
THMemoryFile_readString,
THMemoryFile_writeByte,
@@ -630,6 +633,7 @@ THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode)
THMemoryFile_writeLong,
THMemoryFile_writeFloat,
THMemoryFile_writeDouble,
+ THMemoryFile_writeHalf,
THMemoryFile_writeString,
THMemoryFile_synchronize,
diff --git a/lib/TH/THStorage.c b/lib/TH/THStorage.c
index d18488e..bb63a43 100644
--- a/lib/TH/THStorage.c
+++ b/lib/TH/THStorage.c
@@ -4,5 +4,11 @@
#include "generic/THStorage.c"
#include "THGenerateAllTypes.h"
+#include "generic/THStorage.c"
+#include "THGenerateHalfType.h"
+
#include "generic/THStorageCopy.c"
#include "THGenerateAllTypes.h"
+
+#include "generic/THStorageCopy.c"
+#include "THGenerateHalfType.h"
diff --git a/lib/TH/THStorage.h b/lib/TH/THStorage.h
index 36ed507..9565e10 100644
--- a/lib/TH/THStorage.h
+++ b/lib/TH/THStorage.h
@@ -14,7 +14,13 @@
#include "generic/THStorage.h"
#include "THGenerateAllTypes.h"
+#include "generic/THStorage.h"
+#include "THGenerateHalfType.h"
+
#include "generic/THStorageCopy.h"
#include "THGenerateAllTypes.h"
+#include "generic/THStorageCopy.h"
+#include "THGenerateHalfType.h"
+
#endif
diff --git a/lib/TH/THTensor.c b/lib/TH/THTensor.c
index 2878fc9..37071df 100644
--- a/lib/TH/THTensor.c
+++ b/lib/TH/THTensor.c
@@ -11,9 +11,15 @@
#include "generic/THTensor.c"
#include "THGenerateAllTypes.h"
+#include "generic/THTensor.c"
+#include "THGenerateHalfType.h"
+
#include "generic/THTensorCopy.c"
#include "THGenerateAllTypes.h"
+#include "generic/THTensorCopy.c"
+#include "THGenerateHalfType.h"
+
#include "generic/THTensorRandom.c"
#include "THGenerateAllTypes.h"
diff --git a/lib/TH/THTensor.h b/lib/TH/THTensor.h
index 6eddf9c..a155efd 100644
--- a/lib/TH/THTensor.h
+++ b/lib/TH/THTensor.h
@@ -16,9 +16,15 @@ typedef struct {
#include "generic/THTensor.h"
#include "THGenerateAllTypes.h"
+#include "generic/THTensor.h"
+#include "THGenerateHalfType.h"
+
#include "generic/THTensorCopy.h"
#include "THGenerateAllTypes.h"
+#include "generic/THTensorCopy.h"
+#include "THGenerateHalfType.h"
+
#include "THTensorMacros.h"
/* random numbers */
diff --git a/lib/TH/THVector.c b/lib/TH/THVector.c
index 6179d89..907adbb 100644
--- a/lib/TH/THVector.c
+++ b/lib/TH/THVector.c
@@ -1,10 +1,15 @@
#include "THVector.h"
+
#include "generic/simd/simd.h"
#ifdef __NEON__
#include "vector/NEON.c"
#endif
+#ifdef __PPC64__
+#include "vector/VSX.c"
+#endif
+
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#include "vector/SSE.c"
diff --git a/lib/TH/generic/THBlas.c b/lib/TH/generic/THBlas.c
index 6452f94..371df4d 100644
--- a/lib/TH/generic/THBlas.c
+++ b/lib/TH/generic/THBlas.c
@@ -2,6 +2,7 @@
#define TH_GENERIC_FILE "generic/THBlas.c"
#else
+
#ifdef BLAS_F2C
# define ffloat double
#else
@@ -24,8 +25,8 @@ TH_EXTERNC void dger_(int *m, int *n, double *alpha, double *x, int *incx, doubl
TH_EXTERNC void sger_(int *m, int *n, float *alpha, float *x, int *incx, float *y, int *incy, float *a, int *lda);
TH_EXTERNC void dgemm_(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc);
TH_EXTERNC void sgemm_(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *a, int *lda, float *b, int *ldb, float *beta, float *c, int *ldc);
-
-
+
+
void THBlas_(swap)(long n, real *x, long incx, real *y, long incy)
{
@@ -182,9 +183,9 @@ void THBlas_(gemv)(char trans, long m, long n, real alpha, real *a, long lda, re
{
if(n == 1)
lda = m;
-
+
#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
- if( (m <= INT_MAX) && (n <= INT_MAX) &&
+ if( (m <= INT_MAX) && (n <= INT_MAX) &&
(lda > 0) && (lda <= INT_MAX) &&
(incx > 0) && (incx <= INT_MAX) &&
(incy > 0) && (incy <= INT_MAX) )
@@ -224,7 +225,7 @@ void THBlas_(gemv)(char trans, long m, long n, real alpha, real *a, long lda, re
{
if(beta != 1)
THBlas_(scal)(m, beta, y, incy);
-
+
for(j = 0; j < n; j++)
{
real *column_ = a+lda*j;
diff --git a/lib/TH/generic/THStorageCopy.c b/lib/TH/generic/THStorageCopy.c
index 583e088..ce4b57e 100644
--- a/lib/TH/generic/THStorageCopy.c
+++ b/lib/TH/generic/THStorageCopy.c
@@ -15,16 +15,42 @@ void THStorage_(copy)(THStorage *storage, THStorage *src)
THStorage_(rawCopy)(storage, src->data);
}
-
#define IMPLEMENT_THStorage_COPY(TYPENAMESRC) \
void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \
{ \
- ptrdiff_t i; \
+ ptrdiff_t i; \
+ for(i = 0; i < storage->size; i++) \
+ storage->data[i] = (real)src->data[i]; \
+}
+
+#define IMPLEMENT_THStorage_COPY_FROM_HALF(TYPENAMESRC) \
+void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \
+{ \
+ THArgCheck(storage->size == src->size, 2, "size mismatch"); \
+ ptrdiff_t i; \
+ for(i = 0; i < storage->size; i++) \
+ storage->data[i] = (real)TH_half2float(src->data[i]); \
+}
+
+#define IMPLEMENT_THStorage_COPY_TO_HALF(TYPENAMESRC) \
+void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \
+{ \
THArgCheck(storage->size == src->size, 2, "size mismatch"); \
- for(i = 0; i < storage->size; i++) \
- storage->data[i] = (real)src->data[i]; \
+ ptrdiff_t i; \
+ for(i = 0; i < storage->size; i++) \
+ storage->data[i] = TH_float2half((float)(src->data[i])); \
}
+#define IMPLEMENT_THStorage_COPY_TO_FROM_HALF(TYPENAMESRC) \
+void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \
+{ \
+ THArgCheck(storage->size == src->size, 2, "size mismatch"); \
+ ptrdiff_t i; \
+ for(i = 0; i < storage->size; i++) \
+ storage->data[i] = src->data[i]; \
+}
+
+#ifndef TH_REAL_IS_HALF
IMPLEMENT_THStorage_COPY(Byte)
IMPLEMENT_THStorage_COPY(Char)
IMPLEMENT_THStorage_COPY(Short)
@@ -32,5 +58,18 @@ IMPLEMENT_THStorage_COPY(Int)
IMPLEMENT_THStorage_COPY(Long)
IMPLEMENT_THStorage_COPY(Float)
IMPLEMENT_THStorage_COPY(Double)
+IMPLEMENT_THStorage_COPY_FROM_HALF(Half)
+#else
+/* only allow pass-through for Half */
+IMPLEMENT_THStorage_COPY_TO_FROM_HALF(Half)
+IMPLEMENT_THStorage_COPY_TO_HALF(Byte)
+IMPLEMENT_THStorage_COPY_TO_HALF(Char)
+IMPLEMENT_THStorage_COPY_TO_HALF(Short)
+IMPLEMENT_THStorage_COPY_TO_HALF(Int)
+IMPLEMENT_THStorage_COPY_TO_HALF(Long)
+IMPLEMENT_THStorage_COPY_TO_HALF(Float)
+IMPLEMENT_THStorage_COPY_TO_HALF(Double)
+#endif
+
#endif
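
Under `TH_REAL_IS_HALF`, every source type is funneled through `TH_float2half` and only half-to-half copies move bits unchanged, so values outside the binary16 range saturate on the way in. A sketch of the per-element semantics the macros expand to (same linking assumption as above):

```c
#include <stdio.h>
#include "THHalf.h"

int main(void) {
  /* what IMPLEMENT_THStorage_COPY_TO_HALF(Float) does per element */
  float src[3] = {0.5f, 1.5f, 100000.0f};
  THHalf dst[3];
  for (int i = 0; i < 3; i++)
    dst[i] = TH_float2half(src[i]);
  /* reading back goes through TH_half2float; 1e5 > 65504 saturates */
  for (int i = 0; i < 3; i++)
    printf("%g ", TH_half2float(dst[i])); /* 0.5 1.5 inf */
  printf("\n");
  return 0;
}
```
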
diff --git a/lib/TH/generic/THStorageCopy.h b/lib/TH/generic/THStorageCopy.h
index f853a82..ce8a2a6 100644
--- a/lib/TH/generic/THStorageCopy.h
+++ b/lib/TH/generic/THStorageCopy.h
@@ -13,5 +13,6 @@ TH_API void THStorage_(copyInt)(THStorage *storage, struct THIntStorage *src);
TH_API void THStorage_(copyLong)(THStorage *storage, struct THLongStorage *src);
TH_API void THStorage_(copyFloat)(THStorage *storage, struct THFloatStorage *src);
TH_API void THStorage_(copyDouble)(THStorage *storage, struct THDoubleStorage *src);
+TH_API void THStorage_(copyHalf)(THStorage *storage, struct THHalfStorage *src);
#endif
diff --git a/lib/TH/generic/THTensorConv.c b/lib/TH/generic/THTensorConv.c
index d98a2aa..1e21991 100644
--- a/lib/TH/generic/THTensorConv.c
+++ b/lib/TH/generic/THTensorConv.c
@@ -2,7 +2,6 @@
#define TH_GENERIC_FILE "generic/THTensorConv.c"
#else
-
/*
2D Input, 2D kernel : convolve given image with the given kernel.
*/
@@ -775,7 +774,7 @@ void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_,
real *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows;
/* get input */
real *ptr_input = input_data + p*istride0 + i*istride1;
-
+
/* do image, kernel convolution */
THTensor_(validXCorr2DRevptr)(ptr_output,
alpha,
@@ -1174,7 +1173,7 @@ void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe
real *ptr_weight = weight_data + k*kstride0 + i*kstride1;
/* get input */
real *ptr_input = input_data + p*nInputPlane*nInputRows*nInputCols + i*nInputRows*nInputCols;
-
+
/* do image, kernel convolution */
if (*vf == 'F')
if (*xc == 'X')
@@ -1955,5 +1954,4 @@ void THTensor_(conv3Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THT
THTensor_(free)(input);
THTensor_(free)(kernel);
}
-
#endif
diff --git a/lib/TH/generic/THTensorConv.h b/lib/TH/generic/THTensorConv.h
index d215fcd..79866f3 100644
--- a/lib/TH/generic/THTensorConv.h
+++ b/lib/TH/generic/THTensorConv.h
@@ -2,7 +2,6 @@
#define TH_GENERIC_FILE "generic/THTensorConv.h"
#else
-
TH_API void THTensor_(validXCorr2Dptr)(real *r_,
real alpha,
real *t_, long ir, long ic,
@@ -66,7 +65,7 @@ TH_API void THTensor_(fullConv3Dptr)(real *r_,
long st, long sr, long sc);
TH_API void THTensor_(validXCorr3DRevptr)(real *r_,
- real alpha,
+ real alpha,
real *t_, long it, long ir, long ic,
real *k_, long kt, long kr, long kc,
long st, long sr, long sc);
diff --git a/lib/TH/generic/THTensorCopy.c b/lib/TH/generic/THTensorCopy.c
index ea6d6f1..d7228f6 100644
--- a/lib/TH/generic/THTensorCopy.c
+++ b/lib/TH/generic/THTensorCopy.c
@@ -13,6 +13,19 @@ void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src
TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, *tensor_data = (real)(*src_data);) \
}
+#define IMPLEMENT_THTensor_COPY_TO_HALF(TYPENAMESRC, TYPE_SRC) \
+void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \
+{ \
+ TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, *tensor_data = TH_float2half((float)*src_data);) \
+}
+
+#define IMPLEMENT_THTensor_COPY_FROM_HALF(TYPENAMESRC, TYPE_SRC) \
+void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \
+{ \
+ TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, *tensor_data = (real)TH_half2float(*src_data);) \
+}
+
+#ifndef TH_REAL_IS_HALF
IMPLEMENT_THTensor_COPY(Byte, unsigned char)
IMPLEMENT_THTensor_COPY(Char, char)
IMPLEMENT_THTensor_COPY(Short, short)
@@ -20,5 +33,18 @@ IMPLEMENT_THTensor_COPY(Int, int)
IMPLEMENT_THTensor_COPY(Long, long)
IMPLEMENT_THTensor_COPY(Float, float)
IMPLEMENT_THTensor_COPY(Double, double)
+IMPLEMENT_THTensor_COPY_FROM_HALF(Half, THHalf)
+#else
+/* only allow pass-through for Half */
+IMPLEMENT_THTensor_COPY(Half, THHalf)
+IMPLEMENT_THTensor_COPY_TO_HALF(Byte, unsigned char)
+IMPLEMENT_THTensor_COPY_TO_HALF(Char, char)
+IMPLEMENT_THTensor_COPY_TO_HALF(Short, short)
+IMPLEMENT_THTensor_COPY_TO_HALF(Int, int)
+IMPLEMENT_THTensor_COPY_TO_HALF(Long, long)
+IMPLEMENT_THTensor_COPY_TO_HALF(Float, float)
+IMPLEMENT_THTensor_COPY_TO_HALF(Double, double)
+
+#endif /* TH_REAL_IS_HALF */
#endif
diff --git a/lib/TH/generic/THTensorCopy.h b/lib/TH/generic/THTensorCopy.h
index 8d03b22..b9e5bfc 100644
--- a/lib/TH/generic/THTensorCopy.h
+++ b/lib/TH/generic/THTensorCopy.h
@@ -12,5 +12,6 @@ TH_API void THTensor_(copyInt)(THTensor *tensor, struct THIntTensor *src);
TH_API void THTensor_(copyLong)(THTensor *tensor, struct THLongTensor *src);
TH_API void THTensor_(copyFloat)(THTensor *tensor, struct THFloatTensor *src);
TH_API void THTensor_(copyDouble)(THTensor *tensor, struct THDoubleTensor *src);
+TH_API void THTensor_(copyHalf)(THTensor *tensor, struct THHalfTensor *src);
#endif
diff --git a/lib/TH/generic/THTensorMath.c b/lib/TH/generic/THTensorMath.c
index 9fc1577..78ee056 100644
--- a/lib/TH/generic/THTensorMath.c
+++ b/lib/TH/generic/THTensorMath.c
@@ -98,10 +98,15 @@ void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor)
long i = 0;
long dim;
long div = 1;
+#ifdef TH_REAL_IS_HALF
+#define IS_NONZERO(val) (TH_half2float(val)!=0)
+#else
+#define IS_NONZERO(val) ((val)!=0)
+#endif
/* First Pass to determine size of subscripts */
TH_TENSOR_APPLY(real, tensor,
- if (*tensor_data != 0) {
+ if IS_NONZERO(*tensor_data) {
++numel;
});
#ifdef DEBUG
@@ -112,7 +117,7 @@ void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor)
/* Second pass populates subscripts */
subscript_data = THLongTensor_data(subscript);
TH_TENSOR_APPLY(real, tensor,
- if (*tensor_data != 0) {
+ if IS_NONZERO(*tensor_data) {
div = 1;
for (dim = tensor->nDimension - 1; dim >= 0; dim--) {
@@ -396,6 +401,7 @@ accreal THTensor_(dot)(THTensor *tensor, THTensor *src)
return sum;
}
+
#undef th_isnan
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
#define th_isnan(val) \
@@ -2596,5 +2602,45 @@ void THTensor_(histc)(THTensor *hist, THTensor *tensor, long nbins, real minvalu
);
}
+void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue)
+{
+   THArgCheck(THTensor_(nDimension)(tensor) < 3, 2, "invalid number of dimensions %d, the input must be a 2d tensor", THTensor_(nDimension)(tensor));
+
+ int dimension = 1;
+ THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(tensor), 2, "invalid dimension %d",
+ dimension + TH_INDEX_BASE);
+
+ real minval;
+ real maxval;
+ real *h_data;
+
+ THTensor_(resize2d)(hist, tensor->size[0], nbins);
+ THTensor_(zero)(hist);
+
+ minval = minvalue;
+ maxval = maxvalue;
+ if (minval == maxval)
+ {
+ minval = THTensor_(minall)(tensor);
+ maxval = THTensor_(maxall)(tensor);
+ }
+ if (minval == maxval)
+ {
+ minval = minval - 1;
+ maxval = maxval + 1;
+ }
+
+ TH_TENSOR_DIM_APPLY2(real, tensor, real, hist, dimension, long i;
+ for(i = 0; i < tensor_size; i++)
+ {
+ if(tensor_data[i*tensor_stride] >= minval && tensor_data[i*tensor_stride] <= maxval) {
+ const int bin = (int)((tensor_data[i*tensor_stride]-minval) / (maxval-minval) * nbins);
+ hist_data[THMin(bin, nbins-1)] += 1;
+ }
+ }
+ );
+}
+
#endif /* floating point only part */
+#undef IS_NONZERO
#endif
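
`bhistc` applies `histc`'s binning rule independently to each row: a value `v` falls into bin `(int)((v - min)/(max - min) * nbins)`, and the `THMin` clamp keeps `v == max` inside the top bin. A stand-alone check of that arithmetic against the first row of the doc/maths.md example:

```c
#include <stdio.h>

int main(void) {
  double row[6] = {2, 4, 2, 2, 5, 4}; /* x[1] from the maths.md example */
  long nbins = 5;
  double minval = 1.0, maxval = 5.0;
  long hist[5] = {0};
  for (int i = 0; i < 6; i++) {
    if (row[i] >= minval && row[i] <= maxval) {
      int bin = (int)((row[i] - minval) / (maxval - minval) * nbins);
      hist[bin < nbins - 1 ? bin : nbins - 1] += 1; /* the THMin clamp */
    }
  }
  for (int b = 0; b < nbins; b++)
    printf("%ld ", hist[b]); /* 0 3 0 2 1, matching torch.bhistc(x, 5, 1, 5) */
  printf("\n");
  return 0;
}
```
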
diff --git a/lib/TH/generic/THTensorMath.h b/lib/TH/generic/THTensorMath.h
index 87f1616..c656dfd 100644
--- a/lib/TH/generic/THTensorMath.h
+++ b/lib/TH/generic/THTensorMath.h
@@ -2,8 +2,6 @@
#define TH_GENERIC_FILE "generic/THTensorMath.h"
#else
-
-
TH_API void THTensor_(fill)(THTensor *r_, real value);
TH_API void THTensor_(zero)(THTensor *r_);
@@ -163,6 +161,7 @@ TH_API void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension
TH_API void THTensor_(renorm)(THTensor *r_, THTensor *t, real value, int dimension, real maxnorm);
TH_API accreal THTensor_(dist)(THTensor *a, THTensor *b, real value);
TH_API void THTensor_(histc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue);
+TH_API void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue);
TH_API accreal THTensor_(meanall)(THTensor *self);
TH_API accreal THTensor_(varall)(THTensor *self);
@@ -173,7 +172,6 @@ TH_API void THTensor_(linspace)(THTensor *r_, real a, real b, long n);
TH_API void THTensor_(logspace)(THTensor *r_, real a, real b, long n);
TH_API void THTensor_(rand)(THTensor *r_, THGenerator *_generator, THLongStorage *size);
TH_API void THTensor_(randn)(THTensor *r_, THGenerator *_generator, THLongStorage *size);
-
#endif
#if defined(TH_REAL_IS_BYTE)
diff --git a/lib/TH/generic/THVectorDispatch.c b/lib/TH/generic/THVectorDispatch.c
index 6fd1d68..2436a12 100644
--- a/lib/TH/generic/THVectorDispatch.c
+++ b/lib/TH/generic/THVectorDispatch.c
@@ -20,6 +20,12 @@ static FunctionDescription THVector_(fill_DISPATCHTABLE)[] = {
#endif
#endif
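+  /* On PPC64, prefer the hand-written VSX kernels for float and double. */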
+ #if defined(__PPC64__)
+ #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(fill_VSX), SIMDExtension_VSX),
+ #endif
+ #endif
+
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
@@ -32,7 +38,6 @@ void THVector_(fill)(real *x, const real c, const ptrdiff_t n) {
THVector_(fill_DISPATCHPTR)(x, c, n);
}
-
static void (*THVector_(add_DISPATCHPTR))(real *, const real *, const real, const ptrdiff_t) = &THVector_(add_DEFAULT);
static FunctionDescription THVector_(add_DISPATCHTABLE)[] = {
#if defined(__NEON__)
@@ -41,6 +46,12 @@ static FunctionDescription THVector_(add_DISPATCHTABLE)[] = {
#endif
#endif
+ #if defined(__PPC64__)
+ #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(add_VSX), SIMDExtension_VSX),
+ #endif
+ #endif
+
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
@@ -63,6 +74,12 @@ static FunctionDescription THVector_(diff_DISPATCHTABLE)[] = {
#endif
#endif
+ #if defined(__PPC64__)
+ #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(diff_VSX), SIMDExtension_VSX),
+ #endif
+ #endif
+
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
@@ -85,6 +102,12 @@ static FunctionDescription THVector_(scale_DISPATCHTABLE)[] = {
#endif
#endif
+ #if defined(__PPC64__)
+ #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(scale_VSX), SIMDExtension_VSX),
+ #endif
+ #endif
+
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
@@ -107,6 +130,12 @@ static FunctionDescription THVector_(mul_DISPATCHTABLE)[] = {
#endif
#endif
+ #if defined(__PPC64__)
+ #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(mul_VSX), SIMDExtension_VSX),
+ #endif
+ #endif
+
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
diff --git a/lib/TH/generic/simd/simd.h b/lib/TH/generic/simd/simd.h
index b059258..ee53e2c 100644
--- a/lib/TH/generic/simd/simd.h
+++ b/lib/TH/generic/simd/simd.h
@@ -2,8 +2,10 @@
#define TH_SIMD_INC
#include <stdint.h>
-#ifdef _MSC_VER
+#if defined(_MSC_VER)
#include <intrin.h>
+#elif defined(HAVE_GCC_GET_CPUID)
+#include <cpuid.h>
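+/* HAVE_GCC_GET_CPUID is presumably set by the build configuration when
+   cpuid.h provides __get_cpuid(); see the accompanying CMakeLists.txt change. */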
#endif
// Can be found on Intel ISA Reference for CPUID
@@ -40,6 +42,8 @@ enum SIMDExtensions
{
#if defined(__NEON__)
SIMDExtension_NEON = 0x1,
+#elif defined(__PPC64__)
+ SIMDExtension_VSX = 0x1,
#else
SIMDExtension_AVX2 = 0x1,
SIMDExtension_AVX = 0x2,
@@ -48,26 +52,57 @@ enum SIMDExtensions
SIMDExtension_DEFAULT = 0x0
};
-#if defined(__NEON__)
+
+#if defined(__arm__)
+
+ #if defined(__NEON__)
static inline uint32_t detectHostSIMDExtensions()
{
return SIMDExtension_NEON;
}
-#else // x86
+ #else //ARM without NEON
+
+static inline uint32_t detectHostSIMDExtensions()
+{
+ return SIMDExtension_DEFAULT;
+}
+
+ #endif
+
+#elif defined(__PPC64__)
+
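+/* __VSX__ is defined by the compiler when VSX code generation is enabled
+   (e.g. -mvsx), so detection on PPC64 is a compile-time decision. */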
+ #if defined(__VSX__)
+
+static inline uint32_t detectHostSIMDExtensions()
+{
+ return SIMDExtension_VSX;
+}
-#if defined(__arm__) //ARM without NEON
+ #else
static inline uint32_t detectHostSIMDExtensions()
{
return SIMDExtension_DEFAULT;
}
+
+ #endif
-#else
+#else // x86
static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
-#ifndef _MSC_VER
+#if defined(_MSC_VER)
+ uint32_t cpuInfo[4];
+ __cpuid(cpuInfo, *eax);
+ *eax = cpuInfo[0];
+ *ebx = cpuInfo[1];
+ *ecx = cpuInfo[2];
+ *edx = cpuInfo[3];
+#elif defined(HAVE_GCC_GET_CPUID)
+ uint32_t level = *eax;
+ __get_cpuid (level, eax, ebx, ecx, edx);
+#else
uint32_t a = *eax, b, c, d;
asm volatile ( "cpuid\n\t"
: "+a"(a), "=b"(b), "=c"(c), "=d"(d) );
@@ -75,13 +110,6 @@ static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *
*ebx = b;
*ecx = c;
*edx = d;
-#else
- uint32_t cpuInfo[4];
- __cpuid(cpuInfo, *eax);
- *eax = cpuInfo[0];
- *ebx = cpuInfo[1];
- *ecx = cpuInfo[2];
- *edx = cpuInfo[3];
#endif
}
@@ -106,7 +134,6 @@ static inline uint32_t detectHostSIMDExtensions()
return hostSimdExts;
}
-#endif // end x86 SIMD extension detection code
-#endif // end __arm__
+#endif // end SIMD extension detection code
#endif
diff --git a/lib/TH/vector/VSX.c b/lib/TH/vector/VSX.c
new file mode 100644
index 0000000..14f14a7
--- /dev/null
+++ b/lib/TH/vector/VSX.c
@@ -0,0 +1,1915 @@
+#ifdef __PPC64__
+
+#include <altivec.h>
+#include <stddef.h>
+
+
+//--------------------------------------------------------------------------------------------------
+// THDoubleVector_fill_VSX was tested on Power8:
+//
+// Unrolling 128 elements is 20% faster than unrolling 64 elements.
+// Unrolling 64 elements is faster than unrolling any lesser number of elements.
+//--------------------------------------------------------------------------------------------------
+static void THDoubleVector_fill_VSX(double *x, const double c, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+
+ double val[2] = {c, c};
+ vector double fp64vec2 = vec_xl(0, val);
+
+ for (i = 0; i <= n-128; i += 128)
+ {
+ vec_xst(fp64vec2, 0, x+(i ));
+ vec_xst(fp64vec2, 0, x+(i+2 ));
+ vec_xst(fp64vec2, 0, x+(i+4 ));
+ vec_xst(fp64vec2, 0, x+(i+6 ));
+ vec_xst(fp64vec2, 0, x+(i+8 ));
+ vec_xst(fp64vec2, 0, x+(i+10 ));
+ vec_xst(fp64vec2, 0, x+(i+12 ));
+ vec_xst(fp64vec2, 0, x+(i+14 ));
+ vec_xst(fp64vec2, 0, x+(i+16 ));
+ vec_xst(fp64vec2, 0, x+(i+18 ));
+ vec_xst(fp64vec2, 0, x+(i+20 ));
+ vec_xst(fp64vec2, 0, x+(i+22 ));
+ vec_xst(fp64vec2, 0, x+(i+24 ));
+ vec_xst(fp64vec2, 0, x+(i+26 ));
+ vec_xst(fp64vec2, 0, x+(i+28 ));
+ vec_xst(fp64vec2, 0, x+(i+30 ));
+ vec_xst(fp64vec2, 0, x+(i+32 ));
+ vec_xst(fp64vec2, 0, x+(i+34 ));
+ vec_xst(fp64vec2, 0, x+(i+36 ));
+ vec_xst(fp64vec2, 0, x+(i+38 ));
+ vec_xst(fp64vec2, 0, x+(i+40 ));
+ vec_xst(fp64vec2, 0, x+(i+42 ));
+ vec_xst(fp64vec2, 0, x+(i+44 ));
+ vec_xst(fp64vec2, 0, x+(i+46 ));
+ vec_xst(fp64vec2, 0, x+(i+48 ));
+ vec_xst(fp64vec2, 0, x+(i+50 ));
+ vec_xst(fp64vec2, 0, x+(i+52 ));
+ vec_xst(fp64vec2, 0, x+(i+54 ));
+ vec_xst(fp64vec2, 0, x+(i+56 ));
+ vec_xst(fp64vec2, 0, x+(i+58 ));
+ vec_xst(fp64vec2, 0, x+(i+60 ));
+ vec_xst(fp64vec2, 0, x+(i+62 ));
+ vec_xst(fp64vec2, 0, x+(i+64 ));
+ vec_xst(fp64vec2, 0, x+(i+66 ));
+ vec_xst(fp64vec2, 0, x+(i+68 ));
+ vec_xst(fp64vec2, 0, x+(i+70 ));
+ vec_xst(fp64vec2, 0, x+(i+72 ));
+ vec_xst(fp64vec2, 0, x+(i+74 ));
+ vec_xst(fp64vec2, 0, x+(i+76 ));
+ vec_xst(fp64vec2, 0, x+(i+78 ));
+ vec_xst(fp64vec2, 0, x+(i+80 ));
+ vec_xst(fp64vec2, 0, x+(i+82 ));
+ vec_xst(fp64vec2, 0, x+(i+84 ));
+ vec_xst(fp64vec2, 0, x+(i+86 ));
+ vec_xst(fp64vec2, 0, x+(i+88 ));
+ vec_xst(fp64vec2, 0, x+(i+90 ));
+ vec_xst(fp64vec2, 0, x+(i+92 ));
+ vec_xst(fp64vec2, 0, x+(i+94 ));
+ vec_xst(fp64vec2, 0, x+(i+96 ));
+ vec_xst(fp64vec2, 0, x+(i+98 ));
+ vec_xst(fp64vec2, 0, x+(i+100));
+ vec_xst(fp64vec2, 0, x+(i+102));
+ vec_xst(fp64vec2, 0, x+(i+104));
+ vec_xst(fp64vec2, 0, x+(i+106));
+ vec_xst(fp64vec2, 0, x+(i+108));
+ vec_xst(fp64vec2, 0, x+(i+110));
+ vec_xst(fp64vec2, 0, x+(i+112));
+ vec_xst(fp64vec2, 0, x+(i+114));
+ vec_xst(fp64vec2, 0, x+(i+116));
+ vec_xst(fp64vec2, 0, x+(i+118));
+ vec_xst(fp64vec2, 0, x+(i+120));
+ vec_xst(fp64vec2, 0, x+(i+122));
+ vec_xst(fp64vec2, 0, x+(i+124));
+ vec_xst(fp64vec2, 0, x+(i+126));
+ }
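+  // Tail handling: 16-element blocks, then single vectors, then scalar stores.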
+ for (; i <= n-16; i += 16)
+ {
+ vec_xst(fp64vec2, 0, x+(i ));
+ vec_xst(fp64vec2, 0, x+(i+2 ));
+ vec_xst(fp64vec2, 0, x+(i+4 ));
+ vec_xst(fp64vec2, 0, x+(i+6 ));
+ vec_xst(fp64vec2, 0, x+(i+8 ));
+ vec_xst(fp64vec2, 0, x+(i+10 ));
+ vec_xst(fp64vec2, 0, x+(i+12 ));
+ vec_xst(fp64vec2, 0, x+(i+14 ));
+ }
+ for (; i <= n-2; i += 2)
+ vec_xst(fp64vec2, 0, x+(i ));
+ for (; i < n; i++)
+ x[i] = c;
+}
+
+
+//--------------------------------------------------------------------------------------------------
+// THDoubleVector_add_VSX was tested on Power8:
+//
+// Max speedup achieved when unrolling 24 elements.
+// When unrolling 32 elements, the performance was the same as for 24.
+// When unrolling 16 elements, performance was not as good as for 24.
+// Unrolling 24 elements was 43% faster than unrolling 4 elements (2.8 sec vs 4.0 sec).
+// Unrolling 24 elements was about 8% faster than unrolling 16 elements (2.8 sec vs 3.0 sec).
+//--------------------------------------------------------------------------------------------------
+static void THDoubleVector_add_VSX(double *y, const double *x, const double c, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+ vector double c_fp64vec2;
+ vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
+ vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
+ vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2;
+ vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2;
+
+ double val[2] = {c, c};
+ c_fp64vec2 = vec_xl(0, val);
+
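+  // vec_madd(a, b, c) is a fused multiply-add: each lane computes a*b + c.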
+ for (i = 0; i <= n-24; i += 24)
+ {
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ x1_fp64vec2 = vec_xl(0, x+(i+2 ));
+ x2_fp64vec2 = vec_xl(0, x+(i+4 ));
+ x3_fp64vec2 = vec_xl(0, x+(i+6 ));
+ x4_fp64vec2 = vec_xl(0, x+(i+8 ));
+ x5_fp64vec2 = vec_xl(0, x+(i+10));
+ x6_fp64vec2 = vec_xl(0, x+(i+12));
+ x7_fp64vec2 = vec_xl(0, x+(i+14));
+ x8_fp64vec2 = vec_xl(0, x+(i+16));
+ x9_fp64vec2 = vec_xl(0, x+(i+18));
+ x10_fp64vec2 = vec_xl(0, x+(i+20));
+ x11_fp64vec2 = vec_xl(0, x+(i+22));
+
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y1_fp64vec2 = vec_xl(0, y+(i+2 ));
+ y2_fp64vec2 = vec_xl(0, y+(i+4 ));
+ y3_fp64vec2 = vec_xl(0, y+(i+6 ));
+ y4_fp64vec2 = vec_xl(0, y+(i+8 ));
+ y5_fp64vec2 = vec_xl(0, y+(i+10));
+ y6_fp64vec2 = vec_xl(0, y+(i+12));
+ y7_fp64vec2 = vec_xl(0, y+(i+14));
+ y8_fp64vec2 = vec_xl(0, y+(i+16));
+ y9_fp64vec2 = vec_xl(0, y+(i+18));
+ y10_fp64vec2 = vec_xl(0, y+(i+20));
+ y11_fp64vec2 = vec_xl(0, y+(i+22));
+
+ y0_fp64vec2 = vec_madd(c_fp64vec2, x0_fp64vec2, y0_fp64vec2 );
+ y1_fp64vec2 = vec_madd(c_fp64vec2, x1_fp64vec2, y1_fp64vec2 );
+ y2_fp64vec2 = vec_madd(c_fp64vec2, x2_fp64vec2, y2_fp64vec2 );
+ y3_fp64vec2 = vec_madd(c_fp64vec2, x3_fp64vec2, y3_fp64vec2 );
+ y4_fp64vec2 = vec_madd(c_fp64vec2, x4_fp64vec2, y4_fp64vec2 );
+ y5_fp64vec2 = vec_madd(c_fp64vec2, x5_fp64vec2, y5_fp64vec2 );
+ y6_fp64vec2 = vec_madd(c_fp64vec2, x6_fp64vec2, y6_fp64vec2 );
+ y7_fp64vec2 = vec_madd(c_fp64vec2, x7_fp64vec2, y7_fp64vec2 );
+ y8_fp64vec2 = vec_madd(c_fp64vec2, x8_fp64vec2, y8_fp64vec2 );
+ y9_fp64vec2 = vec_madd(c_fp64vec2, x9_fp64vec2, y9_fp64vec2 );
+ y10_fp64vec2 = vec_madd(c_fp64vec2, x10_fp64vec2, y10_fp64vec2);
+ y11_fp64vec2 = vec_madd(c_fp64vec2, x11_fp64vec2, y11_fp64vec2);
+
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ vec_xst(y1_fp64vec2, 0, y+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, y+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, y+(i+6 ));
+ vec_xst(y4_fp64vec2, 0, y+(i+8 ));
+ vec_xst(y5_fp64vec2, 0, y+(i+10));
+ vec_xst(y6_fp64vec2, 0, y+(i+12));
+ vec_xst(y7_fp64vec2, 0, y+(i+14));
+ vec_xst(y8_fp64vec2, 0, y+(i+16));
+ vec_xst(y9_fp64vec2, 0, y+(i+18));
+ vec_xst(y10_fp64vec2, 0, y+(i+20));
+ vec_xst(y11_fp64vec2, 0, y+(i+22));
+ }
+ for (; i <= n-8; i += 8)
+ {
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ x1_fp64vec2 = vec_xl(0, x+(i+2 ));
+ x2_fp64vec2 = vec_xl(0, x+(i+4 ));
+ x3_fp64vec2 = vec_xl(0, x+(i+6 ));
+
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y1_fp64vec2 = vec_xl(0, y+(i+2 ));
+ y2_fp64vec2 = vec_xl(0, y+(i+4 ));
+ y3_fp64vec2 = vec_xl(0, y+(i+6 ));
+
+ y0_fp64vec2 = vec_madd(c_fp64vec2, x0_fp64vec2, y0_fp64vec2 );
+ y1_fp64vec2 = vec_madd(c_fp64vec2, x1_fp64vec2, y1_fp64vec2 );
+ y2_fp64vec2 = vec_madd(c_fp64vec2, x2_fp64vec2, y2_fp64vec2 );
+ y3_fp64vec2 = vec_madd(c_fp64vec2, x3_fp64vec2, y3_fp64vec2 );
+
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ vec_xst(y1_fp64vec2, 0, y+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, y+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, y+(i+6 ));
+ }
+ for (; i <= n-2; i += 2)
+ {
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y0_fp64vec2 = vec_madd(c_fp64vec2, x0_fp64vec2, y0_fp64vec2 );
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ }
+ for (; i < n; i++)
+ y[i] = (c * x[i]) + y[i];
+}
+
+
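+// Computes z[i] = x[i] - y[i]; the xz vectors hold x on load and the
+// difference on store.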
+static void THDoubleVector_diff_VSX(double *z, const double *x, const double *y, const ptrdiff_t n) {
+ ptrdiff_t i;
+
+ vector double xz0_fp64vec2, xz1_fp64vec2, xz2_fp64vec2, xz3_fp64vec2, xz4_fp64vec2, xz5_fp64vec2, xz6_fp64vec2, xz7_fp64vec2;
+ vector double xz8_fp64vec2, xz9_fp64vec2, xz10_fp64vec2, xz11_fp64vec2;
+ vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
+ vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
+
+ for (i = 0; i <= n-24; i += 24)
+ {
+ xz0_fp64vec2 = vec_xl(0, x+(i ));
+ xz1_fp64vec2 = vec_xl(0, x+(i+2 ));
+ xz2_fp64vec2 = vec_xl(0, x+(i+4 ));
+ xz3_fp64vec2 = vec_xl(0, x+(i+6 ));
+ xz4_fp64vec2 = vec_xl(0, x+(i+8 ));
+ xz5_fp64vec2 = vec_xl(0, x+(i+10));
+ xz6_fp64vec2 = vec_xl(0, x+(i+12));
+ xz7_fp64vec2 = vec_xl(0, x+(i+14));
+ xz8_fp64vec2 = vec_xl(0, x+(i+16));
+ xz9_fp64vec2 = vec_xl(0, x+(i+18));
+ xz10_fp64vec2 = vec_xl(0, x+(i+20));
+ xz11_fp64vec2 = vec_xl(0, x+(i+22));
+
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y1_fp64vec2 = vec_xl(0, y+(i+2 ));
+ y2_fp64vec2 = vec_xl(0, y+(i+4 ));
+ y3_fp64vec2 = vec_xl(0, y+(i+6 ));
+ y4_fp64vec2 = vec_xl(0, y+(i+8 ));
+ y5_fp64vec2 = vec_xl(0, y+(i+10));
+ y6_fp64vec2 = vec_xl(0, y+(i+12));
+ y7_fp64vec2 = vec_xl(0, y+(i+14));
+ y8_fp64vec2 = vec_xl(0, y+(i+16));
+ y9_fp64vec2 = vec_xl(0, y+(i+18));
+ y10_fp64vec2 = vec_xl(0, y+(i+20));
+ y11_fp64vec2 = vec_xl(0, y+(i+22));
+
+ xz0_fp64vec2 = vec_sub(xz0_fp64vec2, y0_fp64vec2 );
+ xz1_fp64vec2 = vec_sub(xz1_fp64vec2, y1_fp64vec2 );
+ xz2_fp64vec2 = vec_sub(xz2_fp64vec2, y2_fp64vec2 );
+ xz3_fp64vec2 = vec_sub(xz3_fp64vec2, y3_fp64vec2 );
+ xz4_fp64vec2 = vec_sub(xz4_fp64vec2, y4_fp64vec2 );
+ xz5_fp64vec2 = vec_sub(xz5_fp64vec2, y5_fp64vec2 );
+ xz6_fp64vec2 = vec_sub(xz6_fp64vec2, y6_fp64vec2 );
+ xz7_fp64vec2 = vec_sub(xz7_fp64vec2, y7_fp64vec2 );
+ xz8_fp64vec2 = vec_sub(xz8_fp64vec2, y8_fp64vec2 );
+ xz9_fp64vec2 = vec_sub(xz9_fp64vec2, y9_fp64vec2 );
+ xz10_fp64vec2 = vec_sub(xz10_fp64vec2, y10_fp64vec2);
+ xz11_fp64vec2 = vec_sub(xz11_fp64vec2, y11_fp64vec2);
+
+ vec_xst(xz0_fp64vec2, 0, z+(i ));
+ vec_xst(xz1_fp64vec2, 0, z+(i+2 ));
+ vec_xst(xz2_fp64vec2, 0, z+(i+4 ));
+ vec_xst(xz3_fp64vec2, 0, z+(i+6 ));
+ vec_xst(xz4_fp64vec2, 0, z+(i+8 ));
+ vec_xst(xz5_fp64vec2, 0, z+(i+10));
+ vec_xst(xz6_fp64vec2, 0, z+(i+12));
+ vec_xst(xz7_fp64vec2, 0, z+(i+14));
+ vec_xst(xz8_fp64vec2, 0, z+(i+16));
+ vec_xst(xz9_fp64vec2, 0, z+(i+18));
+ vec_xst(xz10_fp64vec2, 0, z+(i+20));
+ vec_xst(xz11_fp64vec2, 0, z+(i+22));
+ }
+ for (; i <= n-8; i += 8)
+ {
+ xz0_fp64vec2 = vec_xl(0, x+(i ));
+ xz1_fp64vec2 = vec_xl(0, x+(i+2 ));
+ xz2_fp64vec2 = vec_xl(0, x+(i+4 ));
+ xz3_fp64vec2 = vec_xl(0, x+(i+6 ));
+
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y1_fp64vec2 = vec_xl(0, y+(i+2 ));
+ y2_fp64vec2 = vec_xl(0, y+(i+4 ));
+ y3_fp64vec2 = vec_xl(0, y+(i+6 ));
+
+ xz0_fp64vec2 = vec_sub(xz0_fp64vec2, y0_fp64vec2 );
+ xz1_fp64vec2 = vec_sub(xz1_fp64vec2, y1_fp64vec2 );
+ xz2_fp64vec2 = vec_sub(xz2_fp64vec2, y2_fp64vec2 );
+ xz3_fp64vec2 = vec_sub(xz3_fp64vec2, y3_fp64vec2 );
+
+ vec_xst(xz0_fp64vec2, 0, z+(i ));
+ vec_xst(xz1_fp64vec2, 0, z+(i+2 ));
+ vec_xst(xz2_fp64vec2, 0, z+(i+4 ));
+ vec_xst(xz3_fp64vec2, 0, z+(i+6 ));
+ }
+ for (; i <= n-2; i += 2)
+ {
+ xz0_fp64vec2 = vec_xl(0, x+(i ));
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ xz0_fp64vec2 = vec_sub(xz0_fp64vec2, y0_fp64vec2 );
+ vec_xst(xz0_fp64vec2, 0, z+(i ));
+ }
+ for (; i < n; i++)
+ z[i] = x[i] - y[i];
+}
+
+
+static void THDoubleVector_scale_VSX(double *y, const double c, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+
+ vector double c_fp64vec2;
+ double val[2] = {c, c};
+ c_fp64vec2 = vec_xl(0, val);
+
+ vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
+ vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2, y12_fp64vec2, y13_fp64vec2, y14_fp64vec2, y15_fp64vec2;
+
+ for (i = 0; i <= n-32; i += 32)
+ {
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y1_fp64vec2 = vec_xl(0, y+(i+2 ));
+ y2_fp64vec2 = vec_xl(0, y+(i+4 ));
+ y3_fp64vec2 = vec_xl(0, y+(i+6 ));
+ y4_fp64vec2 = vec_xl(0, y+(i+8 ));
+ y5_fp64vec2 = vec_xl(0, y+(i+10));
+ y6_fp64vec2 = vec_xl(0, y+(i+12));
+ y7_fp64vec2 = vec_xl(0, y+(i+14));
+ y8_fp64vec2 = vec_xl(0, y+(i+16));
+ y9_fp64vec2 = vec_xl(0, y+(i+18));
+ y10_fp64vec2 = vec_xl(0, y+(i+20));
+ y11_fp64vec2 = vec_xl(0, y+(i+22));
+ y12_fp64vec2 = vec_xl(0, y+(i+24));
+ y13_fp64vec2 = vec_xl(0, y+(i+26));
+ y14_fp64vec2 = vec_xl(0, y+(i+28));
+ y15_fp64vec2 = vec_xl(0, y+(i+30));
+
+ y0_fp64vec2 = vec_mul(y0_fp64vec2, c_fp64vec2);
+ y1_fp64vec2 = vec_mul(y1_fp64vec2, c_fp64vec2);
+ y2_fp64vec2 = vec_mul(y2_fp64vec2, c_fp64vec2);
+ y3_fp64vec2 = vec_mul(y3_fp64vec2, c_fp64vec2);
+ y4_fp64vec2 = vec_mul(y4_fp64vec2, c_fp64vec2);
+ y5_fp64vec2 = vec_mul(y5_fp64vec2, c_fp64vec2);
+ y6_fp64vec2 = vec_mul(y6_fp64vec2, c_fp64vec2);
+ y7_fp64vec2 = vec_mul(y7_fp64vec2, c_fp64vec2);
+ y8_fp64vec2 = vec_mul(y8_fp64vec2, c_fp64vec2);
+ y9_fp64vec2 = vec_mul(y9_fp64vec2, c_fp64vec2);
+ y10_fp64vec2 = vec_mul(y10_fp64vec2, c_fp64vec2);
+ y11_fp64vec2 = vec_mul(y11_fp64vec2, c_fp64vec2);
+ y12_fp64vec2 = vec_mul(y12_fp64vec2, c_fp64vec2);
+ y13_fp64vec2 = vec_mul(y13_fp64vec2, c_fp64vec2);
+ y14_fp64vec2 = vec_mul(y14_fp64vec2, c_fp64vec2);
+ y15_fp64vec2 = vec_mul(y15_fp64vec2, c_fp64vec2);
+
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ vec_xst(y1_fp64vec2, 0, y+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, y+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, y+(i+6 ));
+ vec_xst(y4_fp64vec2, 0, y+(i+8 ));
+ vec_xst(y5_fp64vec2, 0, y+(i+10));
+ vec_xst(y6_fp64vec2, 0, y+(i+12));
+ vec_xst(y7_fp64vec2, 0, y+(i+14));
+ vec_xst(y8_fp64vec2, 0, y+(i+16));
+ vec_xst(y9_fp64vec2, 0, y+(i+18));
+ vec_xst(y10_fp64vec2, 0, y+(i+20));
+ vec_xst(y11_fp64vec2, 0, y+(i+22));
+ vec_xst(y12_fp64vec2, 0, y+(i+24));
+ vec_xst(y13_fp64vec2, 0, y+(i+26));
+ vec_xst(y14_fp64vec2, 0, y+(i+28));
+ vec_xst(y15_fp64vec2, 0, y+(i+30));
+ }
+ for (; i <= n-8; i += 8)
+ {
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y1_fp64vec2 = vec_xl(0, y+(i+2 ));
+ y2_fp64vec2 = vec_xl(0, y+(i+4 ));
+ y3_fp64vec2 = vec_xl(0, y+(i+6 ));
+
+ y0_fp64vec2 = vec_mul(y0_fp64vec2, c_fp64vec2);
+ y1_fp64vec2 = vec_mul(y1_fp64vec2, c_fp64vec2);
+ y2_fp64vec2 = vec_mul(y2_fp64vec2, c_fp64vec2);
+ y3_fp64vec2 = vec_mul(y3_fp64vec2, c_fp64vec2);
+
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ vec_xst(y1_fp64vec2, 0, y+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, y+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, y+(i+6 ));
+ }
+ for (; i <= n-2; i += 2)
+ {
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y0_fp64vec2 = vec_mul(y0_fp64vec2, c_fp64vec2);
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ }
+ for (; i < n; i++)
+ y[i] = y[i] * c;
+}
+
+
+static void THDoubleVector_mul_VSX(double *y, const double *x, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+
+ vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
+ vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
+ vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2;
+ vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2;
+
+
+ for (i = 0; i <= n-24; i += 24)
+ {
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y1_fp64vec2 = vec_xl(0, y+(i+2 ));
+ y2_fp64vec2 = vec_xl(0, y+(i+4 ));
+ y3_fp64vec2 = vec_xl(0, y+(i+6 ));
+ y4_fp64vec2 = vec_xl(0, y+(i+8 ));
+ y5_fp64vec2 = vec_xl(0, y+(i+10));
+ y6_fp64vec2 = vec_xl(0, y+(i+12));
+ y7_fp64vec2 = vec_xl(0, y+(i+14));
+ y8_fp64vec2 = vec_xl(0, y+(i+16));
+ y9_fp64vec2 = vec_xl(0, y+(i+18));
+ y10_fp64vec2 = vec_xl(0, y+(i+20));
+ y11_fp64vec2 = vec_xl(0, y+(i+22));
+
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ x1_fp64vec2 = vec_xl(0, x+(i+2 ));
+ x2_fp64vec2 = vec_xl(0, x+(i+4 ));
+ x3_fp64vec2 = vec_xl(0, x+(i+6 ));
+ x4_fp64vec2 = vec_xl(0, x+(i+8 ));
+ x5_fp64vec2 = vec_xl(0, x+(i+10));
+ x6_fp64vec2 = vec_xl(0, x+(i+12));
+ x7_fp64vec2 = vec_xl(0, x+(i+14));
+ x8_fp64vec2 = vec_xl(0, x+(i+16));
+ x9_fp64vec2 = vec_xl(0, x+(i+18));
+ x10_fp64vec2 = vec_xl(0, x+(i+20));
+ x11_fp64vec2 = vec_xl(0, x+(i+22));
+
+ y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2);
+ y1_fp64vec2 = vec_mul(y1_fp64vec2, x1_fp64vec2);
+ y2_fp64vec2 = vec_mul(y2_fp64vec2, x2_fp64vec2);
+ y3_fp64vec2 = vec_mul(y3_fp64vec2, x3_fp64vec2);
+ y4_fp64vec2 = vec_mul(y4_fp64vec2, x4_fp64vec2);
+ y5_fp64vec2 = vec_mul(y5_fp64vec2, x5_fp64vec2);
+ y6_fp64vec2 = vec_mul(y6_fp64vec2, x6_fp64vec2);
+ y7_fp64vec2 = vec_mul(y7_fp64vec2, x7_fp64vec2);
+ y8_fp64vec2 = vec_mul(y8_fp64vec2, x8_fp64vec2);
+ y9_fp64vec2 = vec_mul(y9_fp64vec2, x9_fp64vec2);
+ y10_fp64vec2 = vec_mul(y10_fp64vec2, x10_fp64vec2);
+ y11_fp64vec2 = vec_mul(y11_fp64vec2, x11_fp64vec2);
+
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ vec_xst(y1_fp64vec2, 0, y+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, y+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, y+(i+6 ));
+ vec_xst(y4_fp64vec2, 0, y+(i+8 ));
+ vec_xst(y5_fp64vec2, 0, y+(i+10));
+ vec_xst(y6_fp64vec2, 0, y+(i+12));
+ vec_xst(y7_fp64vec2, 0, y+(i+14));
+ vec_xst(y8_fp64vec2, 0, y+(i+16));
+ vec_xst(y9_fp64vec2, 0, y+(i+18));
+ vec_xst(y10_fp64vec2, 0, y+(i+20));
+ vec_xst(y11_fp64vec2, 0, y+(i+22));
+ }
+ for (; i <= n-8; i += 8)
+ {
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y1_fp64vec2 = vec_xl(0, y+(i+2 ));
+ y2_fp64vec2 = vec_xl(0, y+(i+4 ));
+ y3_fp64vec2 = vec_xl(0, y+(i+6 ));
+
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ x1_fp64vec2 = vec_xl(0, x+(i+2 ));
+ x2_fp64vec2 = vec_xl(0, x+(i+4 ));
+ x3_fp64vec2 = vec_xl(0, x+(i+6 ));
+
+ y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2);
+ y1_fp64vec2 = vec_mul(y1_fp64vec2, x1_fp64vec2);
+ y2_fp64vec2 = vec_mul(y2_fp64vec2, x2_fp64vec2);
+ y3_fp64vec2 = vec_mul(y3_fp64vec2, x3_fp64vec2);
+
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ vec_xst(y1_fp64vec2, 0, y+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, y+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, y+(i+6 ));
+ }
+ for (; i <= n-2; i += 2)
+ {
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2);
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ }
+ for (; i < n; i++)
+ y[i] = y[i] * x[i];
+}
+
+
+static void THFloatVector_fill_VSX(float *x, const float c, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+
+ float val[4] = {c, c, c, c};
+ vector float fp32vec4 = vec_xl(0, val);
+
+ for (i = 0; i <= n-256; i += 256)
+ {
+ vec_xst(fp32vec4, 0, x+(i ));
+ vec_xst(fp32vec4, 0, x+(i+4 ));
+ vec_xst(fp32vec4, 0, x+(i+8 ));
+ vec_xst(fp32vec4, 0, x+(i+12 ));
+ vec_xst(fp32vec4, 0, x+(i+16 ));
+ vec_xst(fp32vec4, 0, x+(i+20 ));
+ vec_xst(fp32vec4, 0, x+(i+24 ));
+ vec_xst(fp32vec4, 0, x+(i+28 ));
+ vec_xst(fp32vec4, 0, x+(i+32 ));
+ vec_xst(fp32vec4, 0, x+(i+36 ));
+ vec_xst(fp32vec4, 0, x+(i+40 ));
+ vec_xst(fp32vec4, 0, x+(i+44 ));
+ vec_xst(fp32vec4, 0, x+(i+48 ));
+ vec_xst(fp32vec4, 0, x+(i+52 ));
+ vec_xst(fp32vec4, 0, x+(i+56 ));
+ vec_xst(fp32vec4, 0, x+(i+60 ));
+ vec_xst(fp32vec4, 0, x+(i+64 ));
+ vec_xst(fp32vec4, 0, x+(i+68 ));
+ vec_xst(fp32vec4, 0, x+(i+72 ));
+ vec_xst(fp32vec4, 0, x+(i+76 ));
+ vec_xst(fp32vec4, 0, x+(i+80 ));
+ vec_xst(fp32vec4, 0, x+(i+84 ));
+ vec_xst(fp32vec4, 0, x+(i+88 ));
+ vec_xst(fp32vec4, 0, x+(i+92 ));
+ vec_xst(fp32vec4, 0, x+(i+96 ));
+ vec_xst(fp32vec4, 0, x+(i+100));
+ vec_xst(fp32vec4, 0, x+(i+104));
+ vec_xst(fp32vec4, 0, x+(i+108));
+ vec_xst(fp32vec4, 0, x+(i+112));
+ vec_xst(fp32vec4, 0, x+(i+116));
+ vec_xst(fp32vec4, 0, x+(i+120));
+ vec_xst(fp32vec4, 0, x+(i+124));
+ vec_xst(fp32vec4, 0, x+(i+128));
+ vec_xst(fp32vec4, 0, x+(i+132));
+ vec_xst(fp32vec4, 0, x+(i+136));
+ vec_xst(fp32vec4, 0, x+(i+140));
+ vec_xst(fp32vec4, 0, x+(i+144));
+ vec_xst(fp32vec4, 0, x+(i+148));
+ vec_xst(fp32vec4, 0, x+(i+152));
+ vec_xst(fp32vec4, 0, x+(i+156));
+ vec_xst(fp32vec4, 0, x+(i+160));
+ vec_xst(fp32vec4, 0, x+(i+164));
+ vec_xst(fp32vec4, 0, x+(i+168));
+ vec_xst(fp32vec4, 0, x+(i+172));
+ vec_xst(fp32vec4, 0, x+(i+176));
+ vec_xst(fp32vec4, 0, x+(i+180));
+ vec_xst(fp32vec4, 0, x+(i+184));
+ vec_xst(fp32vec4, 0, x+(i+188));
+ vec_xst(fp32vec4, 0, x+(i+192));
+ vec_xst(fp32vec4, 0, x+(i+196));
+ vec_xst(fp32vec4, 0, x+(i+200));
+ vec_xst(fp32vec4, 0, x+(i+204));
+ vec_xst(fp32vec4, 0, x+(i+208));
+ vec_xst(fp32vec4, 0, x+(i+212));
+ vec_xst(fp32vec4, 0, x+(i+216));
+ vec_xst(fp32vec4, 0, x+(i+220));
+ vec_xst(fp32vec4, 0, x+(i+224));
+ vec_xst(fp32vec4, 0, x+(i+228));
+ vec_xst(fp32vec4, 0, x+(i+232));
+ vec_xst(fp32vec4, 0, x+(i+236));
+ vec_xst(fp32vec4, 0, x+(i+240));
+ vec_xst(fp32vec4, 0, x+(i+244));
+ vec_xst(fp32vec4, 0, x+(i+248));
+ vec_xst(fp32vec4, 0, x+(i+252));
+ }
+ for (; i <= n-32; i += 32)
+ {
+ vec_xst(fp32vec4, 0, x+(i ));
+ vec_xst(fp32vec4, 0, x+(i+4 ));
+ vec_xst(fp32vec4, 0, x+(i+8 ));
+ vec_xst(fp32vec4, 0, x+(i+12 ));
+ vec_xst(fp32vec4, 0, x+(i+16 ));
+ vec_xst(fp32vec4, 0, x+(i+20 ));
+ vec_xst(fp32vec4, 0, x+(i+24 ));
+ vec_xst(fp32vec4, 0, x+(i+28 ));
+ }
+ for (; i <= n-4; i += 4)
+ vec_xst(fp32vec4, 0, x+(i ));
+ for (; i < n; i++)
+ x[i] = c;
+}
+
+
+static void THFloatVector_add_VSX(float *y, const float *x, const float c, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+ vector float c_fp32vec4;
+ vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4;
+ vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4;
+ vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4;
+ vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4;
+
+ float val[4] = {c, c, c, c};
+ c_fp32vec4 = vec_xl(0, val);
+
+ for (i = 0; i <= n-48; i += 48)
+ {
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ x1_fp32vec4 = vec_xl(0, x+(i+4 ));
+ x2_fp32vec4 = vec_xl(0, x+(i+8 ));
+ x3_fp32vec4 = vec_xl(0, x+(i+12));
+ x4_fp32vec4 = vec_xl(0, x+(i+16));
+ x5_fp32vec4 = vec_xl(0, x+(i+20));
+ x6_fp32vec4 = vec_xl(0, x+(i+24));
+ x7_fp32vec4 = vec_xl(0, x+(i+28));
+ x8_fp32vec4 = vec_xl(0, x+(i+32));
+ x9_fp32vec4 = vec_xl(0, x+(i+36));
+ x10_fp32vec4 = vec_xl(0, x+(i+40));
+ x11_fp32vec4 = vec_xl(0, x+(i+44));
+
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y1_fp32vec4 = vec_xl(0, y+(i+4 ));
+ y2_fp32vec4 = vec_xl(0, y+(i+8 ));
+ y3_fp32vec4 = vec_xl(0, y+(i+12));
+ y4_fp32vec4 = vec_xl(0, y+(i+16));
+ y5_fp32vec4 = vec_xl(0, y+(i+20));
+ y6_fp32vec4 = vec_xl(0, y+(i+24));
+ y7_fp32vec4 = vec_xl(0, y+(i+28));
+ y8_fp32vec4 = vec_xl(0, y+(i+32));
+ y9_fp32vec4 = vec_xl(0, y+(i+36));
+ y10_fp32vec4 = vec_xl(0, y+(i+40));
+ y11_fp32vec4 = vec_xl(0, y+(i+44));
+
+ y0_fp32vec4 = vec_madd(c_fp32vec4, x0_fp32vec4, y0_fp32vec4 );
+ y1_fp32vec4 = vec_madd(c_fp32vec4, x1_fp32vec4, y1_fp32vec4 );
+ y2_fp32vec4 = vec_madd(c_fp32vec4, x2_fp32vec4, y2_fp32vec4 );
+ y3_fp32vec4 = vec_madd(c_fp32vec4, x3_fp32vec4, y3_fp32vec4 );
+ y4_fp32vec4 = vec_madd(c_fp32vec4, x4_fp32vec4, y4_fp32vec4 );
+ y5_fp32vec4 = vec_madd(c_fp32vec4, x5_fp32vec4, y5_fp32vec4 );
+ y6_fp32vec4 = vec_madd(c_fp32vec4, x6_fp32vec4, y6_fp32vec4 );
+ y7_fp32vec4 = vec_madd(c_fp32vec4, x7_fp32vec4, y7_fp32vec4 );
+ y8_fp32vec4 = vec_madd(c_fp32vec4, x8_fp32vec4, y8_fp32vec4 );
+ y9_fp32vec4 = vec_madd(c_fp32vec4, x9_fp32vec4, y9_fp32vec4 );
+ y10_fp32vec4 = vec_madd(c_fp32vec4, x10_fp32vec4, y10_fp32vec4);
+ y11_fp32vec4 = vec_madd(c_fp32vec4, x11_fp32vec4, y11_fp32vec4);
+
+ vec_xst(y0_fp32vec4, 0, y+(i ));
+ vec_xst(y1_fp32vec4, 0, y+(i+4 ));
+ vec_xst(y2_fp32vec4, 0, y+(i+8 ));
+ vec_xst(y3_fp32vec4, 0, y+(i+12));
+ vec_xst(y4_fp32vec4, 0, y+(i+16));
+ vec_xst(y5_fp32vec4, 0, y+(i+20));
+ vec_xst(y6_fp32vec4, 0, y+(i+24));
+ vec_xst(y7_fp32vec4, 0, y+(i+28));
+ vec_xst(y8_fp32vec4, 0, y+(i+32));
+ vec_xst(y9_fp32vec4, 0, y+(i+36));
+ vec_xst(y10_fp32vec4, 0, y+(i+40));
+ vec_xst(y11_fp32vec4, 0, y+(i+44));
+ }
+ for (; i <= n-16; i += 16)
+ {
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ x1_fp32vec4 = vec_xl(0, x+(i+4 ));
+ x2_fp32vec4 = vec_xl(0, x+(i+8 ));
+ x3_fp32vec4 = vec_xl(0, x+(i+12));
+
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y1_fp32vec4 = vec_xl(0, y+(i+4 ));
+ y2_fp32vec4 = vec_xl(0, y+(i+8 ));
+ y3_fp32vec4 = vec_xl(0, y+(i+12));
+
+ y0_fp32vec4 = vec_madd(c_fp32vec4, x0_fp32vec4, y0_fp32vec4 );
+ y1_fp32vec4 = vec_madd(c_fp32vec4, x1_fp32vec4, y1_fp32vec4 );
+ y2_fp32vec4 = vec_madd(c_fp32vec4, x2_fp32vec4, y2_fp32vec4 );
+ y3_fp32vec4 = vec_madd(c_fp32vec4, x3_fp32vec4, y3_fp32vec4 );
+
+ vec_xst(y0_fp32vec4, 0, y+(i ));
+ vec_xst(y1_fp32vec4, 0, y+(i+4 ));
+ vec_xst(y2_fp32vec4, 0, y+(i+8 ));
+ vec_xst(y3_fp32vec4, 0, y+(i+12));
+ }
+ for (; i <= n-4; i += 4)
+ {
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y0_fp32vec4 = vec_madd(c_fp32vec4, x0_fp32vec4, y0_fp32vec4 );
+ vec_xst(y0_fp32vec4, 0, y+(i ));
+ }
+ for (; i < n; i++)
+ y[i] = (c * x[i]) + y[i];
+}
+
+
+static void THFloatVector_diff_VSX(float *z, const float *x, const float *y, const ptrdiff_t n) {
+ ptrdiff_t i;
+
+ vector float xz0_fp32vec4, xz1_fp32vec4, xz2_fp32vec4, xz3_fp32vec4, xz4_fp32vec4, xz5_fp32vec4, xz6_fp32vec4, xz7_fp32vec4;
+ vector float xz8_fp32vec4, xz9_fp32vec4, xz10_fp32vec4, xz11_fp32vec4;
+ vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4;
+ vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4;
+
+ for (i = 0; i <= n-48; i += 48)
+ {
+ xz0_fp32vec4 = vec_xl(0, x+(i ));
+ xz1_fp32vec4 = vec_xl(0, x+(i+4 ));
+ xz2_fp32vec4 = vec_xl(0, x+(i+8 ));
+ xz3_fp32vec4 = vec_xl(0, x+(i+12));
+ xz4_fp32vec4 = vec_xl(0, x+(i+16));
+ xz5_fp32vec4 = vec_xl(0, x+(i+20));
+ xz6_fp32vec4 = vec_xl(0, x+(i+24));
+ xz7_fp32vec4 = vec_xl(0, x+(i+28));
+ xz8_fp32vec4 = vec_xl(0, x+(i+32));
+ xz9_fp32vec4 = vec_xl(0, x+(i+36));
+ xz10_fp32vec4 = vec_xl(0, x+(i+40));
+ xz11_fp32vec4 = vec_xl(0, x+(i+44));
+
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y1_fp32vec4 = vec_xl(0, y+(i+4 ));
+ y2_fp32vec4 = vec_xl(0, y+(i+8 ));
+ y3_fp32vec4 = vec_xl(0, y+(i+12));
+ y4_fp32vec4 = vec_xl(0, y+(i+16));
+ y5_fp32vec4 = vec_xl(0, y+(i+20));
+ y6_fp32vec4 = vec_xl(0, y+(i+24));
+ y7_fp32vec4 = vec_xl(0, y+(i+28));
+ y8_fp32vec4 = vec_xl(0, y+(i+32));
+ y9_fp32vec4 = vec_xl(0, y+(i+36));
+ y10_fp32vec4 = vec_xl(0, y+(i+40));
+ y11_fp32vec4 = vec_xl(0, y+(i+44));
+
+ xz0_fp32vec4 = vec_sub(xz0_fp32vec4, y0_fp32vec4 );
+ xz1_fp32vec4 = vec_sub(xz1_fp32vec4, y1_fp32vec4 );
+ xz2_fp32vec4 = vec_sub(xz2_fp32vec4, y2_fp32vec4 );
+ xz3_fp32vec4 = vec_sub(xz3_fp32vec4, y3_fp32vec4 );
+ xz4_fp32vec4 = vec_sub(xz4_fp32vec4, y4_fp32vec4 );
+ xz5_fp32vec4 = vec_sub(xz5_fp32vec4, y5_fp32vec4 );
+ xz6_fp32vec4 = vec_sub(xz6_fp32vec4, y6_fp32vec4 );
+ xz7_fp32vec4 = vec_sub(xz7_fp32vec4, y7_fp32vec4 );
+ xz8_fp32vec4 = vec_sub(xz8_fp32vec4, y8_fp32vec4 );
+ xz9_fp32vec4 = vec_sub(xz9_fp32vec4, y9_fp32vec4 );
+ xz10_fp32vec4 = vec_sub(xz10_fp32vec4, y10_fp32vec4);
+ xz11_fp32vec4 = vec_sub(xz11_fp32vec4, y11_fp32vec4);
+
+ vec_xst(xz0_fp32vec4, 0, z+(i ));
+ vec_xst(xz1_fp32vec4, 0, z+(i+4 ));
+ vec_xst(xz2_fp32vec4, 0, z+(i+8 ));
+ vec_xst(xz3_fp32vec4, 0, z+(i+12));
+ vec_xst(xz4_fp32vec4, 0, z+(i+16));
+ vec_xst(xz5_fp32vec4, 0, z+(i+20));
+ vec_xst(xz6_fp32vec4, 0, z+(i+24));
+ vec_xst(xz7_fp32vec4, 0, z+(i+28));
+ vec_xst(xz8_fp32vec4, 0, z+(i+32));
+ vec_xst(xz9_fp32vec4, 0, z+(i+36));
+ vec_xst(xz10_fp32vec4, 0, z+(i+40));
+ vec_xst(xz11_fp32vec4, 0, z+(i+44));
+ }
+ for (; i <= n-16; i += 16)
+ {
+ xz0_fp32vec4 = vec_xl(0, x+(i ));
+ xz1_fp32vec4 = vec_xl(0, x+(i+4 ));
+ xz2_fp32vec4 = vec_xl(0, x+(i+8 ));
+ xz3_fp32vec4 = vec_xl(0, x+(i+12));
+
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y1_fp32vec4 = vec_xl(0, y+(i+4 ));
+ y2_fp32vec4 = vec_xl(0, y+(i+8 ));
+ y3_fp32vec4 = vec_xl(0, y+(i+12));
+
+ xz0_fp32vec4 = vec_sub(xz0_fp32vec4, y0_fp32vec4 );
+ xz1_fp32vec4 = vec_sub(xz1_fp32vec4, y1_fp32vec4 );
+ xz2_fp32vec4 = vec_sub(xz2_fp32vec4, y2_fp32vec4 );
+ xz3_fp32vec4 = vec_sub(xz3_fp32vec4, y3_fp32vec4 );
+
+ vec_xst(xz0_fp32vec4, 0, z+(i ));
+ vec_xst(xz1_fp32vec4, 0, z+(i+4 ));
+ vec_xst(xz2_fp32vec4, 0, z+(i+8 ));
+ vec_xst(xz3_fp32vec4, 0, z+(i+12));
+ }
+ for (; i <= n-4; i += 4)
+ {
+ xz0_fp32vec4 = vec_xl(0, x+(i ));
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ xz0_fp32vec4 = vec_sub(xz0_fp32vec4, y0_fp32vec4 );
+ vec_xst(xz0_fp32vec4, 0, z+(i ));
+ }
+ for (; i < n; i++)
+ z[i] = x[i] - y[i];
+}
+
+
+static void THFloatVector_scale_VSX(float *y, const float c, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+
+ vector float c_fp32vec4;
+ float val[4] = {c, c, c, c};
+ c_fp32vec4 = vec_xl(0, val);
+
+ vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4;
+ vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4, y12_fp32vec4, y13_fp32vec4, y14_fp32vec4, y15_fp32vec4;
+
+ for (i = 0; i <= n-64; i += 64)
+ {
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y1_fp32vec4 = vec_xl(0, y+(i+4 ));
+ y2_fp32vec4 = vec_xl(0, y+(i+8 ));
+ y3_fp32vec4 = vec_xl(0, y+(i+12));
+ y4_fp32vec4 = vec_xl(0, y+(i+16));
+ y5_fp32vec4 = vec_xl(0, y+(i+20));
+ y6_fp32vec4 = vec_xl(0, y+(i+24));
+ y7_fp32vec4 = vec_xl(0, y+(i+28));
+ y8_fp32vec4 = vec_xl(0, y+(i+32));
+ y9_fp32vec4 = vec_xl(0, y+(i+36));
+ y10_fp32vec4 = vec_xl(0, y+(i+40));
+ y11_fp32vec4 = vec_xl(0, y+(i+44));
+ y12_fp32vec4 = vec_xl(0, y+(i+48));
+ y13_fp32vec4 = vec_xl(0, y+(i+52));
+ y14_fp32vec4 = vec_xl(0, y+(i+56));
+ y15_fp32vec4 = vec_xl(0, y+(i+60));
+
+ y0_fp32vec4 = vec_mul(y0_fp32vec4, c_fp32vec4);
+ y1_fp32vec4 = vec_mul(y1_fp32vec4, c_fp32vec4);
+ y2_fp32vec4 = vec_mul(y2_fp32vec4, c_fp32vec4);
+ y3_fp32vec4 = vec_mul(y3_fp32vec4, c_fp32vec4);
+ y4_fp32vec4 = vec_mul(y4_fp32vec4, c_fp32vec4);
+ y5_fp32vec4 = vec_mul(y5_fp32vec4, c_fp32vec4);
+ y6_fp32vec4 = vec_mul(y6_fp32vec4, c_fp32vec4);
+ y7_fp32vec4 = vec_mul(y7_fp32vec4, c_fp32vec4);
+ y8_fp32vec4 = vec_mul(y8_fp32vec4, c_fp32vec4);
+ y9_fp32vec4 = vec_mul(y9_fp32vec4, c_fp32vec4);
+ y10_fp32vec4 = vec_mul(y10_fp32vec4, c_fp32vec4);
+ y11_fp32vec4 = vec_mul(y11_fp32vec4, c_fp32vec4);
+ y12_fp32vec4 = vec_mul(y12_fp32vec4, c_fp32vec4);
+ y13_fp32vec4 = vec_mul(y13_fp32vec4, c_fp32vec4);
+ y14_fp32vec4 = vec_mul(y14_fp32vec4, c_fp32vec4);
+ y15_fp32vec4 = vec_mul(y15_fp32vec4, c_fp32vec4);
+
+ vec_xst(y0_fp32vec4, 0, y+(i ));
+ vec_xst(y1_fp32vec4, 0, y+(i+4 ));
+ vec_xst(y2_fp32vec4, 0, y+(i+8 ));
+ vec_xst(y3_fp32vec4, 0, y+(i+12));
+ vec_xst(y4_fp32vec4, 0, y+(i+16));
+ vec_xst(y5_fp32vec4, 0, y+(i+20));
+ vec_xst(y6_fp32vec4, 0, y+(i+24));
+ vec_xst(y7_fp32vec4, 0, y+(i+28));
+ vec_xst(y8_fp32vec4, 0, y+(i+32));
+ vec_xst(y9_fp32vec4, 0, y+(i+36));
+ vec_xst(y10_fp32vec4, 0, y+(i+40));
+ vec_xst(y11_fp32vec4, 0, y+(i+44));
+ vec_xst(y12_fp32vec4, 0, y+(i+48));
+ vec_xst(y13_fp32vec4, 0, y+(i+52));
+ vec_xst(y14_fp32vec4, 0, y+(i+56));
+ vec_xst(y15_fp32vec4, 0, y+(i+60));
+ }
+ for (; i <= n-16; i += 16)
+ {
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y1_fp32vec4 = vec_xl(0, y+(i+4 ));
+ y2_fp32vec4 = vec_xl(0, y+(i+8 ));
+ y3_fp32vec4 = vec_xl(0, y+(i+12));
+
+ y0_fp32vec4 = vec_mul(y0_fp32vec4, c_fp32vec4);
+ y1_fp32vec4 = vec_mul(y1_fp32vec4, c_fp32vec4);
+ y2_fp32vec4 = vec_mul(y2_fp32vec4, c_fp32vec4);
+ y3_fp32vec4 = vec_mul(y3_fp32vec4, c_fp32vec4);
+
+ vec_xst(y0_fp32vec4, 0, y+(i ));
+ vec_xst(y1_fp32vec4, 0, y+(i+4 ));
+ vec_xst(y2_fp32vec4, 0, y+(i+8 ));
+ vec_xst(y3_fp32vec4, 0, y+(i+12));
+ }
+ for (; i <= n-4; i += 4)
+ {
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y0_fp32vec4 = vec_mul(y0_fp32vec4, c_fp32vec4);
+ vec_xst(y0_fp32vec4, 0, y+(i ));
+ }
+ for (; i < n; i++)
+ y[i] = y[i] * c;
+}
+
+
+
+static void THFloatVector_mul_VSX(float *y, const float *x, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+
+ vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4;
+ vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4;
+ vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4;
+ vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4;
+
+
+ for (i = 0; i <= n-48; i += 48)
+ {
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y1_fp32vec4 = vec_xl(0, y+(i+4 ));
+ y2_fp32vec4 = vec_xl(0, y+(i+8 ));
+ y3_fp32vec4 = vec_xl(0, y+(i+12));
+ y4_fp32vec4 = vec_xl(0, y+(i+16));
+ y5_fp32vec4 = vec_xl(0, y+(i+20));
+ y6_fp32vec4 = vec_xl(0, y+(i+24));
+ y7_fp32vec4 = vec_xl(0, y+(i+28));
+ y8_fp32vec4 = vec_xl(0, y+(i+32));
+ y9_fp32vec4 = vec_xl(0, y+(i+36));
+ y10_fp32vec4 = vec_xl(0, y+(i+40));
+ y11_fp32vec4 = vec_xl(0, y+(i+44));
+
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ x1_fp32vec4 = vec_xl(0, x+(i+4 ));
+ x2_fp32vec4 = vec_xl(0, x+(i+8 ));
+ x3_fp32vec4 = vec_xl(0, x+(i+12));
+ x4_fp32vec4 = vec_xl(0, x+(i+16));
+ x5_fp32vec4 = vec_xl(0, x+(i+20));
+ x6_fp32vec4 = vec_xl(0, x+(i+24));
+ x7_fp32vec4 = vec_xl(0, x+(i+28));
+ x8_fp32vec4 = vec_xl(0, x+(i+32));
+ x9_fp32vec4 = vec_xl(0, x+(i+36));
+ x10_fp32vec4 = vec_xl(0, x+(i+40));
+ x11_fp32vec4 = vec_xl(0, x+(i+44));
+
+ y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4);
+ y1_fp32vec4 = vec_mul(y1_fp32vec4, x1_fp32vec4);
+ y2_fp32vec4 = vec_mul(y2_fp32vec4, x2_fp32vec4);
+ y3_fp32vec4 = vec_mul(y3_fp32vec4, x3_fp32vec4);
+ y4_fp32vec4 = vec_mul(y4_fp32vec4, x4_fp32vec4);
+ y5_fp32vec4 = vec_mul(y5_fp32vec4, x5_fp32vec4);
+ y6_fp32vec4 = vec_mul(y6_fp32vec4, x6_fp32vec4);
+ y7_fp32vec4 = vec_mul(y7_fp32vec4, x7_fp32vec4);
+ y8_fp32vec4 = vec_mul(y8_fp32vec4, x8_fp32vec4);
+ y9_fp32vec4 = vec_mul(y9_fp32vec4, x9_fp32vec4);
+ y10_fp32vec4 = vec_mul(y10_fp32vec4, x10_fp32vec4);
+ y11_fp32vec4 = vec_mul(y11_fp32vec4, x11_fp32vec4);
+
+ vec_xst(y0_fp32vec4, 0, y+(i ));
+ vec_xst(y1_fp32vec4, 0, y+(i+4 ));
+ vec_xst(y2_fp32vec4, 0, y+(i+8 ));
+ vec_xst(y3_fp32vec4, 0, y+(i+12));
+ vec_xst(y4_fp32vec4, 0, y+(i+16));
+ vec_xst(y5_fp32vec4, 0, y+(i+20));
+ vec_xst(y6_fp32vec4, 0, y+(i+24));
+ vec_xst(y7_fp32vec4, 0, y+(i+28));
+ vec_xst(y8_fp32vec4, 0, y+(i+32));
+ vec_xst(y9_fp32vec4, 0, y+(i+36));
+ vec_xst(y10_fp32vec4, 0, y+(i+40));
+ vec_xst(y11_fp32vec4, 0, y+(i+44));
+ }
+ for (; i <= n-16; i += 16)
+ {
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y1_fp32vec4 = vec_xl(0, y+(i+4 ));
+ y2_fp32vec4 = vec_xl(0, y+(i+8 ));
+ y3_fp32vec4 = vec_xl(0, y+(i+12));
+
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ x1_fp32vec4 = vec_xl(0, x+(i+4 ));
+ x2_fp32vec4 = vec_xl(0, x+(i+8 ));
+ x3_fp32vec4 = vec_xl(0, x+(i+12));
+
+ y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4);
+ y1_fp32vec4 = vec_mul(y1_fp32vec4, x1_fp32vec4);
+ y2_fp32vec4 = vec_mul(y2_fp32vec4, x2_fp32vec4);
+ y3_fp32vec4 = vec_mul(y3_fp32vec4, x3_fp32vec4);
+
+ vec_xst(y0_fp32vec4, 0, y+(i ));
+ vec_xst(y1_fp32vec4, 0, y+(i+4 ));
+ vec_xst(y2_fp32vec4, 0, y+(i+8 ));
+ vec_xst(y3_fp32vec4, 0, y+(i+12));
+ }
+ for (; i <= n-4; i += 4)
+ {
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4);
+ vec_xst(y0_fp32vec4, 0, y+(i ));
+ }
+ for (; i < n; i++)
+ y[i] = y[i] * x[i];
+}
+
+
+//------------------------------------------------
+//
+// Testing for correctness and performance
+//
+// If you want to run these tests, compile this
+// file with -DRUN_VSX_TESTS on a Power machine,
+// and then run the executable that is generated.
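+// Each test times a plain scalar reference against its VSX kernel over a
+// large buffer, then checks correctness at several shifted offsets so the
+// unaligned and tail-loop code paths are exercised.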
+//
+//------------------------------------------------
+//
+// Example passing run (from a Power8 machine):
+//
+// $ gcc VSX.c -O2 -D RUN_VSX_TESTS -o vsxtest
+// $ ./vsxtest
+//
+// standardDouble_fill() test took 0.34604 seconds
+// THDoubleVector_fill_VSX() test took 0.15663 seconds
+// All assertions PASSED for THDoubleVector_fill_VSX() test.
+//
+// standardFloat_fill() test took 0.32901 seconds
+// THFloatVector_fill_VSX() test took 0.07830 seconds
+// All assertions PASSED for THFloatVector_fill_VSX() test.
+//
+// standardDouble_add() test took 0.51602 seconds
+// THDoubleVector_add_VSX() test took 0.31384 seconds
+// All assertions PASSED for THDoubleVector_add_VSX() test.
+//
+// standardFloat_add() test took 0.39845 seconds
+// THFloatVector_add_VSX() test took 0.14544 seconds
+// All assertions PASSED for THFloatVector_add_VSX() test.
+//
+// standardDouble_diff() test took 0.48219 seconds
+// THDoubleVector_diff_VSX() test took 0.31708 seconds
+// All assertions PASSED for THDoubleVector_diff_VSX() test.
+//
+// standardFloat_diff() test took 0.60340 seconds
+// THFloatVector_diff_VSX() test took 0.17083 seconds
+// All assertions PASSED for THFloatVector_diff_VSX() test.
+//
+// standardDouble_scale() test took 0.33157 seconds
+// THDoubleVector_scale_VSX() test took 0.19075 seconds
+// All assertions PASSED for THDoubleVector_scale_VSX() test.
+//
+// standardFloat_scale() test took 0.33008 seconds
+// THFloatVector_scale_VSX() test took 0.09741 seconds
+// All assertions PASSED for THFloatVector_scale_VSX() test.
+//
+// standardDouble_mul() test took 0.50986 seconds
+// THDoubleVector_mul_VSX() test took 0.30939 seconds
+// All assertions PASSED for THDoubleVector_mul_VSX() test.
+//
+// standardFloat_mul() test took 0.40241 seconds
+// THFloatVector_mul_VSX() test took 0.14346 seconds
+// All assertions PASSED for THFloatVector_mul_VSX() test.
+//
+// Finished running all tests. All tests PASSED.
+//
+//------------------------------------------------
+#ifdef RUN_VSX_TESTS
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+#include <math.h>
+
+#define VSX_PERF_NUM_TEST_ELEMENTS 100000000
+#define VSX_FUNC_NUM_TEST_ELEMENTS 2507
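+// 2507 is odd and not a multiple of the vector width, so the correctness
+// runs exercise every tail loop, including the scalar remainder.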
+
+void test_THDoubleVector_fill_VSX();
+
+static void standardDouble_fill(double *x, const double c, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ x[i] = c;
+}
+
+static void standardFloat_fill(float *x, const float c, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ x[i] = c;
+}
+
+static void standardDouble_add(double *y, const double *x, const double c, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ y[i] += c * x[i];
+}
+
+static void standardFloat_add(float *y, const float *x, const float c, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ y[i] += c * x[i];
+}
+
+static void standardDouble_diff(double *z, const double *x, const double *y, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ z[i] = x[i] - y[i];
+}
+
+static void standardFloat_diff(float *z, const float *x, const float *y, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ z[i] = x[i] - y[i];
+}
+
+static void standardDouble_scale(double *y, const double c, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ y[i] *= c;
+}
+
+static void standardFloat_scale(float *y, const float c, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ y[i] *= c;
+}
+
+static void standardDouble_mul(double *y, const double *x, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ y[i] *= x[i];
+}
+
+static void standardFloat_mul(float *y, const float *x, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ y[i] *= x[i];
+}
+
+double randDouble()
+{
+    // The +1 keeps the divisor nonzero; rand()%100 alone could divide by zero.
+    return (double)(rand()%100)/(double)(rand()%100 + 1) * (rand()%2 ? -1.0 : 1.0);
+}
+
+int near(double a, double b)
+{
+ int aClass = fpclassify(a);
+ int bClass = fpclassify(b);
+
+ if(aClass != bClass) // i.e. is it NAN, infinite, or finite...?
+ return 0;
+
+ if(aClass == FP_INFINITE) // if it is infinite, the sign must be the same, i.e. positive infinity is not near negative infinity
+ return (signbit(a) == signbit(b));
+ else if(aClass == FP_NORMAL) // if it is a normal number then check the magnitude of the difference between the numbers
+ return fabs(a - b) < 0.001;
+    else // if both numbers are of the same class and that class is any other (e.g. NAN), they are considered near each other.
+ return 1;
+}
+
+void test_THDoubleVector_fill_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ double *x_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *x_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+
+ double yVal0 = 17.2;
+ double yVal1 = 8.2;
+ double yVal2 = 5.1;
+ double yVal3 = -0.9;
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardDouble_fill(x_standard, yVal0, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardDouble_fill(x_standard, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardDouble_fill(x_standard, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardDouble_fill(x_standard, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardDouble_fill() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THDoubleVector_fill_VSX(x_optimized, yVal0, VSX_PERF_NUM_TEST_ELEMENTS );
+ THDoubleVector_fill_VSX(x_optimized, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THDoubleVector_fill_VSX(x_optimized, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_fill_VSX(x_optimized, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THDoubleVector_fill_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ yVal0 += 1.0;
+ yVal1 += 1.0;
+ yVal2 += 1.0;
+ yVal3 -= 1.0;
+
+ standardDouble_fill( x_standard, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS);
+ THDoubleVector_fill_VSX(x_optimized, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS);
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ assert(x_optimized[i] == yVal0);
+
+ standardDouble_fill( x_standard+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_fill_VSX(x_optimized+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardDouble_fill( x_standard+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THDoubleVector_fill_VSX(x_optimized+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardDouble_fill( x_standard+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THDoubleVector_fill_VSX(x_optimized+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardDouble_fill( x_standard+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THDoubleVector_fill_VSX(x_optimized+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardDouble_fill( x_standard+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THDoubleVector_fill_VSX(x_optimized+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ assert(x_optimized[i] == x_standard[i]);
+ printf("All assertions PASSED for THDoubleVector_fill_VSX() test.\n\n");
+
+
+ free(x_standard);
+ free(x_optimized);
+}
+
+
+void test_THFloatVector_fill_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ float *x_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *x_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+
+ float yVal0 = 17.2;
+ float yVal1 = 8.2;
+ float yVal2 = 5.1;
+ float yVal3 = -0.9;
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardFloat_fill(x_standard, yVal0, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardFloat_fill(x_standard, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardFloat_fill(x_standard, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardFloat_fill(x_standard, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardFloat_fill() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THFloatVector_fill_VSX(x_optimized, yVal0, VSX_PERF_NUM_TEST_ELEMENTS );
+ THFloatVector_fill_VSX(x_optimized, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THFloatVector_fill_VSX(x_optimized, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THFloatVector_fill_VSX(x_optimized, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THFloatVector_fill_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ yVal0 += 1.0;
+ yVal1 += 1.0;
+ yVal2 += 1.0;
+ yVal3 -= 1.0;
+
+ standardFloat_fill( x_standard, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS);
+ THFloatVector_fill_VSX(x_optimized, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS);
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ assert(x_optimized[i] == yVal0);
+
+ standardFloat_fill( x_standard+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THFloatVector_fill_VSX(x_optimized+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardFloat_fill( x_standard+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THFloatVector_fill_VSX(x_optimized+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardFloat_fill( x_standard+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THFloatVector_fill_VSX(x_optimized+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardFloat_fill( x_standard+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THFloatVector_fill_VSX(x_optimized+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardFloat_fill( x_standard+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THFloatVector_fill_VSX(x_optimized+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ assert(x_optimized[i] == x_standard[i]);
+ printf("All assertions PASSED for THFloatVector_fill_VSX() test.\n\n");
+
+
+ free(x_standard);
+ free(x_optimized);
+}
+
+void test_THDoubleVector_add_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double c = (double)randDouble();
+
+ // Initialize randomly
+ for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
+ {
+ x[i] = randDouble();
+ double yVal = randDouble();
+ y_standard[i] = yVal;
+ y_optimized[i] = yVal;
+ }
+
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardDouble_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardDouble_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardDouble_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardDouble_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardDouble_add() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THDoubleVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ THDoubleVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THDoubleVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THDoubleVector_add_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ standardDouble_add( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_add_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardDouble_add( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THDoubleVector_add_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardDouble_add( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THDoubleVector_add_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardDouble_add( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THDoubleVector_add_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardDouble_add( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THDoubleVector_add_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ {
+ if(!near(y_optimized[i], y_standard[i]))
+ printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
+ assert(near(y_optimized[i], y_standard[i]));
+ }
+ printf("All assertions PASSED for THDoubleVector_add_VSX() test.\n\n");
+
+
+ free(y_standard);
+ free(y_optimized);
+ free(x);
+}
+
+
+void test_THFloatVector_add_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float c = (float)randDouble();
+
+ // Initialize randomly
+ for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
+ {
+ x[i] = (float)randDouble();
+ float yVal = (float)randDouble();
+ y_standard[i] = yVal;
+ y_optimized[i] = yVal;
+ }
+
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardFloat_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardFloat_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardFloat_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardFloat_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardFloat_add() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THFloatVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ THFloatVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THFloatVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THFloatVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THFloatVector_add_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ standardFloat_add( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THFloatVector_add_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardFloat_add( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THFloatVector_add_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardFloat_add( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THFloatVector_add_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardFloat_add( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THFloatVector_add_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardFloat_add( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THFloatVector_add_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ {
+ if(!near(y_optimized[i], y_standard[i]))
+ printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
+ assert(near(y_optimized[i], y_standard[i]));
+ }
+ printf("All assertions PASSED for THFloatVector_add_VSX() test.\n\n");
+
+
+ free(y_standard);
+ free(y_optimized);
+ free(x);
+}
+
+void test_THDoubleVector_diff_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ double *z_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *z_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *y = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+
+ // Initialize randomly
+ for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
+ {
+ x[i] = randDouble();
+ y[i] = randDouble();
+ double zVal = randDouble();
+ z_standard[i] = zVal;
+ z_optimized[i] = zVal;
+ }
+
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardDouble_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardDouble_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardDouble_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardDouble_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardDouble_diff() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THDoubleVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS );
+ THDoubleVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THDoubleVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THDoubleVector_diff_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ standardDouble_diff( z_standard+1, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_diff_VSX(z_optimized+1, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardDouble_diff( z_standard+2, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THDoubleVector_diff_VSX(z_optimized+2, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardDouble_diff( z_standard+3, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THDoubleVector_diff_VSX(z_optimized+3, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardDouble_diff( z_standard+517, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THDoubleVector_diff_VSX(z_optimized+517, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardDouble_diff( z_standard+517+r, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THDoubleVector_diff_VSX(z_optimized+517+r, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ {
+ if(!near(z_optimized[i], z_standard[i]))
+ printf("%d %f %f\n", i, z_optimized[i], z_standard[i]);
+ assert(near(z_optimized[i], z_standard[i]));
+ }
+ printf("All assertions PASSED for THDoubleVector_diff_VSX() test.\n\n");
+
+
+ free(z_standard);
+ free(z_optimized);
+ free(y);
+ free(x);
+}
+
+
+void test_THFloatVector_diff_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ float *z_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *z_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *y = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+
+ // Initialize randomly
+ for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
+ {
+ x[i] = (float)randDouble();
+ y[i] = (float)randDouble();
+ float zVal = (float)randDouble();
+ z_standard[i] = zVal;
+ z_optimized[i] = zVal;
+ }
+
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardFloat_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardFloat_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardFloat_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardFloat_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardFloat_diff() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THFloatVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS );
+ THFloatVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THFloatVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THFloatVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THFloatVector_diff_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ standardFloat_diff( z_standard+1, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THFloatVector_diff_VSX(z_optimized+1, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardFloat_diff( z_standard+2, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THFloatVector_diff_VSX(z_optimized+2, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardFloat_diff( z_standard+3, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THFloatVector_diff_VSX(z_optimized+3, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardFloat_diff( z_standard+517, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THFloatVector_diff_VSX(z_optimized+517, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardFloat_diff( z_standard+517+r, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THFloatVector_diff_VSX(z_optimized+517+r, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ {
+ if(!near(z_optimized[i], z_standard[i]))
+ printf("%d %f %f\n", i, z_optimized[i], z_standard[i]);
+ assert(near(z_optimized[i], z_standard[i]));
+ }
+ printf("All assertions PASSED for THFloatVector_diff_VSX() test.\n\n");
+
+
+ free(z_standard);
+ free(z_optimized);
+ free(y);
+ free(x);
+}
+
+
+void test_THDoubleVector_scale_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double c = randDouble();
+
+ // Initialize randomly
+ for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
+ {
+ double yVal = randDouble();
+ y_standard[i] = yVal;
+ y_optimized[i] = yVal;
+ }
+
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardDouble_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardDouble_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardDouble_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardDouble_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardDouble_scale() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THDoubleVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ THDoubleVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THDoubleVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THDoubleVector_scale_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ standardDouble_scale( y_standard+1, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_scale_VSX(y_optimized+1, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardDouble_scale( y_standard+2, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THDoubleVector_scale_VSX(y_optimized+2, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardDouble_scale( y_standard+3, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THDoubleVector_scale_VSX(y_optimized+3, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardDouble_scale( y_standard+517, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THDoubleVector_scale_VSX(y_optimized+517, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardDouble_scale( y_standard+517+r, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THDoubleVector_scale_VSX(y_optimized+517+r, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ {
+ if(!near(y_optimized[i], y_standard[i]))
+ printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
+ assert(near(y_optimized[i], y_standard[i]));
+ }
+ printf("All assertions PASSED for THDoubleVector_scale_VSX() test.\n\n");
+
+
+ free(y_standard);
+ free(y_optimized);
+}
+
+
+void test_THFloatVector_scale_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float c = (float)randDouble();
+
+ // Initialize randomly
+ for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
+ {
+ float yVal = (float)randDouble();
+ y_standard[i] = yVal;
+ y_optimized[i] = yVal;
+ }
+
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardFloat_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardFloat_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardFloat_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardFloat_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardFloat_scale() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THFloatVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ THFloatVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THFloatVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THFloatVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THFloatVector_scale_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ standardFloat_scale( y_standard+1, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THFloatVector_scale_VSX(y_optimized+1, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardFloat_scale( y_standard+2, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THFloatVector_scale_VSX(y_optimized+2, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardFloat_scale( y_standard+3, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THFloatVector_scale_VSX(y_optimized+3, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardFloat_scale( y_standard+517, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THFloatVector_scale_VSX(y_optimized+517, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardFloat_scale( y_standard+517+r, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THFloatVector_scale_VSX(y_optimized+517+r, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ {
+ if(!near(y_optimized[i], y_standard[i]))
+ printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
+ assert(near(y_optimized[i], y_standard[i]));
+ }
+ printf("All assertions PASSED for THFloatVector_scale_VSX() test.\n\n");
+
+
+ free(y_standard);
+ free(y_optimized);
+}
+
+void test_THDoubleVector_mul_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+
+ // Initialize randomly
+ for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
+ {
+ x[i] = randDouble();
+ double yVal = randDouble();
+ y_standard[i] = yVal;
+ y_optimized[i] = yVal;
+ }
+
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardDouble_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardDouble_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardDouble_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardDouble_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardDouble_mul() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THDoubleVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS );
+ THDoubleVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THDoubleVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THDoubleVector_mul_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ standardDouble_mul( y_standard+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_mul_VSX(y_optimized+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardDouble_mul( y_standard+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THDoubleVector_mul_VSX(y_optimized+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardDouble_mul( y_standard+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THDoubleVector_mul_VSX(y_optimized+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardDouble_mul( y_standard+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THDoubleVector_mul_VSX(y_optimized+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardDouble_mul( y_standard+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THDoubleVector_mul_VSX(y_optimized+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ {
+ if(!near(y_optimized[i], y_standard[i]))
+ printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
+ assert(near(y_optimized[i], y_standard[i]));
+ }
+ printf("All assertions PASSED for THDoubleVector_mul_VSX() test.\n\n");
+
+
+ free(y_standard);
+ free(y_optimized);
+ free(x);
+}
+
+
+void test_THFloatVector_mul_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+
+ // Initialize randomly
+ for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
+ {
+ x[i] = (float)randDouble();
+ float yVal = (float)randDouble();
+ y_standard[i] = yVal;
+ y_optimized[i] = yVal;
+ }
+
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardFloat_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardFloat_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardFloat_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardFloat_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardFloat_mul() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THFloatVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS );
+ THFloatVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THFloatVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THFloatVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THFloatVector_mul_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ standardFloat_mul( y_standard+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THFloatVector_mul_VSX(y_optimized+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardFloat_mul( y_standard+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THFloatVector_mul_VSX(y_optimized+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardFloat_mul( y_standard+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THFloatVector_mul_VSX(y_optimized+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardFloat_mul( y_standard+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THFloatVector_mul_VSX(y_optimized+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardFloat_mul( y_standard+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THFloatVector_mul_VSX(y_optimized+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ {
+ if(!near(y_optimized[i], y_standard[i]))
+ printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
+ assert(near(y_optimized[i], y_standard[i]));
+ }
+ printf("All assertions PASSED for THFloatVector_mul_VSX() test.\n\n");
+
+
+ free(y_standard);
+ free(y_optimized);
+ free(x);
+}
+
+
+
+int main()
+{
+ printf("\n");
+
+
+ // First test utility functions
+
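+  // near() is expected to treat NaN==NaN and same-signed infinities as
+  // equal, and to reject any mixed NaN/Inf/finite comparison.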
+ assert(!near(0.1, -0.1));
+ assert(!near(0.1f, -0.1f));
+ assert(!near(9, 10));
+ assert(near(0.1, 0.1000001));
+ assert(near(0.1f, 0.1000001f));
+ assert(near(100.764, 100.764));
+ assert(!near(NAN, 0.0));
+ assert(!near(-9.5, NAN));
+ assert(!near(NAN, 100));
+ assert(!near(-0.0, NAN));
+ assert(near(NAN, NAN));
+ assert(near(INFINITY, INFINITY));
+ assert(near(-INFINITY, -INFINITY));
+ assert(!near(INFINITY, NAN));
+ assert(!near(0, INFINITY));
+ assert(!near(-999.4324, INFINITY));
+ assert(!near(INFINITY, 982374.1));
+ assert(!near(-INFINITY, INFINITY));
+
+
+
+ // Then test each vectorized function
+
+ test_THDoubleVector_fill_VSX();
+ test_THFloatVector_fill_VSX();
+
+ test_THDoubleVector_add_VSX();
+ test_THFloatVector_add_VSX();
+
+ test_THDoubleVector_diff_VSX();
+ test_THFloatVector_diff_VSX();
+
+ test_THDoubleVector_scale_VSX();
+ test_THFloatVector_scale_VSX();
+
+ test_THDoubleVector_mul_VSX();
+ test_THFloatVector_mul_VSX();
+
+
+ printf("Finished runnning all tests. All tests PASSED.\n");
+ return 0;
+}
+
+
+#endif // defined RUN_VSX_TESTS
+
+#endif // defined __PPC64__
+
diff --git a/test/test.lua b/test/test.lua
index eb7cf0a..e7e26e4 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -19,6 +19,35 @@ local function maxdiff(x,y)
end
end
+-- workarounds for functions not yet implemented for HalfTensor:
+-- route through a full-precision copy via real(), then back to half()
+function torch.HalfTensor:__sub(other)
+ return (self:real() - other:real()):half()
+end
+
+function torch.HalfTensor:mean(dim)
+ return self:real():mean(dim):half()
+end
+
+function torch.HalfTensor:abs()
+ return self:real():abs():half()
+end
+
+function torch.HalfTensor:max()
+ return self:real():max()
+end
+
+function torch.HalfTensor:add(a, b)
+ return (self:real():add(a, b:real())):half()
+end
+
+function torch.HalfTensor:reshape(a, b)
+ return (self:real():reshape(a, b)):half()
+end
+
+function torch.HalfTensor:fill(a)
+ return self:real():fill(a):half()
+end
+
function torchtest.dot()
local types = {
['torch.DoubleTensor'] = 1e-8, -- for ddot
@@ -1355,6 +1384,18 @@ function torchtest.histc()
local z = torch.Tensor{ 0, 3, 0, 2, 1 }
mytester:assertTensorEq(y,z,precision,'error in torch.histc')
end
+function torchtest.bhistc()
+ local x = torch.Tensor(3, 6)
+ x[1] = torch.Tensor{ 2, 4, 2, 2, 5, 4 }
+ x[2] = torch.Tensor{ 3, 5, 1, 5, 3, 5 }
+ x[3] = torch.Tensor{ 3, 4, 2, 5, 5, 1 }
+ local y = torch.bhistc(x, 5, 1, 5) -- nbins, min, max
+ local z = torch.Tensor(3, 5)
+ z[1] = torch.Tensor{ 0, 3, 0, 2, 1 }
+ z[2] = torch.Tensor{ 1, 0, 2, 0, 3 }
+ z[3] = torch.Tensor{ 1, 1, 1, 1, 2 }
+ mytester:assertTensorEq(y,z,precision,'error in torch.bhistc in last dimension')
+end
function torchtest.ones()
local mx = torch.ones(msize,msize)
local mxx = torch.Tensor()
@@ -3103,7 +3144,13 @@ function torchtest.isTypeOfPattern()
end
function torchtest.isTensor()
- local t = torch.randn(3,4)
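+  -- Run the body with both the 'real' and 'half' conversion methods taken
+  -- from the default Tensor metatable, so each test covers HalfTensor too.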
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_isTensor(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_isTensor(func)
+ local t = func(torch.randn(3,4))
mytester:assert(torch.isTensor(t), 'error in isTensor')
mytester:assert(torch.isTensor(t[1]), 'error in isTensor for subTensor')
mytester:assert(not torch.isTensor(t[1][2]), 'false positive in isTensor')
@@ -3111,14 +3158,26 @@ function torchtest.isTensor()
end
function torchtest.isStorage()
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_isStorage(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_isStorage(func)
local t = torch.randn(3,4)
mytester:assert(torch.isStorage(t:storage()), 'error in isStorage')
mytester:assert(not torch.isStorage(t), 'false positive in isStorage')
end
function torchtest.view()
- local tensor = torch.rand(15)
- local template = torch.rand(3,5)
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_view(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_view(func)
+ local tensor = func(torch.rand(15))
+ local template = func(torch.rand(3,5))
local target = template:size():totable()
mytester:assertTableEq(tensor:viewAs(template):size():totable(), target, 'Error in viewAs')
mytester:assertTableEq(tensor:view(3,5):size():totable(), target, 'Error in view')
@@ -3129,7 +3188,7 @@ function torchtest.view()
tensor_view:fill(torch.rand(1)[1])
mytester:asserteq((tensor_view-tensor):abs():max(), 0, 'Error in view')
- local target_tensor = torch.Tensor()
+ local target_tensor = func(torch.Tensor())
mytester:assertTableEq(target_tensor:viewAs(tensor, template):size():totable(), target, 'Error in viewAs')
mytester:assertTableEq(target_tensor:view(tensor, 3,5):size():totable(), target, 'Error in view')
mytester:assertTableEq(target_tensor:view(tensor, torch.LongStorage{3,5}):size():totable(), target, 'Error in view using LongStorage')
@@ -3140,9 +3199,15 @@ function torchtest.view()
end
function torchtest.expand()
- local result = torch.Tensor()
- local tensor = torch.rand(8,1)
- local template = torch.rand(8,5)
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_expand(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_expand(func)
+ local result = func(torch.Tensor())
+ local tensor = func(torch.rand(8,1))
+ local template = func(torch.rand(8,5))
local target = template:size():totable()
mytester:assertTableEq(tensor:expandAs(template):size():totable(), target, 'Error in expandAs')
mytester:assertTableEq(tensor:expand(8,5):size():totable(), target, 'Error in expand')
@@ -3157,8 +3222,14 @@ function torchtest.expand()
end
function torchtest.repeatTensor()
- local result = torch.Tensor()
- local tensor = torch.rand(8,4)
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_repeatTensor(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_repeatTensor(func)
+ local result = func(torch.Tensor())
+ local tensor = func(torch.rand(8,4))
local size = {3,1,1}
local sizeStorage = torch.LongStorage(size)
local target = {3,8,4}
@@ -3172,10 +3243,16 @@ function torchtest.repeatTensor()
end
function torchtest.isSameSizeAs()
- local t1 = torch.Tensor(3, 4, 9, 10)
- local t2 = torch.Tensor(3, 4)
- local t3 = torch.Tensor(1, 9, 3, 3)
- local t4 = torch.Tensor(3, 4, 9, 10)
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_isSameSizeAs(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_isSameSizeAs(func)
+ local t1 = func(torch.Tensor(3, 4, 9, 10))
+ local t2 = func(torch.Tensor(3, 4))
+ local t3 = func(torch.Tensor(1, 9, 3, 3))
+ local t4 = func(torch.Tensor(3, 4, 9, 10))
mytester:assert(t1:isSameSizeAs(t2) == false, "wrong answer ")
mytester:assert(t1:isSameSizeAs(t3) == false, "wrong answer ")
@@ -3183,15 +3260,21 @@ function torchtest.isSameSizeAs()
end
function torchtest.isSetTo()
- local t1 = torch.Tensor(3, 4, 9, 10)
- local t2 = torch.Tensor(3, 4, 9, 10)
- local t3 = torch.Tensor():set(t1)
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_isSetTo(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_isSetTo(func)
+ local t1 = func(torch.Tensor(3, 4, 9, 10))
+ local t2 = func(torch.Tensor(3, 4, 9, 10))
+ local t3 = func(torch.Tensor()):set(t1)
local t4 = t3:reshape(12, 90)
mytester:assert(t1:isSetTo(t2) == false, "tensors do not share storage")
mytester:assert(t1:isSetTo(t3) == true, "tensor is set to other")
mytester:assert(t3:isSetTo(t1) == true, "isSetTo should be symmetric")
mytester:assert(t1:isSetTo(t4) == false, "tensors have different view")
- mytester:assert(not torch.Tensor():isSetTo(torch.Tensor()),
+ mytester:assert(not func(torch.Tensor()):isSetTo(func(torch.Tensor())),
"Tensors with no storages should not appear to be set " ..
"to each other")
end
@@ -3229,7 +3312,13 @@ function torchtest.equal()
end
function torchtest.isSize()
- local t1 = torch.Tensor(3, 4, 5)
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_isSize(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_isSize(func)
+ local t1 = func(torch.Tensor(3, 4, 5))
local s1 = torch.LongStorage({3, 4, 5})
local s2 = torch.LongStorage({5, 4, 3})
@@ -3246,6 +3335,7 @@ function torchtest.elementSize()
local long = torch.LongStorage():elementSize()
local float = torch.FloatStorage():elementSize()
local double = torch.DoubleStorage():elementSize()
+ local half = torch.HalfStorage():elementSize()
mytester:asserteq(byte, torch.ByteTensor():elementSize())
mytester:asserteq(char, torch.CharTensor():elementSize())
@@ -3254,6 +3344,7 @@ function torchtest.elementSize()
mytester:asserteq(long, torch.LongTensor():elementSize())
mytester:asserteq(float, torch.FloatTensor():elementSize())
mytester:asserteq(double, torch.DoubleTensor():elementSize())
+ mytester:asserteq(half, torch.HalfTensor():elementSize())
mytester:assertne(byte, 0)
mytester:assertne(char, 0)
@@ -3262,6 +3353,7 @@ function torchtest.elementSize()
mytester:assertne(long, 0)
mytester:assertne(float, 0)
mytester:assertne(double, 0)
+ mytester:assertne(half, 0)
-- These tests are portable, not necessarily strict for your system.
mytester:asserteq(byte, 1)
@@ -3272,11 +3364,18 @@ function torchtest.elementSize()
mytester:assert(long >= 4)
mytester:assert(long >= int)
mytester:assert(double >= float)
+ mytester:assert(half <= float)
end
function torchtest.split()
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_split(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_split(func)
local result = {}
- local tensor = torch.rand(7,4)
+ local tensor = func(torch.rand(7,4))
local splitSize = 3
local targetSize = {{3,4},{3,4},{1,4}}
local dim = 1
@@ -3301,8 +3400,14 @@ function torchtest.split()
end
function torchtest.chunk()
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_chunk(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_chunk(func)
local result = {}
- local tensor = torch.rand(4,7)
+ local tensor = func(torch.rand(4,7))
local nChunk = 3
local targetSize = {{4,3},{4,3},{4,1}}
local dim = 2
@@ -3322,24 +3427,34 @@ function torchtest.chunk()
end
end
-function torchtest.totable()
+function torchtest.table()
+ local convStorage = {
+ ['real'] = 'FloatStorage',
+ ['half'] = 'HalfStorage'
+ }
+  for k,v in pairs(convStorage) do
+ torchtest_totable(torch.getmetatable(torch.Tensor():type())[k], v)
+ end
+end
+
+function torchtest_totable(func, storageType)
local table0D = {}
- local tensor0D = torch.Tensor(table0D)
+ local tensor0D = func(torch.Tensor(table0D))
mytester:assertTableEq(torch.totable(tensor0D), table0D, 'tensor0D:totable incorrect')
local table1D = {1, 2, 3}
- local tensor1D = torch.Tensor(table1D)
- local storage = torch.Storage(table1D)
+ local tensor1D = func(torch.Tensor(table1D))
+ local storage = torch[storageType](table1D)
mytester:assertTableEq(tensor1D:totable(), table1D, 'tensor1D:totable incorrect')
mytester:assertTableEq(storage:totable(), table1D, 'storage:totable incorrect')
mytester:assertTableEq(torch.totable(tensor1D), table1D, 'torch.totable incorrect for Tensors')
mytester:assertTableEq(torch.totable(storage), table1D, 'torch.totable incorrect for Storages')
local table2D = {{1, 2}, {3, 4}}
- local tensor2D = torch.Tensor(table2D)
+ local tensor2D = func(torch.Tensor(table2D))
mytester:assertTableEq(tensor2D:totable(), table2D, 'tensor2D:totable incorrect')
- local tensor3D = torch.Tensor({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})
+ local tensor3D = func(torch.Tensor({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}))
local tensorNonContig = tensor3D:select(2, 2)
mytester:assert(not tensorNonContig:isContiguous(), 'invalid test')
mytester:assertTableEq(tensorNonContig:totable(), {{3, 4}, {7, 8}},
@@ -3347,6 +3462,12 @@ function torchtest.totable()
end
function torchtest.permute()
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_permute(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_permute(func)
local orig = {1,2,3,4,5,6,7}
local perm = torch.randperm(7):totable()
local x = torch.Tensor(unpack(orig)):fill(0)
diff --git a/test/test_half.lua b/test/test_half.lua
new file mode 100644
index 0000000..bf3830b
--- /dev/null
+++ b/test/test_half.lua
@@ -0,0 +1,55 @@
+local mytester
+local torchtest = torch.TestSuite()
+
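+-- Smoke tests for torch.HalfTensor: contiguity and stride bookkeeping,
+-- transpose, resize+copy, narrow vs. sub, and select+clone semantics.
+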
+-- Lua 5.2 compatibility
+local loadstring = loadstring or load
+local unpack = unpack or table.unpack
+
+function torchtest.easy()
+ local x=torch.randn(5, 6):half()
+ mytester:assert(x:isContiguous(), 'x should be contiguous')
+ mytester:assert(x:dim() == 2, 'x should have dim of 2')
+ mytester:assert(x:nDimension() == 2, 'x should have nDimension of 2')
+ mytester:assert(x:nElement() == 5 * 6, 'x should have 30 elements')
+ local stride = x:stride()
+ local expectedStride = torch.LongStorage{6,1}
+ for i=1,stride:size() do
+ mytester:assert(stride[i] == expectedStride[i], "stride is wrong")
+ end
+
+ x=x:t()
+ mytester:assert(not x:isContiguous(), 'x transpose should not be contiguous')
+ x=x:transpose(1,2)
+ mytester:assert(x:isContiguous(), 'x should be contiguous after 2 transposes')
+
+ local y=torch.HalfTensor()
+ y:resizeAs(x:t()):copy(x:t())
+ mytester:assert(x:isContiguous(), 'after resize and copy, x should be contiguous')
+ mytester:assertTensorEq(y, x:t(), 0.001, 'copy broken after resizeAs')
+ local z=torch.HalfTensor()
+ z:resize(6, 5):copy(x:t())
+  mytester:assertTensorEq(z, x:t(), 0.001, 'copy broken after resize')
+end
+
+function torchtest.narrowSub()
+ local x = torch.randn(5, 6):half()
+ local narrow = x:narrow(1, 2, 3)
+ local sub = x:sub(2, 4)
+ mytester:assertTensorEq(narrow, sub, 0.001, 'narrow not equal to sub')
+end
+
+function torchtest.selectClone()
+ local x = torch.zeros(5, 6)
+ x:select(1,2):fill(2)
+ x=x:half()
+ local y=x:clone()
+ mytester:assertTensorEq(x, y, 0.001, 'not equal after select and clone')
+ x:select(1,1):fill(3)
+ mytester:assert(y[1][1] == 0, 'clone broken')
+end
+
+torch.setheaptracking(true)
+math.randomseed(os.time())
+mytester = torch.Tester()
+mytester:add(torchtest)
+mytester:run()
diff --git a/test/test_writeObject.lua b/test/test_writeObject.lua
index 1013a96..52bcb71 100644
--- a/test/test_writeObject.lua
+++ b/test/test_writeObject.lua
@@ -4,6 +4,9 @@ local myTester = torch.Tester()
local tests = torch.TestSuite()
+function torch.HalfTensor:norm()
+ return self:real():norm()
+end
-- checks that an object can be written and unwritten
-- returns false if an error occurs
@@ -66,7 +69,13 @@ function tests.test_a_recursive_closure()
end
function tests.test_a_tensor()
- local x = torch.rand(5, 10)
+ for k,v in ipairs({"real", "half"}) do
+ tests_test_a_tensor(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function tests_test_a_tensor(func)
+ local x = func(torch.rand(5, 10))
local xcopy = serializeAndDeserialize(x)
myTester:assert(x:norm() == xcopy:norm(), 'tensors should be the same')
end
diff --git a/torchcwrap.lua b/torchcwrap.lua
index ab0df43..551bd05 100644
--- a/torchcwrap.lua
+++ b/torchcwrap.lua
@@ -202,7 +202,7 @@ types.IndexTensor = {
}
for _,typename in ipairs({"ByteTensor", "CharTensor", "ShortTensor", "IntTensor", "LongTensor",
- "FloatTensor", "DoubleTensor"}) do
+ "FloatTensor", "HalfTensor", "DoubleTensor"}) do
types[typename] = {
@@ -460,3 +460,56 @@ types.charoption = {
postcall = function(arg)
end
}
+
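+-- cwrap type handlers for ptrdiff_t and size_t: each field returns the C
+-- fragment the wrapper generator splices in to declare, check, read, and
+-- push the numeric argument.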
+for _,typename in ipairs({"ptrdiff_t", "size_t"}) do
+ types[typename] = {
+
+ helpname = function(arg)
+ return typename
+ end,
+
+ declare = function(arg)
+ -- if it is a number we initialize here
+ local default = tonumber(tostring(arg.default)) or 0
+ return string.format("%s arg%d = %g;", typename, arg.i, default)
+ end,
+
+ check = function(arg, idx)
+ return string.format("lua_isnumber(L, %d)", idx)
+ end,
+
+ read = function(arg, idx)
+ return string.format("arg%d = (%s)lua_tonumber(L, %d);", arg.i, typename, idx)
+ end,
+
+ init = function(arg)
+ -- otherwise do it here
+ if arg.default then
+ local default = tostring(arg.default)
+ if not tonumber(default) then
+ return string.format("arg%d = %s;", arg.i, default)
+ end
+ end
+ end,
+
+ carg = function(arg)
+ return string.format('arg%d', arg.i)
+ end,
+
+ creturn = function(arg)
+ return string.format('arg%d', arg.i)
+ end,
+
+ precall = function(arg)
+ if arg.returned then
+ return string.format('lua_pushnumber(L, (lua_Number)arg%d);', arg.i)
+ end
+ end,
+
+ postcall = function(arg)
+ if arg.creturned then
+ return string.format('lua_pushnumber(L, (lua_Number)arg%d);', arg.i)
+ end
+ end
+ }
+end