github.com/torch/torch7.git
author    Soumith Chintala <soumith@gmail.com>  2017-01-01 20:34:46 +0300
committer GitHub <noreply@github.com>  2017-01-01 20:34:46 +0300
commit    306984ff623886b8c07c8934623bd94fb52ed983 (patch)
tree      75b824745614ef5a84d684dd9321981914e4930b
parent    d41580eccefcbc1d11d404e0c4ae522560f8e263 (diff)
parent    be986fb5d8611e9cb158b69d0977564e41eb93f2 (diff)

Merge branch 'master' into contiguous-cat-1d
-rw-r--r--  FFI.lua                           | 145
-rw-r--r--  Storage.c                         |   3
-rw-r--r--  Tensor.c                          |   3
-rw-r--r--  Tensor.lua                        |  18
-rw-r--r--  TensorMath.lua                    |  65
-rw-r--r--  Tester.lua                        |   1
-rwxr-xr-x  doc/maths.md                      |  37
-rw-r--r--  generic/Storage.c                 |   4
-rw-r--r--  generic/Tensor.c                  |  49
-rw-r--r--  generic/TensorOperator.c          |  10
-rw-r--r--  generic/luaG.h                    |  30
-rw-r--r--  init.c                            |   7
-rw-r--r--  lib/TH/CMakeLists.txt             |  31
-rw-r--r--  lib/TH/THDiskFile.c               |  12
-rw-r--r--  lib/TH/THFile.c                   |   3
-rw-r--r--  lib/TH/THFile.h                   |   7
-rw-r--r--  lib/TH/THFilePrivate.h            |   7
-rw-r--r--  lib/TH/THGenerateAllTypes.h       |   1
-rw-r--r--  lib/TH/THGenerateHalfType.h       |  19
-rw-r--r--  lib/TH/THHalf.c                   | 138
-rw-r--r--  lib/TH/THHalf.h                   |  40
-rw-r--r--  lib/TH/THMemoryFile.c             |  14
-rw-r--r--  lib/TH/THStorage.c                |   6
-rw-r--r--  lib/TH/THStorage.h                |   6
-rw-r--r--  lib/TH/THTensor.c                 |   6
-rw-r--r--  lib/TH/THTensor.h                 |   6
-rw-r--r--  lib/TH/THVector.c                 |   5
-rw-r--r--  lib/TH/generic/THBlas.c           |  11
-rw-r--r--  lib/TH/generic/THStorageCopy.c    |  47
-rw-r--r--  lib/TH/generic/THStorageCopy.h    |   1
-rw-r--r--  lib/TH/generic/THTensorConv.c     |   6
-rw-r--r--  lib/TH/generic/THTensorConv.h     |   3
-rw-r--r--  lib/TH/generic/THTensorCopy.c     |  26
-rw-r--r--  lib/TH/generic/THTensorCopy.h     |   1
-rw-r--r--  lib/TH/generic/THTensorMath.c     |  50
-rw-r--r--  lib/TH/generic/THTensorMath.h     |   4
-rw-r--r--  lib/TH/generic/THVectorDispatch.c |  31
-rw-r--r--  lib/TH/generic/simd/simd.h        |  57
-rw-r--r--  lib/TH/vector/VSX.c               | 1915
-rw-r--r--  test/test.lua                     | 173
-rw-r--r--  test/test_half.lua                |  55
-rw-r--r--  test/test_writeObject.lua         |  11
-rw-r--r--  torchcwrap.lua                    |  55
43 files changed, 2899 insertions(+), 220 deletions(-)
diff --git a/FFI.lua b/FFI.lua
index 3cc0b21..333e7b5 100644
--- a/FFI.lua
+++ b/FFI.lua
@@ -15,6 +15,7 @@ local function checkArgumentType(expected, actual, fn, ud, level)
end
if ok then
+
local Real2real = {
Byte='unsigned char',
Char='char',
@@ -22,7 +23,8 @@ if ok then
Int='int',
Long='long',
Float='float',
- Double='double'
+ Double='double',
+ Half='THHalf'
}
-- Allocator
@@ -34,6 +36,14 @@ typedef struct THAllocator {
} THAllocator;
]]
+ -- Half
+ ffi.cdef[[
+typedef struct {
+ unsigned short x;
+} __THHalf;
+typedef __THHalf THHalf;
+]]
+
-- Storage
for Real, real in pairs(Real2real) do
@@ -76,7 +86,7 @@ typedef struct THRealTensor
long *size;
long *stride;
int nDimension;
-
+
THRealStorage *storage;
ptrdiff_t storageOffset;
int refcount;
@@ -88,7 +98,8 @@ typedef struct THRealTensor
cdefs = cdefs:gsub('Real', Real):gsub('real', real)
ffi.cdef(cdefs)
- local Tensor = torch.getmetatable(string.format('torch.%sTensor', Real))
+ local Tensor_type = string.format('torch.%sTensor', Real)
+ local Tensor = torch.getmetatable(Tensor_type)
local Tensor_tt = ffi.typeof('TH' .. Real .. 'Tensor**')
rawset(Tensor,
@@ -107,75 +118,77 @@ typedef struct THRealTensor
end)
-- faster apply (contiguous case)
- local apply = Tensor.apply
- rawset(Tensor,
- "apply",
- function(self, func)
- if self:isContiguous() and self.data then
- local self_d = self:data()
- for i=0,self:nElement()-1 do
- local res = func(tonumber(self_d[i])) -- tonumber() required for long...
- if res then
- self_d[i] = res
+ if Tensor_type ~= 'torch.HalfTensor' then
+ local apply = Tensor.apply
+ rawset(Tensor,
+ "apply",
+ function(self, func)
+ if self:isContiguous() and self.data then
+ local self_d = self:data()
+ for i=0,self:nElement()-1 do
+ local res = func(tonumber(self_d[i])) -- tonumber() required for long...
+ if res then
+ self_d[i] = res
+ end
end
+ return self
+ else
+ return apply(self, func)
end
- return self
- else
- return apply(self, func)
- end
- end)
-
- -- faster map (contiguous case)
- local map = Tensor.map
- rawset(Tensor,
- "map",
- function(self, src, func)
- checkArgument(torch.isTensor(src), "map", 1, "tensor expected")
- checkArgumentType(self:type(), src:type(), "map", 1)
-
- if self:isContiguous() and src:isContiguous() and self.data and src.data then
- local self_d = self:data()
- local src_d = src:data()
- assert(src:nElement() == self:nElement(), 'size mismatch')
- for i=0,self:nElement()-1 do
- local res = func(tonumber(self_d[i]), tonumber(src_d[i])) -- tonumber() required for long...
- if res then
- self_d[i] = res
+ end)
+
+ -- faster map (contiguous case)
+ local map = Tensor.map
+ rawset(Tensor,
+ "map",
+ function(self, src, func)
+ checkArgument(torch.isTensor(src), "map", 1, "tensor expected")
+ checkArgumentType(self:type(), src:type(), "map", 1)
+
+ if self:isContiguous() and src:isContiguous() and self.data and src.data then
+ local self_d = self:data()
+ local src_d = src:data()
+ assert(src:nElement() == self:nElement(), 'size mismatch')
+ for i=0,self:nElement()-1 do
+ local res = func(tonumber(self_d[i]), tonumber(src_d[i])) -- tonumber() required for long...
+ if res then
+ self_d[i] = res
+ end
end
+ return self
+ else
+ return map(self, src, func)
end
- return self
- else
- return map(self, src, func)
- end
- end)
-
- -- faster map2 (contiguous case)
- local map2 = Tensor.map2
- rawset(Tensor,
- "map2",
- function(self, src1, src2, func)
- checkArgument(torch.isTensor(src1), "map", 1, "tensor expected")
- checkArgument(torch.isTensor(src2), "map", 2, "tensor expected")
- checkArgumentType(self:type(), src1:type(), "map", 1)
- checkArgumentType(self:type(), src2:type(), "map", 2)
-
- if self:isContiguous() and src1:isContiguous() and src2:isContiguous() and self.data and src1.data and src2.data then
- local self_d = self:data()
- local src1_d = src1:data()
- local src2_d = src2:data()
- assert(src1:nElement() == self:nElement(), 'size mismatch')
- assert(src2:nElement() == self:nElement(), 'size mismatch')
- for i=0,self:nElement()-1 do
- local res = func(tonumber(self_d[i]), tonumber(src1_d[i]), tonumber(src2_d[i])) -- tonumber() required for long...
- if res then
- self_d[i] = res
+ end)
+
+ -- faster map2 (contiguous case)
+ local map2 = Tensor.map2
+ rawset(Tensor,
+ "map2",
+ function(self, src1, src2, func)
+ checkArgument(torch.isTensor(src1), "map", 1, "tensor expected")
+ checkArgument(torch.isTensor(src2), "map", 2, "tensor expected")
+ checkArgumentType(self:type(), src1:type(), "map", 1)
+ checkArgumentType(self:type(), src2:type(), "map", 2)
+
+ if self:isContiguous() and src1:isContiguous() and src2:isContiguous() and self.data and src1.data and src2.data then
+ local self_d = self:data()
+ local src1_d = src1:data()
+ local src2_d = src2:data()
+ assert(src1:nElement() == self:nElement(), 'size mismatch')
+ assert(src2:nElement() == self:nElement(), 'size mismatch')
+ for i=0,self:nElement()-1 do
+ local res = func(tonumber(self_d[i]), tonumber(src1_d[i]), tonumber(src2_d[i])) -- tonumber() required for long...
+ if res then
+ self_d[i] = res
+ end
end
+ return self
+ else
+ return map2(self, src1, src2, func)
end
- return self
- else
- return map2(self, src1, src2, func)
- end
- end)
+ end)
+ end
end
-- torch.data
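
The FFI override above replaces `apply`, `map`, and `map2` with versions that walk the raw storage buffer whenever the tensors involved are contiguous, falling back to the original stride-aware implementations otherwise; `torch.HalfTensor` is skipped because its elements are `THHalf` structs that `tonumber()` cannot convert. A minimal C sketch of the contiguous fast path (all names illustrative, not TH API):

```c
/* When a tensor's storage is contiguous, apply() needs no stride
 * bookkeeping and reduces to a flat loop over the raw buffer. */
#include <stdio.h>

static void apply_contiguous(double *data, long n, double (*f)(double)) {
  for (long i = 0; i < n; i++)
    data[i] = f(data[i]);
}

static double square(double x) { return x * x; }

int main(void) {
  double storage[6] = {1, 2, 3, 4, 5, 6}; /* a contiguous 2x3 tensor */
  apply_contiguous(storage, 6, square);   /* no per-dimension indexing */
  printf("%g %g\n", storage[0], storage[5]); /* prints: 1 36 */
  return 0;
}
```
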
diff --git a/Storage.c b/Storage.c
index 28c4e87..874838e 100644
--- a/Storage.c
+++ b/Storage.c
@@ -7,3 +7,6 @@
#include "generic/Storage.c"
#include "THGenerateAllTypes.h"
+
+#include "generic/Storage.c"
+#include "THGenerateHalfType.h"
diff --git a/Tensor.c b/Tensor.c
index 4bfbc6a..bf78d1a 100644
--- a/Tensor.c
+++ b/Tensor.c
@@ -7,3 +7,6 @@
#include "generic/Tensor.c"
#include "THGenerateAllTypes.h"
+
+#include "generic/Tensor.c"
+#include "THGenerateHalfType.h"
diff --git a/Tensor.lua b/Tensor.lua
index b4b3e95..9a8215b 100644
--- a/Tensor.lua
+++ b/Tensor.lua
@@ -5,14 +5,14 @@ local Storage = {}
local Tensor = {}
-- types
-local types = {'Byte', 'Char', 'Short', 'Int', 'Long', 'Float', 'Double'}
+local types = {'Byte', 'Char', 'Short', 'Int', 'Long', 'Float', 'Half', 'Double'}
-- Lua 5.2 compatibility
local log10 = math.log10 or function(x) return math.log(x, 10) end
-- tostring() functions for Tensor and Storage
local function Storage__printformat(self)
- if self:size() == 0 then
+ if self:size() == 0 then
return "", nil, 0
end
local intMode = true
@@ -277,6 +277,10 @@ function Tensor.double(self)
return self:type('torch.DoubleTensor')
end
+function Tensor.half(self)
+ return self:type('torch.HalfTensor')
+end
+
function Tensor.real(self)
return self:type(torch.getdefaulttensortype())
end
@@ -556,6 +560,14 @@ torch.permute = Tensor.permute
for _,type in ipairs(types) do
local metatable = torch.getmetatable('torch.' .. type .. 'Tensor')
for funcname, func in pairs(Tensor) do
- rawset(metatable, funcname, func)
+      if funcname ~= 'totable' or type ~= 'Half' then
+ rawset(metatable, funcname, func)
+ else
+ local function Tensor__totable(self)
+ local host_tensor = self:float()
+         return host_tensor:totable()
+ end
+ rawset(torch.getmetatable('torch.HalfTensor'), 'totable', Tensor__totable)
+ end
end
end
diff --git a/TensorMath.lua b/TensorMath.lua
index 5971a7b..d816740 100644
--- a/TensorMath.lua
+++ b/TensorMath.lua
@@ -6,56 +6,7 @@ local interface = wrap.CInterface.new()
local method = wrap.CInterface.new()
local argtypes = wrap.CInterface.argtypes
-argtypes['ptrdiff_t'] = {
-
- helpname = function(arg)
- return 'ptrdiff_t'
- end,
-
- declare = function(arg)
- -- if it is a number we initialize here
- local default = tonumber(tostring(arg.default)) or 0
- return string.format("%s arg%d = %g;", 'ptrdiff_t', arg.i, default)
- end,
-
- check = function(arg, idx)
- return string.format("lua_isnumber(L, %d)", idx)
- end,
-
- read = function(arg, idx)
- return string.format("arg%d = (%s)lua_tonumber(L, %d);", arg.i, 'ptrdiff_t', idx)
- end,
-
- init = function(arg)
- -- otherwise do it here
- if arg.default then
- local default = tostring(arg.default)
- if not tonumber(default) then
- return string.format("arg%d = %s;", arg.i, default)
- end
- end
- end,
-
- carg = function(arg)
- return string.format('arg%d', arg.i)
- end,
-
- creturn = function(arg)
- return string.format('arg%d', arg.i)
- end,
-
- precall = function(arg)
- if arg.returned then
- return string.format('lua_pushnumber(L, (lua_Number)arg%d);', arg.i)
- end
- end,
-
- postcall = function(arg)
- if arg.creturned then
- return string.format('lua_pushnumber(L, (lua_Number)arg%d);', arg.i)
- end
- end
-}
+argtypes['ptrdiff_t'] = wrap.types.ptrdiff_t
interface:print([[
#include "TH.h"
@@ -216,6 +167,7 @@ local reals = {ByteTensor='unsigned char',
IntTensor='int',
LongTensor='long',
FloatTensor='float',
+ HalfTensor='half',
DoubleTensor='double'}
local accreals = {ByteTensor='long',
@@ -224,11 +176,12 @@ local accreals = {ByteTensor='long',
IntTensor='long',
LongTensor='long',
FloatTensor='double',
+ HalfTensor='float',
DoubleTensor='double'}
for _,Tensor in ipairs({"ByteTensor", "CharTensor",
"ShortTensor", "IntTensor", "LongTensor",
- "FloatTensor", "DoubleTensor"}) do
+ "FloatTensor", "HalfTensor", "DoubleTensor"}) do
local real = reals[Tensor]
local accreal = accreals[Tensor]
@@ -257,6 +210,7 @@ for _,Tensor in ipairs({"ByteTensor", "CharTensor",
end
end
+ if Tensor ~= 'HalfTensor' then
wrap("zero",
cname("zero"),
{{name=Tensor, returned=true}})
@@ -1030,6 +984,7 @@ static void THTensor_random1__(THTensor *self, THGenerator *gen, long b)
cname("nonzero"),
{{name="IndexTensor", default=true, returned=true},
{name=Tensor}})
+ end -- ~= HalfTensor
if Tensor == 'ByteTensor' then
-- Logical accumulators only apply to ByteTensor
@@ -1089,6 +1044,14 @@ static void THTensor_random1__(THTensor *self, THGenerator *gen, long b)
{name="double",default=0},
{name="double",default=0}})
+ wrap("bhistc",
+ cname("bhistc"),
+ {{name=Tensor, default=true, returned=true},
+ {name=Tensor},
+ {name="long",default=100},
+ {name="double",default=0},
+ {name="double",default=0}})
+
wrap("norm",
cname("normall"),
{{name=Tensor},
diff --git a/Tester.lua b/Tester.lua
index f512edb..6509413 100644
--- a/Tester.lua
+++ b/Tester.lua
@@ -687,6 +687,7 @@ local typesMatching = {
['torch.LongStorage'] = torch.LongTensor,
['torch.FloatStorage'] = torch.FloatTensor,
['torch.DoubleStorage'] = torch.DoubleTensor,
+ ['torch.HalfStorage'] = torch.HalfTensor,
}
--[[ Tests for storage equality.
diff --git a/doc/maths.md b/doc/maths.md
index 44e5ea6..b4f1592 100755
--- a/doc/maths.md
+++ b/doc/maths.md
@@ -158,6 +158,43 @@ By default the elements are sorted into 100 equally spaced bins between the mini
`y = torch.histc(x, n, min, max)` same as above with `n` bins and `[min, max]` as elements range.
+<a name="torch.bhistc"></a>
+### [res] torch.bhistc([res,] x [,nbins, min_value, max_value]) ###
+<a name="torch.bhistc"></a>
+
+`y = torch.bhistc(x)` returns the histogram of the elements in the 2D tensor `x`, computed row by row along the last dimension.
+By default the elements are sorted into 100 equally spaced bins between the minimum and maximum values of `x`.
+
+`y = torch.bhistc(x, n)` same as above with `n` bins.
+
+`y = torch.bhistc(x, n, min, max)` same as above with `n` bins and `[min, max]` as the element range.
+
+```lua
+x = torch.Tensor(3, 6)
+
+> x[1] = torch.Tensor{ 2, 4, 2, 2, 5, 4 }
+> x[2] = torch.Tensor{ 3, 5, 1, 5, 3, 5 }
+> x[3] = torch.Tensor{ 3, 4, 2, 5, 5, 1 }
+
+> x
+ 2 4 2 2 5 4
+ 3 5 1 5 3 5
+ 3 4 2 5 5 1
+[torch.DoubleTensor of size 3x6]
+
+> torch.bhistc(x, 5, 1, 5)
+ 0 3 0 2 1
+ 1 0 2 0 3
+ 1 1 1 1 2
+[torch.DoubleTensor of size 3x5]
+
+> y = torch.Tensor(1, 6):copy(x[1])
+
+> torch.bhistc(y, 5)
+ 3 0 2 0 1
+[torch.DoubleTensor of size 1x5]
+```
+
<a name="torch.linspace"></a>
### [res] torch.linspace([res,] x1, x2, [,n]) ###
<a name="torch.linspace"></a>
diff --git a/generic/Storage.c b/generic/Storage.c
index 134dc63..a6652a5 100644
--- a/generic/Storage.c
+++ b/generic/Storage.c
@@ -41,7 +41,7 @@ static int torch_Storage_(new)(lua_State *L)
THStorage_(free)(storage);
luaL_error(L, "element at index %d is not a number", i);
}
- THStorage_(set)(storage, i-1, (real)lua_tonumber(L, -1));
+ THStorage_(set)(storage, i-1, LUA_NUMBER_TO_REAL(lua_tonumber(L, -1)));
lua_pop(L, 1);
}
}
@@ -131,6 +131,8 @@ static int torch_Storage_(copy)(lua_State *L)
THStorage_(copyFloat)(storage, src);
else if( (src = luaT_toudata(L, 2, "torch.DoubleStorage")) )
THStorage_(copyDouble)(storage, src);
+ else if( (src = luaT_toudata(L, 2, "torch.HalfStorage")) )
+ THStorage_(copyHalf)(storage, src);
else
luaL_typerror(L, 2, "torch.*Storage");
lua_settop(L, 1);
diff --git a/generic/Tensor.c b/generic/Tensor.c
index abb7819..c2417fe 100644
--- a/generic/Tensor.c
+++ b/generic/Tensor.c
@@ -142,7 +142,7 @@ static int torch_Tensor_(new)(lua_State *L)
THTensor_(free)(tensor);
THError("invalid element (not a number)");
}
- THStorage_(set)(THTensor_(storage)(tensor), si++, (real)lua_tonumber(L, -1));
+ THStorage_(set)(THTensor_(storage)(tensor), si++, LUA_NUMBER_TO_REAL(lua_tonumber(L, -1)));
lua_pop(L, 1);
}
@@ -384,6 +384,7 @@ static int torch_Tensor_(select)(lua_State *L)
return 1;
}
+#ifndef TH_REAL_IS_HALF
static int torch_Tensor_(indexSelect)(lua_State *L)
{
int narg = lua_gettop(L);
@@ -567,6 +568,7 @@ static int torch_Tensor_(maskedFill)(lua_State *L)
return 1;
}
+#endif
static int torch_Tensor_(transpose)(lua_State *L)
{
@@ -675,6 +677,8 @@ static int torch_Tensor_(copy)(lua_State *L)
THTensor_(copyFloat)(tensor, src);
else if( (src = luaT_toudata(L, 2, "torch.DoubleTensor")) )
THTensor_(copyDouble)(tensor, src);
+ else if( (src = luaT_toudata(L, 2, "torch.HalfTensor")) )
+ THTensor_(copyHalf)(tensor, src);
else
luaL_typerror(L, 2, "torch.*Tensor");
lua_settop(L, 1);
@@ -700,10 +704,14 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
THArgCheck(index >= 0 && index < tensor->size[0], 2, "out of range");
THStorage_(set)(tensor->storage, tensor->storageOffset+index*tensor->stride[0], value);
} else {
+#ifndef TH_REAL_IS_HALF
tensor = THTensor_(newWithTensor)(tensor);
THTensor_(narrow)(tensor, NULL, 0, index, 1);
THTensor_(fill)(tensor, value);
THTensor_(free)(tensor);
+#else
+ THError("fill on torch.HalfTensor not yet supported");
+#endif
}
} else if( (src = luaT_toudata(L, 3, torch_Tensor)) ) {
tensor = THTensor_(newWithTensor)(tensor);
@@ -745,6 +753,11 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
THTensor_(narrow)(tensor, NULL, 0, index, 1);
THTensor_(copyDouble)(tensor, src);
THTensor_(free)(tensor);
+ } else if( (src = luaT_toudata(L, 3, "torch.HalfTensor")) ) {
+ tensor = THTensor_(newWithTensor)(tensor);
+ THTensor_(narrow)(tensor, NULL, 0, index, 1);
+ THTensor_(copyHalf)(tensor, src);
+ THTensor_(free)(tensor);
} else {
luaL_typerror(L, 3, "torch.*Tensor");
}
@@ -829,7 +842,11 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
/* doing a copy */
void *src;
if (lua_isnumber(L,3)) {
- THTensor_(fill)(tensor, lua_tonumber(L,3));
+#ifndef TH_REAL_IS_HALF
+ THTensor_(fill)(tensor, LUA_NUMBER_TO_REAL(lua_tonumber(L,3)));
+#else
+ THError("fill on torch.HalfTensor not yet supported");
+#endif
} else if( (src = luaT_toudata(L, 3, torch_Tensor)) ) {
THTensor_(copy)(tensor, src);
} else if( (src = luaT_toudata(L, 3, "torch.ByteTensor")) ) {
@@ -846,6 +863,8 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
THTensor_(copyFloat)(tensor, src);
} else if( (src = luaT_toudata(L, 3, "torch.DoubleTensor")) ) {
THTensor_(copyDouble)(tensor, src);
+ } else if( (src = luaT_toudata(L, 3, "torch.HalfTensor")) ) {
+ THTensor_(copyHalf)(tensor, src);
} else {
luaL_typerror(L, 3, "torch.*Tensor");
}
@@ -855,6 +874,7 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
}
else if((mask = luaT_toudata(L, 2, "torch.ByteTensor")))
{
+#ifndef TH_REAL_IS_HALF
THTensor *vals;
if (lua_isnumber(L, 3))
{
@@ -868,6 +888,9 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
{
THError("number or " torch_Tensor " expected");
}
+#else
+ THError("ByteTensor indexing not yet supported with half types");
+#endif
}
else
lua_pushboolean(L, 0);
@@ -916,7 +939,7 @@ static int torch_Tensor_(__index__)(lua_State *L)
THArgCheck((z >= 0) && (z < tensor->size[dim]), 2, "index out of bound");
index += z*tensor->stride[dim];
}
- luaG_(pushreal)(L, (double)THStorage_(get)(THTensor_(storage)(tensor), index));
+ luaG_(pushreal)(L, THStorage_(get)(THTensor_(storage)(tensor), index));
lua_pushboolean(L, 1);
return 2;
}
@@ -987,11 +1010,16 @@ static int torch_Tensor_(__index__)(lua_State *L)
}
else if((mask = luaT_toudata(L, 2, "torch.ByteTensor")))
{
+#ifndef TH_REAL_IS_HALF
THTensor *vals = THTensor_(new)();
THTensor_(maskedSelect)(vals, tensor, mask);
luaT_pushudata(L, vals, torch_Tensor);
lua_pushboolean(L, 1);
return 2;
+#else
+    THError("ByteTensor based indexing not yet supported with half type");
+ return 0;
+#endif
}
else
{
@@ -1131,6 +1159,7 @@ static void torch_Tensor_(c_readTensorStorageSizeStride)(lua_State *L, int index
THArgCheck(0, index, "expecting number");
}
+#ifndef TH_REAL_IS_HALF
static int torch_Tensor_(apply)(lua_State *L)
{
THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
@@ -1143,7 +1172,7 @@ static int torch_Tensor_(apply)(lua_State *L)
lua_call(L, 1, 1);
if(lua_isnumber(L, 3))
{
- *tensor_data = (real)lua_tonumber(L, 3);
+ *tensor_data = LUA_NUMBER_TO_REAL(lua_tonumber(L, 3));
lua_pop(L, 1);
}
else if(lua_isnil(L, 3))
@@ -1169,7 +1198,7 @@ static int torch_Tensor_(map)(lua_State *L)
lua_call(L, 2, 1);
if(lua_isnumber(L, 4))
{
- *tensor_data = (real)lua_tonumber(L, 4);
+ *tensor_data = LUA_NUMBER_TO_REAL(lua_tonumber(L, 4));
lua_pop(L, 1);
}
else if(lua_isnil(L, 4))
@@ -1197,7 +1226,7 @@ static int torch_Tensor_(map2)(lua_State *L)
lua_call(L, 3, 1);
if(lua_isnumber(L, 5))
{
- *tensor_data = (real)lua_tonumber(L, 5);
+ *tensor_data = LUA_NUMBER_TO_REAL(lua_tonumber(L, 5));
lua_pop(L, 1);
}
else if(lua_isnil(L, 5))
@@ -1208,6 +1237,7 @@ static int torch_Tensor_(map2)(lua_State *L)
lua_settop(L, 1);
return 1;
}
+#endif
static int torch_Tensor_(factory)(lua_State *L)
{
@@ -1286,6 +1316,7 @@ static const struct luaL_Reg torch_Tensor_(_) [] = {
{"narrow", torch_Tensor_(narrow)},
{"sub", torch_Tensor_(sub)},
{"select", torch_Tensor_(select)},
+#ifndef TH_REAL_IS_HALF
{"index", torch_Tensor_(indexSelect)},
{"indexCopy", torch_Tensor_(indexCopy)},
{"indexAdd", torch_Tensor_(indexAdd)},
@@ -1293,6 +1324,7 @@ static const struct luaL_Reg torch_Tensor_(_) [] = {
{"maskedSelect", torch_Tensor_(maskedSelect)},
{"maskedCopy", torch_Tensor_(maskedCopy)},
{"maskedFill", torch_Tensor_(maskedFill)},
+#endif
{"transpose", torch_Tensor_(transpose)},
{"t", torch_Tensor_(t)},
{"unfold", torch_Tensor_(unfold)},
@@ -1302,9 +1334,11 @@ static const struct luaL_Reg torch_Tensor_(_) [] = {
{"isSize", torch_Tensor_(isSize)},
{"nElement", torch_Tensor_(nElement)},
{"copy", torch_Tensor_(copy)},
+#ifndef TH_REAL_IS_HALF
{"apply", torch_Tensor_(apply)},
{"map", torch_Tensor_(map)},
{"map2", torch_Tensor_(map2)},
+#endif
{"read", torch_Tensor_(read)},
{"write", torch_Tensor_(write)},
{"__index__", torch_Tensor_(__index__)},
@@ -1318,7 +1352,10 @@ void torch_Tensor_(init)(lua_State *L)
torch_Tensor_(new), torch_Tensor_(free), torch_Tensor_(factory));
luaT_setfuncs(L, torch_Tensor_(_), 0);
lua_pop(L, 1);
+#ifndef TH_REAL_IS_HALF
THVector_(vectorDispatchInit)();
+#endif
+
}
#endif
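
Throughout generic/Tensor.c the pattern is the same: anything requiring arithmetic on `real` (index and masked operations, `apply`/`map`/`map2`, fill, vector dispatch) is compiled out of the Half instantiation with `#ifndef TH_REAL_IS_HALF`, and paths reachable at runtime raise `THError` instead. A self-contained sketch of the guard, assuming the macro is set by the generator header as in this patch:

```c
/* Sketch: TH_REAL_IS_HALF is defined by THGenerateHalfType.h while the
 * generic file is being instantiated for the half type. */
#include <stdio.h>

#define TH_REAL_IS_HALF /* pretend we are in the Half pass */

static void tensor_fill(void) {
#ifndef TH_REAL_IS_HALF
  puts("fill: supported for this real type");
#else
  puts("fill on torch.HalfTensor not yet supported"); /* THError in TH */
#endif
}

int main(void) {
  tensor_fill();
  return 0;
}
```
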
diff --git a/generic/TensorOperator.c b/generic/TensorOperator.c
index eba6c81..e131c57 100644
--- a/generic/TensorOperator.c
+++ b/generic/TensorOperator.c
@@ -14,7 +14,7 @@ static int torch_TensorOperator_(__add__)(lua_State *L)
{
r = THTensor_(new)();
luaT_pushudata(L, r, torch_Tensor);
-
+
if(!tensor1 && tensor2)
{
THTensor_(resizeAs)(r, tensor2);
@@ -49,7 +49,7 @@ static int torch_TensorOperator_(__sub__)(lua_State *L)
{
r = THTensor_(new)();
luaT_pushudata(L, r, torch_Tensor);
-
+
if(!tensor1 && tensor2)
{
THTensor_(resizeAs)(r, tensor2);
@@ -98,7 +98,7 @@ static int torch_TensorOperator_(__mul__)(lua_State *L)
{
r = THTensor_(new)();
luaT_pushudata(L, r, torch_Tensor);
-
+
if(!tensor1 && tensor2)
{
THTensor_(resizeAs)(r, tensor2);
@@ -115,7 +115,7 @@ static int torch_TensorOperator_(__mul__)(lua_State *L)
{
int dimt = tensor1->nDimension;
int dims = tensor2->nDimension;
-
+
if(dimt == 1 && dims == 1)
lua_pushnumber(L, THTensor_(dot)(tensor1, tensor2)); /* ok, we wasted r, but who cares */
else if(dimt == 2 && dims == 1)
@@ -131,7 +131,7 @@ static int torch_TensorOperator_(__mul__)(lua_State *L)
THTensor_(addmm)(r, 1, r, 1, tensor1, tensor2);
}
else
- luaL_error(L, "multiplication between %dD and %dD tensors not yet supported", tensor1->nDimension, tensor2->nDimension);
+ luaL_error(L, "multiplication between %dD and %dD tensors not yet supported", tensor1->nDimension, tensor2->nDimension);
}
}
return 1;
diff --git a/generic/luaG.h b/generic/luaG.h
index 950eae9..f1ffce2 100644
--- a/generic/luaG.h
+++ b/generic/luaG.h
@@ -6,10 +6,24 @@
#define luaG_(NAME) TH_CONCAT_3(luaG_,Real,NAME)
#endif
-static void luaG_(pushreal)(lua_State *L, accreal n) {
-#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || LUA_VERSION_NUM < 503
- lua_pushnumber(L, (lua_Number)n);
-#elif defined(TH_REAL_IS_BYTE) || defined(TH_REAL_IS_CHAR) || defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG)
+#undef REAL_TO_LUA_NUMBER
+#undef LUA_NUMBER_TO_REAL
+
+#if defined(TH_REAL_IS_HALF)
+# define REAL_TO_LUA_NUMBER(n) (lua_Number)TH_half2float(n)
+# define LUA_NUMBER_TO_REAL(n) TH_float2half((lua_Number)n)
+#else
+# define REAL_TO_LUA_NUMBER(n) (lua_Number)(n)
+# define LUA_NUMBER_TO_REAL(n) (real)n
+#endif
+
+
+
+static void luaG_(pushreal)(lua_State *L, real n) {
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) || LUA_VERSION_NUM < 503
+ lua_pushnumber(L, REAL_TO_LUA_NUMBER(n));
+#elif defined(TH_REAL_IS_BYTE) || defined(TH_REAL_IS_CHAR) || defined(TH_REAL_IS_SHORT) \
+ || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG)
lua_pushinteger(L, (lua_Integer)n);
#else
#error "unhandled real type in luaG_pushreal"
@@ -17,8 +31,8 @@ static void luaG_(pushreal)(lua_State *L, accreal n) {
}
static real luaG_(checkreal)(lua_State *L, int idx) {
-#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
- return (lua_Number)luaL_checknumber(L, idx);
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF)
+ return LUA_NUMBER_TO_REAL(luaL_checknumber(L, idx));
#elif defined(TH_REAL_IS_BYTE) || defined(TH_REAL_IS_CHAR) || defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG)
int type = lua_type(L, idx);
if (type == LUA_TSTRING) {
@@ -38,8 +52,8 @@ static real luaG_(checkreal)(lua_State *L, int idx) {
}
static real luaG_(optreal)(lua_State *L, int idx, real n) {
-#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || LUA_VERSION_NUM < 503
- return (lua_Number)luaL_optnumber(L, idx, (lua_Number)n);
+#if defined(TH_REAL_IS_HALF) || defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || LUA_VERSION_NUM < 503
+ return LUA_NUMBER_TO_REAL(luaL_optnumber(L, idx, REAL_TO_LUA_NUMBER(n)));
#elif defined(TH_REAL_IS_BYTE) || defined(TH_REAL_IS_CHAR) || defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG)
return (lua_Integer)luaL_optinteger(L, idx, (lua_Integer)n);
#else
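
The new `REAL_TO_LUA_NUMBER`/`LUA_NUMBER_TO_REAL` pair centralizes scalar conversion at the Lua boundary: for every type except half they are plain casts, while the half instantiation routes through `TH_half2float`/`TH_float2half`. A stand-alone sketch of the half case (assumes the TH headers from this patch are on the include path and the library is linked):

```c
#include <stdio.h>
#include "THHalf.h"

typedef double lua_Number; /* stand-in for Lua's numeric type */

#define REAL_TO_LUA_NUMBER(n) (lua_Number)TH_half2float(n)
#define LUA_NUMBER_TO_REAL(n) TH_float2half((lua_Number)(n))

int main(void) {
  lua_Number from_script = 3.14159;                 /* value pushed from Lua */
  THHalf stored = LUA_NUMBER_TO_REAL(from_script);  /* script -> storage */
  lua_Number back = REAL_TO_LUA_NUMBER(stored);     /* storage -> script */
  /* prints: 3.14159 -> 0x4248 -> 3.14062 (half keeps ~3 decimal digits) */
  printf("%g -> 0x%04x -> %g\n", from_script, (unsigned)stored.x, back);
  return 0;
}
```
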
diff --git a/init.c b/init.c
index 08eedba..3bdac17 100644
--- a/init.c
+++ b/init.c
@@ -16,6 +16,7 @@ extern void torch_IntStorage_init(lua_State *L);
extern void torch_LongStorage_init(lua_State *L);
extern void torch_FloatStorage_init(lua_State *L);
extern void torch_DoubleStorage_init(lua_State *L);
+extern void torch_HalfStorage_init(lua_State *L);
extern void torch_ByteTensor_init(lua_State *L);
extern void torch_CharTensor_init(lua_State *L);
@@ -24,6 +25,7 @@ extern void torch_IntTensor_init(lua_State *L);
extern void torch_LongTensor_init(lua_State *L);
extern void torch_FloatTensor_init(lua_State *L);
extern void torch_DoubleTensor_init(lua_State *L);
+extern void torch_HalfTensor_init(lua_State *L);
extern void torch_ByteTensorOperator_init(lua_State *L);
extern void torch_CharTensorOperator_init(lua_State *L);
@@ -33,8 +35,10 @@ extern void torch_LongTensorOperator_init(lua_State *L);
extern void torch_FloatTensorOperator_init(lua_State *L);
extern void torch_DoubleTensorOperator_init(lua_State *L);
+
extern void torch_TensorMath_init(lua_State *L);
+
LUA_EXTERNC DLL_EXPORT int luaopen_libtorch(lua_State *L);
int luaopen_libtorch(lua_State *L)
@@ -45,7 +49,6 @@ int luaopen_libtorch(lua_State *L)
lua_setglobal(L, "torch");
torch_utils_init(L);
-
torch_File_init(L);
torch_ByteStorage_init(L);
@@ -55,6 +58,7 @@ int luaopen_libtorch(lua_State *L)
torch_LongStorage_init(L);
torch_FloatStorage_init(L);
torch_DoubleStorage_init(L);
+ torch_HalfStorage_init(L);
torch_ByteTensor_init(L);
torch_CharTensor_init(L);
@@ -63,6 +67,7 @@ int luaopen_libtorch(lua_State *L)
torch_LongTensor_init(L);
torch_FloatTensor_init(L);
torch_DoubleTensor_init(L);
+ torch_HalfTensor_init(L);
torch_ByteTensorOperator_init(L);
torch_CharTensorOperator_init(L);
diff --git a/lib/TH/CMakeLists.txt b/lib/TH/CMakeLists.txt
index e6cf91d..0b5f166 100644
--- a/lib/TH/CMakeLists.txt
+++ b/lib/TH/CMakeLists.txt
@@ -78,6 +78,21 @@ IF (CORTEXA9_FOUND)
SET(CMAKE_C_FLAGS "-mcpu=cortex-a9 ${CMAKE_C_FLAGS}")
ENDIF (CORTEXA9_FOUND)
+INCLUDE(CheckIncludeFile)
+CHECK_INCLUDE_FILE(cpuid.h HAVE_CPUID_H)
+# Check for a cpuid intrinsic
+IF(HAVE_CPUID_H)
+ CHECK_C_SOURCE_COMPILES("#include <cpuid.h>
+ int main()
+ {
+ unsigned int eax, ebx, ecx, edx;
+ return __get_cpuid(0, &eax, &ebx, &ecx, &edx);
+ }" HAVE_GCC_GET_CPUID)
+ENDIF()
+IF(HAVE_GCC_GET_CPUID)
+ SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DHAVE_GCC_GET_CPUID")
+ENDIF(HAVE_GCC_GET_CPUID)
+
FIND_PACKAGE(SSE)
IF(C_SSE2_FOUND)
SET(CMAKE_C_FLAGS "${C_SSE2_FLAGS} -DUSE_SSE2 ${CMAKE_C_FLAGS}")
@@ -122,11 +137,11 @@ IF(C_AVX_FOUND)
ENDIF(C_AVX_FOUND)
SET(hdr
- THGeneral.h THAllocator.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h
- THLapack.h THLogAdd.h THRandom.h THVector.h THAtomic.h)
+ THGeneral.h THHalf.h THAllocator.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h
+ THLapack.h THLogAdd.h THRandom.h THVector.h THAtomic.h )
SET(src
- THGeneral.c THAllocator.c THStorage.c THTensor.c THBlas.c THLapack.c
+ THGeneral.c THHalf.c THAllocator.c THStorage.c THTensor.c THBlas.c THLapack.c
THLogAdd.c THRandom.c THFile.c THDiskFile.c THMemoryFile.c THAtomic.c THVector.c)
SET(src ${src} ${hdr} ${simd})
@@ -135,9 +150,13 @@ if(BUILD_STATIC)
ADD_LIBRARY(TH_static STATIC ${src})
endif()
+IF(NOT TH_SO_VERSION)
+ SET(TH_SO_VERSION 0)
+ENDIF(NOT TH_SO_VERSION)
+MESSAGE(STATUS "TH_SO_VERSION: ${TH_SO_VERSION}")
SET_TARGET_PROPERTIES(TH PROPERTIES
- VERSION 0
- SOVERSION 0)
+ VERSION ${TH_SO_VERSION}
+ SOVERSION ${TH_SO_VERSION})
CHECK_C_SOURCE_RUNS("
#include <stdatomic.h>
@@ -320,6 +339,7 @@ INSTALL(FILES
THFilePrivate.h
${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h
THGenerateAllTypes.h
+ THGenerateHalfType.h
THGenerateFloatTypes.h
THGenerateIntTypes.h
THLapack.h
@@ -333,6 +353,7 @@ INSTALL(FILES
THTensorMacros.h
THVector.h
THAtomic.h
+ THHalf.h
DESTINATION "${TH_INSTALL_INCLUDE_SUBDIR}/TH")
INSTALL(FILES
diff --git a/lib/TH/THDiskFile.c b/lib/TH/THDiskFile.c
index 9d9cbae..01b1951 100644
--- a/lib/TH/THDiskFile.c
+++ b/lib/TH/THDiskFile.c
@@ -348,14 +348,14 @@ READ_WRITE_METHODS(int, Int,
int ret = fscanf(dfself->handle, "%d", &data[i]); if(ret <= 0) break; else nread++,
int ret = fprintf(dfself->handle, "%d", data[i]); if(ret <= 0) break; else nwrite++)
-/*READ_WRITE_METHODS(long, Long,
- int ret = fscanf(dfself->handle, "%ld", &data[i]); if(ret <= 0) break; else nread++,
- int ret = fprintf(dfself->handle, "%ld", data[i]); if(ret <= 0) break; else nwrite++)*/
-
READ_WRITE_METHODS(float, Float,
int ret = fscanf(dfself->handle, "%g", &data[i]); if(ret <= 0) break; else nread++,
int ret = fprintf(dfself->handle, "%.9g", data[i]); if(ret <= 0) break; else nwrite++)
+READ_WRITE_METHODS(THHalf, Half,
+ float buf; int ret = fscanf(dfself->handle, "%g", &buf); if(ret <= 0) break; else { data[i]= TH_float2half(buf); nread++; },
+ int ret = fprintf(dfself->handle, "%.9g", TH_half2float(data[i])); if(ret <= 0) break; else nwrite++)
+
READ_WRITE_METHODS(double, Double,
int ret = fscanf(dfself->handle, "%lg", &data[i]); if(ret <= 0) break; else nread++,
int ret = fprintf(dfself->handle, "%.17g", data[i]); if(ret <= 0) break; else nwrite++)
@@ -618,6 +618,7 @@ THFile *THDiskFile_new(const char *name, const char *mode, int isQuiet)
THDiskFile_readLong,
THDiskFile_readFloat,
THDiskFile_readDouble,
+ THDiskFile_readHalf,
THDiskFile_readString,
THDiskFile_writeByte,
@@ -627,6 +628,7 @@ THFile *THDiskFile_new(const char *name, const char *mode, int isQuiet)
THDiskFile_writeLong,
THDiskFile_writeFloat,
THDiskFile_writeDouble,
+ THDiskFile_writeHalf,
THDiskFile_writeString,
THDiskFile_synchronize,
@@ -730,6 +732,7 @@ THFile *THPipeFile_new(const char *name, const char *mode, int isQuiet)
THDiskFile_readLong,
THDiskFile_readFloat,
THDiskFile_readDouble,
+ THDiskFile_readHalf,
THDiskFile_readString,
THDiskFile_writeByte,
@@ -739,6 +742,7 @@ THFile *THPipeFile_new(const char *name, const char *mode, int isQuiet)
THDiskFile_writeLong,
THDiskFile_writeFloat,
THDiskFile_writeDouble,
+ THDiskFile_writeHalf,
THDiskFile_writeString,
THDiskFile_synchronize,
diff --git a/lib/TH/THFile.c b/lib/TH/THFile.c
index c8913af..3717b7b 100644
--- a/lib/TH/THFile.c
+++ b/lib/TH/THFile.c
@@ -19,6 +19,7 @@ IMPLEMENT_THFILE_RW(Int, int)
IMPLEMENT_THFILE_RW(Long, long)
IMPLEMENT_THFILE_RW(Float, float)
IMPLEMENT_THFILE_RW(Double, double)
+IMPLEMENT_THFILE_RW(Half, THHalf)
size_t THFile_readStringRaw(THFile *self, const char *format, char **str_)
{
@@ -133,6 +134,7 @@ IMPLEMENT_THFILE_SCALAR(Int, int)
IMPLEMENT_THFILE_SCALAR(Long, long)
IMPLEMENT_THFILE_SCALAR(Float, float)
IMPLEMENT_THFILE_SCALAR(Double, double)
+IMPLEMENT_THFILE_SCALAR(Half, THHalf)
#define IMPLEMENT_THFILE_STORAGE(TYPEC, TYPE) \
size_t THFile_read##TYPEC(THFile *self, TH##TYPEC##Storage *storage) \
@@ -152,3 +154,4 @@ IMPLEMENT_THFILE_STORAGE(Int, int)
IMPLEMENT_THFILE_STORAGE(Long, long)
IMPLEMENT_THFILE_STORAGE(Float, float)
IMPLEMENT_THFILE_STORAGE(Double, double)
+IMPLEMENT_THFILE_STORAGE(Half, THHalf)
diff --git a/lib/TH/THFile.h b/lib/TH/THFile.h
index 64dd2da..e097bdf 100644
--- a/lib/TH/THFile.h
+++ b/lib/TH/THFile.h
@@ -74,6 +74,13 @@ TH_API size_t THFile_writeFloatRaw(THFile *self, float *data, size_t n);
TH_API size_t THFile_writeDoubleRaw(THFile *self, double *data, size_t n);
TH_API size_t THFile_writeStringRaw(THFile *self, const char *str, size_t size);
+TH_API THHalf THFile_readHalfScalar(THFile *self);
+TH_API void THFile_writeHalfScalar(THFile *self, THHalf scalar);
+TH_API size_t THFile_readHalf(THFile *self, THHalfStorage *storage);
+TH_API size_t THFile_writeHalf(THFile *self, THHalfStorage *storage);
+TH_API size_t THFile_readHalfRaw(THFile *self, THHalf* data, size_t size);
+TH_API size_t THFile_writeHalfRaw(THFile *self, THHalf* data, size_t size);
+
TH_API void THFile_synchronize(THFile *self);
TH_API void THFile_seek(THFile *self, size_t position);
TH_API void THFile_seekEnd(THFile *self);
diff --git a/lib/TH/THFilePrivate.h b/lib/TH/THFilePrivate.h
index d268041..55169c3 100644
--- a/lib/TH/THFilePrivate.h
+++ b/lib/TH/THFilePrivate.h
@@ -1,3 +1,8 @@
+#include "THGeneral.h"
+
+#include "THHalf.h"
+
+
struct THFile__
{
struct THFileVTable *vtable;
@@ -23,6 +28,7 @@ struct THFileVTable
size_t (*readLong)(THFile *self, long *data, size_t n);
size_t (*readFloat)(THFile *self, float *data, size_t n);
size_t (*readDouble)(THFile *self, double *data, size_t n);
+ size_t (*readHalf)(THFile *self, THHalf *data, size_t n);
size_t (*readString)(THFile *self, const char *format, char **str_);
size_t (*writeByte)(THFile *self, unsigned char *data, size_t n);
@@ -32,6 +38,7 @@ struct THFileVTable
size_t (*writeLong)(THFile *self, long *data, size_t n);
size_t (*writeFloat)(THFile *self, float *data, size_t n);
size_t (*writeDouble)(THFile *self, double *data, size_t n);
+ size_t (*writeHalf)(THFile *self, THHalf *data, size_t n);
size_t (*writeString)(THFile *self, const char *str, size_t size);
void (*synchronize)(THFile *self);
diff --git a/lib/TH/THGenerateAllTypes.h b/lib/TH/THGenerateAllTypes.h
index 539629b..4a77081 100644
--- a/lib/TH/THGenerateAllTypes.h
+++ b/lib/TH/THGenerateAllTypes.h
@@ -8,7 +8,6 @@
#define THInf UCHAR_MAX
#define TH_REAL_IS_BYTE
#line 1 TH_GENERIC_FILE
-/*#line 1 "THByteStorage.h"*/
#include TH_GENERIC_FILE
#undef real
#undef accreal
diff --git a/lib/TH/THGenerateHalfType.h b/lib/TH/THGenerateHalfType.h
new file mode 100644
index 0000000..9acc534
--- /dev/null
+++ b/lib/TH/THGenerateHalfType.h
@@ -0,0 +1,19 @@
+#ifndef TH_GENERIC_FILE
+#error "You must define TH_GENERIC_FILE before including THGenerateHalfType.h"
+#endif
+
+#include "THHalf.h"
+#define real THHalf
+#define accreal float
+#define Real Half
+#define THInf TH_HALF_MAX
+#define TH_REAL_IS_HALF
+#line 1 TH_GENERIC_FILE
+#include TH_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_HALF
+
+#undef TH_GENERIC_FILE
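
The new generator header follows the same contract as THGenerateAllTypes.h: define `real`, `accreal`, `Real`, and `THInf`, re-include whatever file `TH_GENERIC_FILE` names, then undefine everything so the next pass starts clean. A self-contained stand-in for the mechanism, with a macro playing the role of the re-included generic file:

```c
/* In TH the "generic" body lives in its own file and is re-included once
 * per type; here a macro plays that role so the sketch compiles alone. */
#include <stdio.h>

#define GENERIC_SUM(Real, real)                    \
  static real THSum_##Real(const real *x, int n) { \
    real acc = 0;                                  \
    for (int i = 0; i < n; i++) acc += x[i];       \
    return acc;                                    \
  }

GENERIC_SUM(Float, float)   /* the Float pass of THGenerateAllTypes.h */
GENERIC_SUM(Double, double) /* the Double pass */

int main(void) {
  float xf[3] = {1.f, 2.f, 3.f};
  double xd[3] = {1.0, 2.0, 3.0};
  printf("%g %g\n", THSum_Float(xf, 3), THSum_Double(xd, 3)); /* 6 6 */
  return 0;
}
```
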
diff --git a/lib/TH/THHalf.c b/lib/TH/THHalf.c
new file mode 100644
index 0000000..b0b9bc5
--- /dev/null
+++ b/lib/TH/THHalf.c
@@ -0,0 +1,138 @@
+#include "THHalf.h"
+
+/*
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee. Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users. These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+// Host functions for converting between FP32 and FP16 formats
+// Paulius Micikevicius (pauliusm@nvidia.com)
+
+float TH_half2float(THHalf h)
+{
+ unsigned sign = ((h.x >> 15) & 1);
+ unsigned exponent = ((h.x >> 10) & 0x1f);
+ unsigned mantissa = ((h.x & 0x3ff) << 13);
+
+ if (exponent == 0x1f) { /* NaN or Inf */
+ mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
+ exponent = 0xff;
+ } else if (!exponent) { /* Denorm or Zero */
+ if (mantissa) {
+ unsigned int msb;
+ exponent = 0x71;
+ do {
+ msb = (mantissa & 0x400000);
+ mantissa <<= 1; /* normalize */
+ --exponent;
+ } while (!msb);
+ mantissa &= 0x7fffff; /* 1.mantissa is implicit */
+ }
+ } else {
+ exponent += 0x70;
+ }
+
+ int temp = ((sign << 31) | (exponent << 23) | mantissa);
+
+ return *((float*)((void*)&temp));
+}
+
+THHalf TH_float2half(float f)
+{
+ THHalf ret;
+
+ unsigned x = *((int*)(void*)(&f));
+ unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
+ unsigned sign, exponent, mantissa;
+
+ // Get rid of +NaN/-NaN case first.
+ if (u > 0x7f800000) {
+ ret.x = 0x7fffU;
+ return ret;
+ }
+
+ sign = ((x >> 16) & 0x8000);
+
+ // Get rid of +Inf/-Inf, +0/-0.
+ if (u > 0x477fefff) {
+ ret.x = sign | 0x7c00U;
+ return ret;
+ }
+ if (u < 0x33000001) {
+ ret.x = (sign | 0x0000);
+ return ret;
+ }
+
+ exponent = ((u >> 23) & 0xff);
+ mantissa = (u & 0x7fffff);
+
+ if (exponent > 0x70) {
+ shift = 13;
+ exponent -= 0x70;
+ } else {
+ shift = 0x7e - exponent;
+ exponent = 0;
+ mantissa |= 0x800000;
+ }
+ lsb = (1 << shift);
+ lsb_s1 = (lsb >> 1);
+ lsb_m1 = (lsb - 1);
+
+ // Round to nearest even.
+ remainder = (mantissa & lsb_m1);
+ mantissa >>= shift;
+ if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
+ ++mantissa;
+ if (!(mantissa & 0x3ff)) {
+ ++exponent;
+ mantissa = 0;
+ }
+ }
+
+ ret.x = (sign | (exponent << 10) | mantissa);
+ return ret;
+}
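
The two converters implement IEEE 754 binary16 semantics in portable C: `TH_half2float` widens exactly (renormalizing subnormals), while `TH_float2half` rounds to nearest even, maps NaN to 0x7fff, and saturates values beyond the half range. A small round-trip harness, assuming TH is built and linked:

```c
#include <stdio.h>
#include "THHalf.h"

int main(void) {
  /* 65504 is the largest finite half; 1e-8 underflows to signed zero */
  float vals[4] = {1.0f, 0.1f, 65504.0f, 1e-8f};
  for (int i = 0; i < 4; i++) {
    THHalf h = TH_float2half(vals[i]);
    printf("%g -> 0x%04x -> %g\n", vals[i], (unsigned)h.x, TH_half2float(h));
  }
  /* prints:
     1 -> 0x3c00 -> 1
     0.1 -> 0x2e66 -> 0.0999756
     65504 -> 0x7bff -> 65504
     1e-08 -> 0x0000 -> 0 */
  return 0;
}
```
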
diff --git a/lib/TH/THHalf.h b/lib/TH/THHalf.h
new file mode 100644
index 0000000..0549d21
--- /dev/null
+++ b/lib/TH/THHalf.h
@@ -0,0 +1,40 @@
+#ifndef TH_HALF_H
+#define TH_HALF_H
+
+#include "THGeneral.h"
+#include <stdint.h>
+
+/* Neither built in nor included from Cutorch; use our definition lifted from CUDA */
+#if defined(__GNUC__)
+#define __thalign__(n) __attribute__((aligned(n)))
+#elif defined(_WIN32)
+#define __thalign__(n) __declspec(align(n))
+#else
+#define __thalign__(n)
+#endif
+
+typedef struct __thalign__(2){
+ unsigned short x;
+} __THHalf;
+
+typedef struct __thalign__(4) {
+ unsigned int x;
+} __THHalf2;
+
+typedef __THHalf THHalf;
+typedef __THHalf2 THHalf2;
+
+/* numeric limits */
+
+
+TH_API THHalf TH_float2half(float a);
+TH_API float TH_half2float(THHalf a);
+
+#ifndef TH_HALF_BITS_TO_LITERAL
+# define TH_HALF_BITS_TO_LITERAL(n) { n }
+#endif
+
+#define TH_HALF_MAX TH_HALF_BITS_TO_LITERAL(0x7BFF)
+
+#undef __thalign__
+#endif
diff --git a/lib/TH/THMemoryFile.c b/lib/TH/THMemoryFile.c
index 8d97621..ecce6e1 100644
--- a/lib/TH/THMemoryFile.c
+++ b/lib/TH/THMemoryFile.c
@@ -332,16 +332,18 @@ READ_WRITE_METHODS(int, Int,
nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%d", data[i]),
1)
-/*READ_WRITE_METHODS(long, Long,
- int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%ld%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++,
- nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%ld", data[i]),
- 1)*/
-
READ_WRITE_METHODS(float, Float,
int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%g%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++,
nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%.9g", data[i]),
1)
+READ_WRITE_METHODS(THHalf, Half,
+ int nByteRead_; float buf; \
+ int ret = sscanf(mfself->storage->data+mfself->position, "%g%n", &buf, &nByteRead_); \
+ data[i] = TH_float2half(buf); nByteRead = nByteRead_; if(ret <= 0) break; else nread++,
+ nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%.9g", TH_half2float(data[i])),
+ 1)
+
READ_WRITE_METHODS(double, Double,
int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%lg%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++,
nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%.17g", data[i]),
@@ -621,6 +623,7 @@ THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode)
THMemoryFile_readLong,
THMemoryFile_readFloat,
THMemoryFile_readDouble,
+ THMemoryFile_readHalf,
THMemoryFile_readString,
THMemoryFile_writeByte,
@@ -630,6 +633,7 @@ THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode)
THMemoryFile_writeLong,
THMemoryFile_writeFloat,
THMemoryFile_writeDouble,
+ THMemoryFile_writeHalf,
THMemoryFile_writeString,
THMemoryFile_synchronize,
diff --git a/lib/TH/THStorage.c b/lib/TH/THStorage.c
index d18488e..bb63a43 100644
--- a/lib/TH/THStorage.c
+++ b/lib/TH/THStorage.c
@@ -4,5 +4,11 @@
#include "generic/THStorage.c"
#include "THGenerateAllTypes.h"
+#include "generic/THStorage.c"
+#include "THGenerateHalfType.h"
+
#include "generic/THStorageCopy.c"
#include "THGenerateAllTypes.h"
+
+#include "generic/THStorageCopy.c"
+#include "THGenerateHalfType.h"
diff --git a/lib/TH/THStorage.h b/lib/TH/THStorage.h
index 36ed507..9565e10 100644
--- a/lib/TH/THStorage.h
+++ b/lib/TH/THStorage.h
@@ -14,7 +14,13 @@
#include "generic/THStorage.h"
#include "THGenerateAllTypes.h"
+#include "generic/THStorage.h"
+#include "THGenerateHalfType.h"
+
#include "generic/THStorageCopy.h"
#include "THGenerateAllTypes.h"
+#include "generic/THStorageCopy.h"
+#include "THGenerateHalfType.h"
+
#endif
diff --git a/lib/TH/THTensor.c b/lib/TH/THTensor.c
index 2878fc9..37071df 100644
--- a/lib/TH/THTensor.c
+++ b/lib/TH/THTensor.c
@@ -11,9 +11,15 @@
#include "generic/THTensor.c"
#include "THGenerateAllTypes.h"
+#include "generic/THTensor.c"
+#include "THGenerateHalfType.h"
+
#include "generic/THTensorCopy.c"
#include "THGenerateAllTypes.h"
+#include "generic/THTensorCopy.c"
+#include "THGenerateHalfType.h"
+
#include "generic/THTensorRandom.c"
#include "THGenerateAllTypes.h"
diff --git a/lib/TH/THTensor.h b/lib/TH/THTensor.h
index 6eddf9c..a155efd 100644
--- a/lib/TH/THTensor.h
+++ b/lib/TH/THTensor.h
@@ -16,9 +16,15 @@ typedef struct {
#include "generic/THTensor.h"
#include "THGenerateAllTypes.h"
+#include "generic/THTensor.h"
+#include "THGenerateHalfType.h"
+
#include "generic/THTensorCopy.h"
#include "THGenerateAllTypes.h"
+#include "generic/THTensorCopy.h"
+#include "THGenerateHalfType.h"
+
#include "THTensorMacros.h"
/* random numbers */
diff --git a/lib/TH/THVector.c b/lib/TH/THVector.c
index 6179d89..907adbb 100644
--- a/lib/TH/THVector.c
+++ b/lib/TH/THVector.c
@@ -1,10 +1,15 @@
#include "THVector.h"
+
#include "generic/simd/simd.h"
#ifdef __NEON__
#include "vector/NEON.c"
#endif
+#ifdef __PPC64__
+#include "vector/VSX.c"
+#endif
+
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#include "vector/SSE.c"
diff --git a/lib/TH/generic/THBlas.c b/lib/TH/generic/THBlas.c
index 6452f94..371df4d 100644
--- a/lib/TH/generic/THBlas.c
+++ b/lib/TH/generic/THBlas.c
@@ -2,6 +2,7 @@
#define TH_GENERIC_FILE "generic/THBlas.c"
#else
+
#ifdef BLAS_F2C
# define ffloat double
#else
@@ -24,8 +25,8 @@ TH_EXTERNC void dger_(int *m, int *n, double *alpha, double *x, int *incx, doubl
TH_EXTERNC void sger_(int *m, int *n, float *alpha, float *x, int *incx, float *y, int *incy, float *a, int *lda);
TH_EXTERNC void dgemm_(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc);
TH_EXTERNC void sgemm_(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *a, int *lda, float *b, int *ldb, float *beta, float *c, int *ldc);
-
-
+
+
void THBlas_(swap)(long n, real *x, long incx, real *y, long incy)
{
@@ -182,9 +183,9 @@ void THBlas_(gemv)(char trans, long m, long n, real alpha, real *a, long lda, re
{
if(n == 1)
lda = m;
-
+
#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
- if( (m <= INT_MAX) && (n <= INT_MAX) &&
+ if( (m <= INT_MAX) && (n <= INT_MAX) &&
(lda > 0) && (lda <= INT_MAX) &&
(incx > 0) && (incx <= INT_MAX) &&
(incy > 0) && (incy <= INT_MAX) )
@@ -224,7 +225,7 @@ void THBlas_(gemv)(char trans, long m, long n, real alpha, real *a, long lda, re
{
if(beta != 1)
THBlas_(scal)(m, beta, y, incy);
-
+
for(j = 0; j < n; j++)
{
real *column_ = a+lda*j;
diff --git a/lib/TH/generic/THStorageCopy.c b/lib/TH/generic/THStorageCopy.c
index 583e088..ce4b57e 100644
--- a/lib/TH/generic/THStorageCopy.c
+++ b/lib/TH/generic/THStorageCopy.c
@@ -15,16 +15,42 @@ void THStorage_(copy)(THStorage *storage, THStorage *src)
THStorage_(rawCopy)(storage, src->data);
}
-
#define IMPLEMENT_THStorage_COPY(TYPENAMESRC) \
void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \
{ \
- ptrdiff_t i; \
+ ptrdiff_t i; \
+ for(i = 0; i < storage->size; i++) \
+ storage->data[i] = (real)src->data[i]; \
+}
+
+#define IMPLEMENT_THStorage_COPY_FROM_HALF(TYPENAMESRC) \
+void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \
+{ \
+ THArgCheck(storage->size == src->size, 2, "size mismatch"); \
+ ptrdiff_t i; \
+ for(i = 0; i < storage->size; i++) \
+ storage->data[i] = (real)TH_half2float(src->data[i]); \
+}
+
+#define IMPLEMENT_THStorage_COPY_TO_HALF(TYPENAMESRC) \
+void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \
+{ \
THArgCheck(storage->size == src->size, 2, "size mismatch"); \
- for(i = 0; i < storage->size; i++) \
- storage->data[i] = (real)src->data[i]; \
+ ptrdiff_t i; \
+ for(i = 0; i < storage->size; i++) \
+ storage->data[i] = TH_float2half((float)(src->data[i])); \
}
+#define IMPLEMENT_THStorage_COPY_TO_FROM_HALF(TYPENAMESRC) \
+void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \
+{ \
+ THArgCheck(storage->size == src->size, 2, "size mismatch"); \
+ ptrdiff_t i; \
+ for(i = 0; i < storage->size; i++) \
+ storage->data[i] = src->data[i]; \
+}
+
+#ifndef TH_REAL_IS_HALF
IMPLEMENT_THStorage_COPY(Byte)
IMPLEMENT_THStorage_COPY(Char)
IMPLEMENT_THStorage_COPY(Short)
@@ -32,5 +58,18 @@ IMPLEMENT_THStorage_COPY(Int)
IMPLEMENT_THStorage_COPY(Long)
IMPLEMENT_THStorage_COPY(Float)
IMPLEMENT_THStorage_COPY(Double)
+IMPLEMENT_THStorage_COPY_FROM_HALF(Half)
+#else
+/* only allow pass-through for Half */
+IMPLEMENT_THStorage_COPY_TO_FROM_HALF(Half)
+IMPLEMENT_THStorage_COPY_TO_HALF(Byte)
+IMPLEMENT_THStorage_COPY_TO_HALF(Char)
+IMPLEMENT_THStorage_COPY_TO_HALF(Short)
+IMPLEMENT_THStorage_COPY_TO_HALF(Int)
+IMPLEMENT_THStorage_COPY_TO_HALF(Long)
+IMPLEMENT_THStorage_COPY_TO_HALF(Float)
+IMPLEMENT_THStorage_COPY_TO_HALF(Double)
+#endif
+
#endif
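
Under `TH_REAL_IS_HALF`, every source type is funneled through `TH_float2half` and only half-to-half copies move bits unchanged, so values outside the binary16 range saturate on the way in. A sketch of the per-element semantics the macros expand to (same linking assumption as above):

```c
#include <stdio.h>
#include "THHalf.h"

int main(void) {
  /* what IMPLEMENT_THStorage_COPY_TO_HALF(Float) does per element */
  float src[3] = {0.5f, 1.5f, 100000.0f};
  THHalf dst[3];
  for (int i = 0; i < 3; i++)
    dst[i] = TH_float2half(src[i]);
  /* reading back goes through TH_half2float; 1e5 > 65504 saturates */
  for (int i = 0; i < 3; i++)
    printf("%g ", TH_half2float(dst[i])); /* 0.5 1.5 inf */
  printf("\n");
  return 0;
}
```
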
diff --git a/lib/TH/generic/THStorageCopy.h b/lib/TH/generic/THStorageCopy.h
index f853a82..ce8a2a6 100644
--- a/lib/TH/generic/THStorageCopy.h
+++ b/lib/TH/generic/THStorageCopy.h
@@ -13,5 +13,6 @@ TH_API void THStorage_(copyInt)(THStorage *storage, struct THIntStorage *src);
TH_API void THStorage_(copyLong)(THStorage *storage, struct THLongStorage *src);
TH_API void THStorage_(copyFloat)(THStorage *storage, struct THFloatStorage *src);
TH_API void THStorage_(copyDouble)(THStorage *storage, struct THDoubleStorage *src);
+TH_API void THStorage_(copyHalf)(THStorage *storage, struct THHalfStorage *src);
#endif
diff --git a/lib/TH/generic/THTensorConv.c b/lib/TH/generic/THTensorConv.c
index d98a2aa..1e21991 100644
--- a/lib/TH/generic/THTensorConv.c
+++ b/lib/TH/generic/THTensorConv.c
@@ -2,7 +2,6 @@
#define TH_GENERIC_FILE "generic/THTensorConv.c"
#else
-
/*
2D Input, 2D kernel : convolve given image with the given kernel.
*/
@@ -775,7 +774,7 @@ void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_,
real *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows;
/* get input */
real *ptr_input = input_data + p*istride0 + i*istride1;
-
+
/* do image, kernel convolution */
THTensor_(validXCorr2DRevptr)(ptr_output,
alpha,
@@ -1174,7 +1173,7 @@ void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe
real *ptr_weight = weight_data + k*kstride0 + i*kstride1;
/* get input */
real *ptr_input = input_data + p*nInputPlane*nInputRows*nInputCols + i*nInputRows*nInputCols;
-
+
/* do image, kernel convolution */
if (*vf == 'F')
if (*xc == 'X')
@@ -1955,5 +1954,4 @@ void THTensor_(conv3Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THT
THTensor_(free)(input);
THTensor_(free)(kernel);
}
-
#endif
diff --git a/lib/TH/generic/THTensorConv.h b/lib/TH/generic/THTensorConv.h
index d215fcd..79866f3 100644
--- a/lib/TH/generic/THTensorConv.h
+++ b/lib/TH/generic/THTensorConv.h
@@ -2,7 +2,6 @@
#define TH_GENERIC_FILE "generic/THTensorConv.h"
#else
-
TH_API void THTensor_(validXCorr2Dptr)(real *r_,
real alpha,
real *t_, long ir, long ic,
@@ -66,7 +65,7 @@ TH_API void THTensor_(fullConv3Dptr)(real *r_,
long st, long sr, long sc);
TH_API void THTensor_(validXCorr3DRevptr)(real *r_,
- real alpha,
+ real alpha,
real *t_, long it, long ir, long ic,
real *k_, long kt, long kr, long kc,
long st, long sr, long sc);
diff --git a/lib/TH/generic/THTensorCopy.c b/lib/TH/generic/THTensorCopy.c
index ea6d6f1..d7228f6 100644
--- a/lib/TH/generic/THTensorCopy.c
+++ b/lib/TH/generic/THTensorCopy.c
@@ -13,6 +13,19 @@ void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src
TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, *tensor_data = (real)(*src_data);) \
}
+#define IMPLEMENT_THTensor_COPY_TO_HALF(TYPENAMESRC, TYPE_SRC) \
+void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \
+{ \
+ TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, *tensor_data = TH_float2half((float)*src_data);) \
+}
+
+#define IMPLEMENT_THTensor_COPY_FROM_HALF(TYPENAMESRC, TYPE_SRC) \
+void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \
+{ \
+ TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, *tensor_data = (real)TH_half2float(*src_data);) \
+}
+
+#ifndef TH_REAL_IS_HALF
IMPLEMENT_THTensor_COPY(Byte, unsigned char)
IMPLEMENT_THTensor_COPY(Char, char)
IMPLEMENT_THTensor_COPY(Short, short)
@@ -20,5 +33,18 @@ IMPLEMENT_THTensor_COPY(Int, int)
IMPLEMENT_THTensor_COPY(Long, long)
IMPLEMENT_THTensor_COPY(Float, float)
IMPLEMENT_THTensor_COPY(Double, double)
+IMPLEMENT_THTensor_COPY_FROM_HALF(Half, THHalf)
+#else
+/* only allow pass-through for Half */
+IMPLEMENT_THTensor_COPY(Half, THHalf)
+IMPLEMENT_THTensor_COPY_TO_HALF(Byte, unsigned char)
+IMPLEMENT_THTensor_COPY_TO_HALF(Char, char)
+IMPLEMENT_THTensor_COPY_TO_HALF(Short, short)
+IMPLEMENT_THTensor_COPY_TO_HALF(Int, int)
+IMPLEMENT_THTensor_COPY_TO_HALF(Long, long)
+IMPLEMENT_THTensor_COPY_TO_HALF(Float, float)
+IMPLEMENT_THTensor_COPY_TO_HALF(Double, double)
+
+#endif /* TH_REAL_IS_HALF */
#endif
diff --git a/lib/TH/generic/THTensorCopy.h b/lib/TH/generic/THTensorCopy.h
index 8d03b22..b9e5bfc 100644
--- a/lib/TH/generic/THTensorCopy.h
+++ b/lib/TH/generic/THTensorCopy.h
@@ -12,5 +12,6 @@ TH_API void THTensor_(copyInt)(THTensor *tensor, struct THIntTensor *src);
TH_API void THTensor_(copyLong)(THTensor *tensor, struct THLongTensor *src);
TH_API void THTensor_(copyFloat)(THTensor *tensor, struct THFloatTensor *src);
TH_API void THTensor_(copyDouble)(THTensor *tensor, struct THDoubleTensor *src);
+TH_API void THTensor_(copyHalf)(THTensor *tensor, struct THHalfTensor *src);
#endif
diff --git a/lib/TH/generic/THTensorMath.c b/lib/TH/generic/THTensorMath.c
index 9fc1577..78ee056 100644
--- a/lib/TH/generic/THTensorMath.c
+++ b/lib/TH/generic/THTensorMath.c
@@ -98,10 +98,15 @@ void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor)
long i = 0;
long dim;
long div = 1;
+#ifdef TH_REAL_IS_HALF
+#define IS_NONZERO(val) (TH_half2float(val)!=0)
+#else
+#define IS_NONZERO(val) ((val)!=0)
+#endif
/* First Pass to determine size of subscripts */
TH_TENSOR_APPLY(real, tensor,
- if (*tensor_data != 0) {
+ if IS_NONZERO(*tensor_data) {
++numel;
});
#ifdef DEBUG
@@ -112,7 +117,7 @@ void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor)
/* Second pass populates subscripts */
subscript_data = THLongTensor_data(subscript);
TH_TENSOR_APPLY(real, tensor,
- if (*tensor_data != 0) {
+ if IS_NONZERO(*tensor_data) {
div = 1;
for (dim = tensor->nDimension - 1; dim >= 0; dim--) {
@@ -396,6 +401,7 @@ accreal THTensor_(dot)(THTensor *tensor, THTensor *src)
return sum;
}
+
#undef th_isnan
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
#define th_isnan(val) \
@@ -2596,5 +2602,45 @@ void THTensor_(histc)(THTensor *hist, THTensor *tensor, long nbins, real minvalu
);
}
+void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue)
+{
+   THArgCheck(THTensor_(nDimension)(tensor) < 3, 2, "invalid number of dimensions %d, the input must be a 2d tensor", THTensor_(nDimension)(tensor));
+
+ int dimension = 1;
+ THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(tensor), 2, "invalid dimension %d",
+ dimension + TH_INDEX_BASE);
+
+ real minval;
+ real maxval;
+ real *h_data;
+
+ THTensor_(resize2d)(hist, tensor->size[0], nbins);
+ THTensor_(zero)(hist);
+
+ minval = minvalue;
+ maxval = maxvalue;
+ if (minval == maxval)
+ {
+ minval = THTensor_(minall)(tensor);
+ maxval = THTensor_(maxall)(tensor);
+ }
+ if (minval == maxval)
+ {
+ minval = minval - 1;
+ maxval = maxval + 1;
+ }
+
+ TH_TENSOR_DIM_APPLY2(real, tensor, real, hist, dimension, long i;
+ for(i = 0; i < tensor_size; i++)
+ {
+ if(tensor_data[i*tensor_stride] >= minval && tensor_data[i*tensor_stride] <= maxval) {
+ const int bin = (int)((tensor_data[i*tensor_stride]-minval) / (maxval-minval) * nbins);
+ hist_data[THMin(bin, nbins-1)] += 1;
+ }
+ }
+ );
+}
+
#endif /* floating point only part */
+#undef IS_NONZERO
#endif
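
`bhistc` applies `histc`'s binning rule independently to each row: a value `v` falls into bin `(int)((v - min)/(max - min) * nbins)`, and the `THMin` clamp keeps `v == max` inside the top bin. A stand-alone check of that arithmetic against the first row of the doc/maths.md example:

```c
#include <stdio.h>

int main(void) {
  double row[6] = {2, 4, 2, 2, 5, 4}; /* x[1] from the maths.md example */
  long nbins = 5;
  double minval = 1.0, maxval = 5.0;
  long hist[5] = {0};
  for (int i = 0; i < 6; i++) {
    if (row[i] >= minval && row[i] <= maxval) {
      int bin = (int)((row[i] - minval) / (maxval - minval) * nbins);
      hist[bin < nbins - 1 ? bin : nbins - 1] += 1; /* the THMin clamp */
    }
  }
  for (int b = 0; b < nbins; b++)
    printf("%ld ", hist[b]); /* 0 3 0 2 1, matching torch.bhistc(x, 5, 1, 5) */
  printf("\n");
  return 0;
}
```
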
diff --git a/lib/TH/generic/THTensorMath.h b/lib/TH/generic/THTensorMath.h
index 87f1616..c656dfd 100644
--- a/lib/TH/generic/THTensorMath.h
+++ b/lib/TH/generic/THTensorMath.h
@@ -2,8 +2,6 @@
#define TH_GENERIC_FILE "generic/THTensorMath.h"
#else
-
-
TH_API void THTensor_(fill)(THTensor *r_, real value);
TH_API void THTensor_(zero)(THTensor *r_);
@@ -163,6 +161,7 @@ TH_API void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension
TH_API void THTensor_(renorm)(THTensor *r_, THTensor *t, real value, int dimension, real maxnorm);
TH_API accreal THTensor_(dist)(THTensor *a, THTensor *b, real value);
TH_API void THTensor_(histc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue);
+TH_API void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue);
TH_API accreal THTensor_(meanall)(THTensor *self);
TH_API accreal THTensor_(varall)(THTensor *self);
@@ -173,7 +172,6 @@ TH_API void THTensor_(linspace)(THTensor *r_, real a, real b, long n);
TH_API void THTensor_(logspace)(THTensor *r_, real a, real b, long n);
TH_API void THTensor_(rand)(THTensor *r_, THGenerator *_generator, THLongStorage *size);
TH_API void THTensor_(randn)(THTensor *r_, THGenerator *_generator, THLongStorage *size);
-
#endif
#if defined(TH_REAL_IS_BYTE)
diff --git a/lib/TH/generic/THVectorDispatch.c b/lib/TH/generic/THVectorDispatch.c
index 6fd1d68..2436a12 100644
--- a/lib/TH/generic/THVectorDispatch.c
+++ b/lib/TH/generic/THVectorDispatch.c
@@ -20,6 +20,12 @@ static FunctionDescription THVector_(fill_DISPATCHTABLE)[] = {
#endif
#endif
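+  /* On PPC64, prefer the hand-written VSX kernels for float and double. */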
+ #if defined(__PPC64__)
+ #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(fill_VSX), SIMDExtension_VSX),
+ #endif
+ #endif
+
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
@@ -32,7 +38,6 @@ void THVector_(fill)(real *x, const real c, const ptrdiff_t n) {
THVector_(fill_DISPATCHPTR)(x, c, n);
}
-
static void (*THVector_(add_DISPATCHPTR))(real *, const real *, const real, const ptrdiff_t) = &THVector_(add_DEFAULT);
static FunctionDescription THVector_(add_DISPATCHTABLE)[] = {
#if defined(__NEON__)
@@ -41,6 +46,12 @@ static FunctionDescription THVector_(add_DISPATCHTABLE)[] = {
#endif
#endif
+ #if defined(__PPC64__)
+ #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(add_VSX), SIMDExtension_VSX),
+ #endif
+ #endif
+
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
@@ -63,6 +74,12 @@ static FunctionDescription THVector_(diff_DISPATCHTABLE)[] = {
#endif
#endif
+ #if defined(__PPC64__)
+ #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(diff_VSX), SIMDExtension_VSX),
+ #endif
+ #endif
+
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
@@ -85,6 +102,12 @@ static FunctionDescription THVector_(scale_DISPATCHTABLE)[] = {
#endif
#endif
+ #if defined(__PPC64__)
+ #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(scale_VSX), SIMDExtension_VSX),
+ #endif
+ #endif
+
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
@@ -107,6 +130,12 @@ static FunctionDescription THVector_(mul_DISPATCHTABLE)[] = {
#endif
#endif
+ #if defined(__PPC64__)
+ #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+ FUNCTION_IMPL(THVector_(mul_VSX), SIMDExtension_VSX),
+ #endif
+ #endif
+
#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
|| defined(USE_SSE4_1) || defined(USE_SSE4_2)
#if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
diff --git a/lib/TH/generic/simd/simd.h b/lib/TH/generic/simd/simd.h
index b059258..ee53e2c 100644
--- a/lib/TH/generic/simd/simd.h
+++ b/lib/TH/generic/simd/simd.h
@@ -2,8 +2,10 @@
#define TH_SIMD_INC
#include <stdint.h>
-#ifdef _MSC_VER
+#if defined(_MSC_VER)
#include <intrin.h>
+#elif defined(HAVE_GCC_GET_CPUID)
+#include <cpuid.h>
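+/* HAVE_GCC_GET_CPUID is presumably set by the build configuration when
+   cpuid.h provides __get_cpuid(); see the accompanying CMakeLists.txt change. */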
#endif
// Can be found on Intel ISA Reference for CPUID
@@ -40,6 +42,8 @@ enum SIMDExtensions
{
#if defined(__NEON__)
SIMDExtension_NEON = 0x1,
+#elif defined(__PPC64__)
+ SIMDExtension_VSX = 0x1,
#else
SIMDExtension_AVX2 = 0x1,
SIMDExtension_AVX = 0x2,
@@ -48,26 +52,57 @@ enum SIMDExtensions
SIMDExtension_DEFAULT = 0x0
};
-#if defined(__NEON__)
+
+#if defined(__arm__)
+
+ #if defined(__NEON__)
static inline uint32_t detectHostSIMDExtensions()
{
return SIMDExtension_NEON;
}
-#else // x86
+ #else //ARM without NEON
+
+static inline uint32_t detectHostSIMDExtensions()
+{
+ return SIMDExtension_DEFAULT;
+}
+
+ #endif
+
+#elif defined(__PPC64__)
+
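+/* __VSX__ is defined by the compiler when VSX code generation is enabled
+   (e.g. -mvsx), so detection on PPC64 is a compile-time decision. */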
+ #if defined(__VSX__)
+
+static inline uint32_t detectHostSIMDExtensions()
+{
+ return SIMDExtension_VSX;
+}
-#if defined(__arm__) //ARM without NEON
+ #else
static inline uint32_t detectHostSIMDExtensions()
{
return SIMDExtension_DEFAULT;
}
+
+ #endif
-#else
+#else // x86
static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
-#ifndef _MSC_VER
+#if defined(_MSC_VER)
+ uint32_t cpuInfo[4];
+ __cpuid(cpuInfo, *eax);
+ *eax = cpuInfo[0];
+ *ebx = cpuInfo[1];
+ *ecx = cpuInfo[2];
+ *edx = cpuInfo[3];
+#elif defined(HAVE_GCC_GET_CPUID)
+ uint32_t level = *eax;
+ __get_cpuid (level, eax, ebx, ecx, edx);
+#else
uint32_t a = *eax, b, c, d;
asm volatile ( "cpuid\n\t"
: "+a"(a), "=b"(b), "=c"(c), "=d"(d) );
@@ -75,13 +110,6 @@ static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *
*ebx = b;
*ecx = c;
*edx = d;
-#else
- uint32_t cpuInfo[4];
- __cpuid(cpuInfo, *eax);
- *eax = cpuInfo[0];
- *ebx = cpuInfo[1];
- *ecx = cpuInfo[2];
- *edx = cpuInfo[3];
#endif
}
@@ -106,7 +134,6 @@ static inline uint32_t detectHostSIMDExtensions()
return hostSimdExts;
}
-#endif // end x86 SIMD extension detection code
-#endif // end __arm__
+#endif // end SIMD extension detection code
#endif
diff --git a/lib/TH/vector/VSX.c b/lib/TH/vector/VSX.c
new file mode 100644
index 0000000..14f14a7
--- /dev/null
+++ b/lib/TH/vector/VSX.c
@@ -0,0 +1,1915 @@
+#ifdef __PPC64__
+
+#include <altivec.h>
+#include <stddef.h>
+
+
+//--------------------------------------------------------------------------------------------------
+// THDoubleVector_fill_VSX was tested on Power8:
+//
+// Unrolling 128 elements is 20% faster than unrolling 64 elements.
+// Unrolling 64 elements is faster than unrolling any lesser number of elements.
+//--------------------------------------------------------------------------------------------------
+static void THDoubleVector_fill_VSX(double *x, const double c, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+
+ double val[2] = {c, c};
+ vector double fp64vec2 = vec_xl(0, val);
+
+ for (i = 0; i <= n-128; i += 128)
+ {
+ vec_xst(fp64vec2, 0, x+(i ));
+ vec_xst(fp64vec2, 0, x+(i+2 ));
+ vec_xst(fp64vec2, 0, x+(i+4 ));
+ vec_xst(fp64vec2, 0, x+(i+6 ));
+ vec_xst(fp64vec2, 0, x+(i+8 ));
+ vec_xst(fp64vec2, 0, x+(i+10 ));
+ vec_xst(fp64vec2, 0, x+(i+12 ));
+ vec_xst(fp64vec2, 0, x+(i+14 ));
+ vec_xst(fp64vec2, 0, x+(i+16 ));
+ vec_xst(fp64vec2, 0, x+(i+18 ));
+ vec_xst(fp64vec2, 0, x+(i+20 ));
+ vec_xst(fp64vec2, 0, x+(i+22 ));
+ vec_xst(fp64vec2, 0, x+(i+24 ));
+ vec_xst(fp64vec2, 0, x+(i+26 ));
+ vec_xst(fp64vec2, 0, x+(i+28 ));
+ vec_xst(fp64vec2, 0, x+(i+30 ));
+ vec_xst(fp64vec2, 0, x+(i+32 ));
+ vec_xst(fp64vec2, 0, x+(i+34 ));
+ vec_xst(fp64vec2, 0, x+(i+36 ));
+ vec_xst(fp64vec2, 0, x+(i+38 ));
+ vec_xst(fp64vec2, 0, x+(i+40 ));
+ vec_xst(fp64vec2, 0, x+(i+42 ));
+ vec_xst(fp64vec2, 0, x+(i+44 ));
+ vec_xst(fp64vec2, 0, x+(i+46 ));
+ vec_xst(fp64vec2, 0, x+(i+48 ));
+ vec_xst(fp64vec2, 0, x+(i+50 ));
+ vec_xst(fp64vec2, 0, x+(i+52 ));
+ vec_xst(fp64vec2, 0, x+(i+54 ));
+ vec_xst(fp64vec2, 0, x+(i+56 ));
+ vec_xst(fp64vec2, 0, x+(i+58 ));
+ vec_xst(fp64vec2, 0, x+(i+60 ));
+ vec_xst(fp64vec2, 0, x+(i+62 ));
+ vec_xst(fp64vec2, 0, x+(i+64 ));
+ vec_xst(fp64vec2, 0, x+(i+66 ));
+ vec_xst(fp64vec2, 0, x+(i+68 ));
+ vec_xst(fp64vec2, 0, x+(i+70 ));
+ vec_xst(fp64vec2, 0, x+(i+72 ));
+ vec_xst(fp64vec2, 0, x+(i+74 ));
+ vec_xst(fp64vec2, 0, x+(i+76 ));
+ vec_xst(fp64vec2, 0, x+(i+78 ));
+ vec_xst(fp64vec2, 0, x+(i+80 ));
+ vec_xst(fp64vec2, 0, x+(i+82 ));
+ vec_xst(fp64vec2, 0, x+(i+84 ));
+ vec_xst(fp64vec2, 0, x+(i+86 ));
+ vec_xst(fp64vec2, 0, x+(i+88 ));
+ vec_xst(fp64vec2, 0, x+(i+90 ));
+ vec_xst(fp64vec2, 0, x+(i+92 ));
+ vec_xst(fp64vec2, 0, x+(i+94 ));
+ vec_xst(fp64vec2, 0, x+(i+96 ));
+ vec_xst(fp64vec2, 0, x+(i+98 ));
+ vec_xst(fp64vec2, 0, x+(i+100));
+ vec_xst(fp64vec2, 0, x+(i+102));
+ vec_xst(fp64vec2, 0, x+(i+104));
+ vec_xst(fp64vec2, 0, x+(i+106));
+ vec_xst(fp64vec2, 0, x+(i+108));
+ vec_xst(fp64vec2, 0, x+(i+110));
+ vec_xst(fp64vec2, 0, x+(i+112));
+ vec_xst(fp64vec2, 0, x+(i+114));
+ vec_xst(fp64vec2, 0, x+(i+116));
+ vec_xst(fp64vec2, 0, x+(i+118));
+ vec_xst(fp64vec2, 0, x+(i+120));
+ vec_xst(fp64vec2, 0, x+(i+122));
+ vec_xst(fp64vec2, 0, x+(i+124));
+ vec_xst(fp64vec2, 0, x+(i+126));
+ }
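+  // Tail handling: 16-element blocks, then single vectors, then scalar stores.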
+ for (; i <= n-16; i += 16)
+ {
+ vec_xst(fp64vec2, 0, x+(i ));
+ vec_xst(fp64vec2, 0, x+(i+2 ));
+ vec_xst(fp64vec2, 0, x+(i+4 ));
+ vec_xst(fp64vec2, 0, x+(i+6 ));
+ vec_xst(fp64vec2, 0, x+(i+8 ));
+ vec_xst(fp64vec2, 0, x+(i+10 ));
+ vec_xst(fp64vec2, 0, x+(i+12 ));
+ vec_xst(fp64vec2, 0, x+(i+14 ));
+ }
+ for (; i <= n-2; i += 2)
+ vec_xst(fp64vec2, 0, x+(i ));
+ for (; i < n; i++)
+ x[i] = c;
+}
+
+
+//--------------------------------------------------------------------------------------------------
+// THDoubleVector_add_VSX was tested on Power8:
+//
+// Max speedup achieved when unrolling 24 elements.
+// When unrolling 32 elements, the performance was the same as for 24.
+// When unrolling 16 elements, performance was not as good as for 24.
+// Unrolling 24 elements was 43% faster than unrolling 4 elements (2.8 sec vs 4.0 sec).
+// Unrolling 24 elements was about 8% faster than unrolling 16 elements (2.8 sec vs 3.0 sec).
+//--------------------------------------------------------------------------------------------------
+static void THDoubleVector_add_VSX(double *y, const double *x, const double c, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+ vector double c_fp64vec2;
+ vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
+ vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
+ vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2;
+ vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2;
+
+ double val[2] = {c, c};
+ c_fp64vec2 = vec_xl(0, val);
+
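+  // vec_madd(a, b, c) is a fused multiply-add: each lane computes a*b + c.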
+ for (i = 0; i <= n-24; i += 24)
+ {
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ x1_fp64vec2 = vec_xl(0, x+(i+2 ));
+ x2_fp64vec2 = vec_xl(0, x+(i+4 ));
+ x3_fp64vec2 = vec_xl(0, x+(i+6 ));
+ x4_fp64vec2 = vec_xl(0, x+(i+8 ));
+ x5_fp64vec2 = vec_xl(0, x+(i+10));
+ x6_fp64vec2 = vec_xl(0, x+(i+12));
+ x7_fp64vec2 = vec_xl(0, x+(i+14));
+ x8_fp64vec2 = vec_xl(0, x+(i+16));
+ x9_fp64vec2 = vec_xl(0, x+(i+18));
+ x10_fp64vec2 = vec_xl(0, x+(i+20));
+ x11_fp64vec2 = vec_xl(0, x+(i+22));
+
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y1_fp64vec2 = vec_xl(0, y+(i+2 ));
+ y2_fp64vec2 = vec_xl(0, y+(i+4 ));
+ y3_fp64vec2 = vec_xl(0, y+(i+6 ));
+ y4_fp64vec2 = vec_xl(0, y+(i+8 ));
+ y5_fp64vec2 = vec_xl(0, y+(i+10));
+ y6_fp64vec2 = vec_xl(0, y+(i+12));
+ y7_fp64vec2 = vec_xl(0, y+(i+14));
+ y8_fp64vec2 = vec_xl(0, y+(i+16));
+ y9_fp64vec2 = vec_xl(0, y+(i+18));
+ y10_fp64vec2 = vec_xl(0, y+(i+20));
+ y11_fp64vec2 = vec_xl(0, y+(i+22));
+
+ y0_fp64vec2 = vec_madd(c_fp64vec2, x0_fp64vec2, y0_fp64vec2 );
+ y1_fp64vec2 = vec_madd(c_fp64vec2, x1_fp64vec2, y1_fp64vec2 );
+ y2_fp64vec2 = vec_madd(c_fp64vec2, x2_fp64vec2, y2_fp64vec2 );
+ y3_fp64vec2 = vec_madd(c_fp64vec2, x3_fp64vec2, y3_fp64vec2 );
+ y4_fp64vec2 = vec_madd(c_fp64vec2, x4_fp64vec2, y4_fp64vec2 );
+ y5_fp64vec2 = vec_madd(c_fp64vec2, x5_fp64vec2, y5_fp64vec2 );
+ y6_fp64vec2 = vec_madd(c_fp64vec2, x6_fp64vec2, y6_fp64vec2 );
+ y7_fp64vec2 = vec_madd(c_fp64vec2, x7_fp64vec2, y7_fp64vec2 );
+ y8_fp64vec2 = vec_madd(c_fp64vec2, x8_fp64vec2, y8_fp64vec2 );
+ y9_fp64vec2 = vec_madd(c_fp64vec2, x9_fp64vec2, y9_fp64vec2 );
+ y10_fp64vec2 = vec_madd(c_fp64vec2, x10_fp64vec2, y10_fp64vec2);
+ y11_fp64vec2 = vec_madd(c_fp64vec2, x11_fp64vec2, y11_fp64vec2);
+
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ vec_xst(y1_fp64vec2, 0, y+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, y+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, y+(i+6 ));
+ vec_xst(y4_fp64vec2, 0, y+(i+8 ));
+ vec_xst(y5_fp64vec2, 0, y+(i+10));
+ vec_xst(y6_fp64vec2, 0, y+(i+12));
+ vec_xst(y7_fp64vec2, 0, y+(i+14));
+ vec_xst(y8_fp64vec2, 0, y+(i+16));
+ vec_xst(y9_fp64vec2, 0, y+(i+18));
+ vec_xst(y10_fp64vec2, 0, y+(i+20));
+ vec_xst(y11_fp64vec2, 0, y+(i+22));
+ }
+ for (; i <= n-8; i += 8)
+ {
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ x1_fp64vec2 = vec_xl(0, x+(i+2 ));
+ x2_fp64vec2 = vec_xl(0, x+(i+4 ));
+ x3_fp64vec2 = vec_xl(0, x+(i+6 ));
+
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y1_fp64vec2 = vec_xl(0, y+(i+2 ));
+ y2_fp64vec2 = vec_xl(0, y+(i+4 ));
+ y3_fp64vec2 = vec_xl(0, y+(i+6 ));
+
+ y0_fp64vec2 = vec_madd(c_fp64vec2, x0_fp64vec2, y0_fp64vec2 );
+ y1_fp64vec2 = vec_madd(c_fp64vec2, x1_fp64vec2, y1_fp64vec2 );
+ y2_fp64vec2 = vec_madd(c_fp64vec2, x2_fp64vec2, y2_fp64vec2 );
+ y3_fp64vec2 = vec_madd(c_fp64vec2, x3_fp64vec2, y3_fp64vec2 );
+
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ vec_xst(y1_fp64vec2, 0, y+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, y+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, y+(i+6 ));
+ }
+ for (; i <= n-2; i += 2)
+ {
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y0_fp64vec2 = vec_madd(c_fp64vec2, x0_fp64vec2, y0_fp64vec2 );
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ }
+ for (; i < n; i++)
+ y[i] = (c * x[i]) + y[i];
+}
+
+
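+// Computes z[i] = x[i] - y[i]; the xz vectors hold x on load and the
+// difference on store.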
+static void THDoubleVector_diff_VSX(double *z, const double *x, const double *y, const ptrdiff_t n) {
+ ptrdiff_t i;
+
+ vector double xz0_fp64vec2, xz1_fp64vec2, xz2_fp64vec2, xz3_fp64vec2, xz4_fp64vec2, xz5_fp64vec2, xz6_fp64vec2, xz7_fp64vec2;
+ vector double xz8_fp64vec2, xz9_fp64vec2, xz10_fp64vec2, xz11_fp64vec2;
+ vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
+ vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
+
+ for (i = 0; i <= n-24; i += 24)
+ {
+ xz0_fp64vec2 = vec_xl(0, x+(i ));
+ xz1_fp64vec2 = vec_xl(0, x+(i+2 ));
+ xz2_fp64vec2 = vec_xl(0, x+(i+4 ));
+ xz3_fp64vec2 = vec_xl(0, x+(i+6 ));
+ xz4_fp64vec2 = vec_xl(0, x+(i+8 ));
+ xz5_fp64vec2 = vec_xl(0, x+(i+10));
+ xz6_fp64vec2 = vec_xl(0, x+(i+12));
+ xz7_fp64vec2 = vec_xl(0, x+(i+14));
+ xz8_fp64vec2 = vec_xl(0, x+(i+16));
+ xz9_fp64vec2 = vec_xl(0, x+(i+18));
+ xz10_fp64vec2 = vec_xl(0, x+(i+20));
+ xz11_fp64vec2 = vec_xl(0, x+(i+22));
+
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y1_fp64vec2 = vec_xl(0, y+(i+2 ));
+ y2_fp64vec2 = vec_xl(0, y+(i+4 ));
+ y3_fp64vec2 = vec_xl(0, y+(i+6 ));
+ y4_fp64vec2 = vec_xl(0, y+(i+8 ));
+ y5_fp64vec2 = vec_xl(0, y+(i+10));
+ y6_fp64vec2 = vec_xl(0, y+(i+12));
+ y7_fp64vec2 = vec_xl(0, y+(i+14));
+ y8_fp64vec2 = vec_xl(0, y+(i+16));
+ y9_fp64vec2 = vec_xl(0, y+(i+18));
+ y10_fp64vec2 = vec_xl(0, y+(i+20));
+ y11_fp64vec2 = vec_xl(0, y+(i+22));
+
+ xz0_fp64vec2 = vec_sub(xz0_fp64vec2, y0_fp64vec2 );
+ xz1_fp64vec2 = vec_sub(xz1_fp64vec2, y1_fp64vec2 );
+ xz2_fp64vec2 = vec_sub(xz2_fp64vec2, y2_fp64vec2 );
+ xz3_fp64vec2 = vec_sub(xz3_fp64vec2, y3_fp64vec2 );
+ xz4_fp64vec2 = vec_sub(xz4_fp64vec2, y4_fp64vec2 );
+ xz5_fp64vec2 = vec_sub(xz5_fp64vec2, y5_fp64vec2 );
+ xz6_fp64vec2 = vec_sub(xz6_fp64vec2, y6_fp64vec2 );
+ xz7_fp64vec2 = vec_sub(xz7_fp64vec2, y7_fp64vec2 );
+ xz8_fp64vec2 = vec_sub(xz8_fp64vec2, y8_fp64vec2 );
+ xz9_fp64vec2 = vec_sub(xz9_fp64vec2, y9_fp64vec2 );
+ xz10_fp64vec2 = vec_sub(xz10_fp64vec2, y10_fp64vec2);
+ xz11_fp64vec2 = vec_sub(xz11_fp64vec2, y11_fp64vec2);
+
+ vec_xst(xz0_fp64vec2, 0, z+(i ));
+ vec_xst(xz1_fp64vec2, 0, z+(i+2 ));
+ vec_xst(xz2_fp64vec2, 0, z+(i+4 ));
+ vec_xst(xz3_fp64vec2, 0, z+(i+6 ));
+ vec_xst(xz4_fp64vec2, 0, z+(i+8 ));
+ vec_xst(xz5_fp64vec2, 0, z+(i+10));
+ vec_xst(xz6_fp64vec2, 0, z+(i+12));
+ vec_xst(xz7_fp64vec2, 0, z+(i+14));
+ vec_xst(xz8_fp64vec2, 0, z+(i+16));
+ vec_xst(xz9_fp64vec2, 0, z+(i+18));
+ vec_xst(xz10_fp64vec2, 0, z+(i+20));
+ vec_xst(xz11_fp64vec2, 0, z+(i+22));
+ }
+ for (; i <= n-8; i += 8)
+ {
+ xz0_fp64vec2 = vec_xl(0, x+(i ));
+ xz1_fp64vec2 = vec_xl(0, x+(i+2 ));
+ xz2_fp64vec2 = vec_xl(0, x+(i+4 ));
+ xz3_fp64vec2 = vec_xl(0, x+(i+6 ));
+
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y1_fp64vec2 = vec_xl(0, y+(i+2 ));
+ y2_fp64vec2 = vec_xl(0, y+(i+4 ));
+ y3_fp64vec2 = vec_xl(0, y+(i+6 ));
+
+ xz0_fp64vec2 = vec_sub(xz0_fp64vec2, y0_fp64vec2 );
+ xz1_fp64vec2 = vec_sub(xz1_fp64vec2, y1_fp64vec2 );
+ xz2_fp64vec2 = vec_sub(xz2_fp64vec2, y2_fp64vec2 );
+ xz3_fp64vec2 = vec_sub(xz3_fp64vec2, y3_fp64vec2 );
+
+ vec_xst(xz0_fp64vec2, 0, z+(i ));
+ vec_xst(xz1_fp64vec2, 0, z+(i+2 ));
+ vec_xst(xz2_fp64vec2, 0, z+(i+4 ));
+ vec_xst(xz3_fp64vec2, 0, z+(i+6 ));
+ }
+ for (; i <= n-2; i += 2)
+ {
+ xz0_fp64vec2 = vec_xl(0, x+(i ));
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ xz0_fp64vec2 = vec_sub(xz0_fp64vec2, y0_fp64vec2 );
+ vec_xst(xz0_fp64vec2, 0, z+(i ));
+ }
+ for (; i < n; i++)
+ z[i] = x[i] - y[i];
+}
+
+
+static void THDoubleVector_scale_VSX(double *y, const double c, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+
+ vector double c_fp64vec2;
+ double val[2] = {c, c};
+ c_fp64vec2 = vec_xl(0, val);
+
+ vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
+ vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2, y12_fp64vec2, y13_fp64vec2, y14_fp64vec2, y15_fp64vec2;
+
+ for (i = 0; i <= n-32; i += 32)
+ {
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y1_fp64vec2 = vec_xl(0, y+(i+2 ));
+ y2_fp64vec2 = vec_xl(0, y+(i+4 ));
+ y3_fp64vec2 = vec_xl(0, y+(i+6 ));
+ y4_fp64vec2 = vec_xl(0, y+(i+8 ));
+ y5_fp64vec2 = vec_xl(0, y+(i+10));
+ y6_fp64vec2 = vec_xl(0, y+(i+12));
+ y7_fp64vec2 = vec_xl(0, y+(i+14));
+ y8_fp64vec2 = vec_xl(0, y+(i+16));
+ y9_fp64vec2 = vec_xl(0, y+(i+18));
+ y10_fp64vec2 = vec_xl(0, y+(i+20));
+ y11_fp64vec2 = vec_xl(0, y+(i+22));
+ y12_fp64vec2 = vec_xl(0, y+(i+24));
+ y13_fp64vec2 = vec_xl(0, y+(i+26));
+ y14_fp64vec2 = vec_xl(0, y+(i+28));
+ y15_fp64vec2 = vec_xl(0, y+(i+30));
+
+ y0_fp64vec2 = vec_mul(y0_fp64vec2, c_fp64vec2);
+ y1_fp64vec2 = vec_mul(y1_fp64vec2, c_fp64vec2);
+ y2_fp64vec2 = vec_mul(y2_fp64vec2, c_fp64vec2);
+ y3_fp64vec2 = vec_mul(y3_fp64vec2, c_fp64vec2);
+ y4_fp64vec2 = vec_mul(y4_fp64vec2, c_fp64vec2);
+ y5_fp64vec2 = vec_mul(y5_fp64vec2, c_fp64vec2);
+ y6_fp64vec2 = vec_mul(y6_fp64vec2, c_fp64vec2);
+ y7_fp64vec2 = vec_mul(y7_fp64vec2, c_fp64vec2);
+ y8_fp64vec2 = vec_mul(y8_fp64vec2, c_fp64vec2);
+ y9_fp64vec2 = vec_mul(y9_fp64vec2, c_fp64vec2);
+ y10_fp64vec2 = vec_mul(y10_fp64vec2, c_fp64vec2);
+ y11_fp64vec2 = vec_mul(y11_fp64vec2, c_fp64vec2);
+ y12_fp64vec2 = vec_mul(y12_fp64vec2, c_fp64vec2);
+ y13_fp64vec2 = vec_mul(y13_fp64vec2, c_fp64vec2);
+ y14_fp64vec2 = vec_mul(y14_fp64vec2, c_fp64vec2);
+ y15_fp64vec2 = vec_mul(y15_fp64vec2, c_fp64vec2);
+
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ vec_xst(y1_fp64vec2, 0, y+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, y+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, y+(i+6 ));
+ vec_xst(y4_fp64vec2, 0, y+(i+8 ));
+ vec_xst(y5_fp64vec2, 0, y+(i+10));
+ vec_xst(y6_fp64vec2, 0, y+(i+12));
+ vec_xst(y7_fp64vec2, 0, y+(i+14));
+ vec_xst(y8_fp64vec2, 0, y+(i+16));
+ vec_xst(y9_fp64vec2, 0, y+(i+18));
+ vec_xst(y10_fp64vec2, 0, y+(i+20));
+ vec_xst(y11_fp64vec2, 0, y+(i+22));
+ vec_xst(y12_fp64vec2, 0, y+(i+24));
+ vec_xst(y13_fp64vec2, 0, y+(i+26));
+ vec_xst(y14_fp64vec2, 0, y+(i+28));
+ vec_xst(y15_fp64vec2, 0, y+(i+30));
+ }
+ for (; i <= n-8; i += 8)
+ {
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y1_fp64vec2 = vec_xl(0, y+(i+2 ));
+ y2_fp64vec2 = vec_xl(0, y+(i+4 ));
+ y3_fp64vec2 = vec_xl(0, y+(i+6 ));
+
+ y0_fp64vec2 = vec_mul(y0_fp64vec2, c_fp64vec2);
+ y1_fp64vec2 = vec_mul(y1_fp64vec2, c_fp64vec2);
+ y2_fp64vec2 = vec_mul(y2_fp64vec2, c_fp64vec2);
+ y3_fp64vec2 = vec_mul(y3_fp64vec2, c_fp64vec2);
+
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ vec_xst(y1_fp64vec2, 0, y+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, y+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, y+(i+6 ));
+ }
+ for (; i <= n-2; i += 2)
+ {
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y0_fp64vec2 = vec_mul(y0_fp64vec2, c_fp64vec2);
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ }
+ for (; i < n; i++)
+ y[i] = y[i] * c;
+}
+
+
+static void THDoubleVector_mul_VSX(double *y, const double *x, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+
+ vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
+ vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
+ vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2;
+ vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2;
+
+
+ for (i = 0; i <= n-24; i += 24)
+ {
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y1_fp64vec2 = vec_xl(0, y+(i+2 ));
+ y2_fp64vec2 = vec_xl(0, y+(i+4 ));
+ y3_fp64vec2 = vec_xl(0, y+(i+6 ));
+ y4_fp64vec2 = vec_xl(0, y+(i+8 ));
+ y5_fp64vec2 = vec_xl(0, y+(i+10));
+ y6_fp64vec2 = vec_xl(0, y+(i+12));
+ y7_fp64vec2 = vec_xl(0, y+(i+14));
+ y8_fp64vec2 = vec_xl(0, y+(i+16));
+ y9_fp64vec2 = vec_xl(0, y+(i+18));
+ y10_fp64vec2 = vec_xl(0, y+(i+20));
+ y11_fp64vec2 = vec_xl(0, y+(i+22));
+
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ x1_fp64vec2 = vec_xl(0, x+(i+2 ));
+ x2_fp64vec2 = vec_xl(0, x+(i+4 ));
+ x3_fp64vec2 = vec_xl(0, x+(i+6 ));
+ x4_fp64vec2 = vec_xl(0, x+(i+8 ));
+ x5_fp64vec2 = vec_xl(0, x+(i+10));
+ x6_fp64vec2 = vec_xl(0, x+(i+12));
+ x7_fp64vec2 = vec_xl(0, x+(i+14));
+ x8_fp64vec2 = vec_xl(0, x+(i+16));
+ x9_fp64vec2 = vec_xl(0, x+(i+18));
+ x10_fp64vec2 = vec_xl(0, x+(i+20));
+ x11_fp64vec2 = vec_xl(0, x+(i+22));
+
+ y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2);
+ y1_fp64vec2 = vec_mul(y1_fp64vec2, x1_fp64vec2);
+ y2_fp64vec2 = vec_mul(y2_fp64vec2, x2_fp64vec2);
+ y3_fp64vec2 = vec_mul(y3_fp64vec2, x3_fp64vec2);
+ y4_fp64vec2 = vec_mul(y4_fp64vec2, x4_fp64vec2);
+ y5_fp64vec2 = vec_mul(y5_fp64vec2, x5_fp64vec2);
+ y6_fp64vec2 = vec_mul(y6_fp64vec2, x6_fp64vec2);
+ y7_fp64vec2 = vec_mul(y7_fp64vec2, x7_fp64vec2);
+ y8_fp64vec2 = vec_mul(y8_fp64vec2, x8_fp64vec2);
+ y9_fp64vec2 = vec_mul(y9_fp64vec2, x9_fp64vec2);
+ y10_fp64vec2 = vec_mul(y10_fp64vec2, x10_fp64vec2);
+ y11_fp64vec2 = vec_mul(y11_fp64vec2, x11_fp64vec2);
+
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ vec_xst(y1_fp64vec2, 0, y+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, y+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, y+(i+6 ));
+ vec_xst(y4_fp64vec2, 0, y+(i+8 ));
+ vec_xst(y5_fp64vec2, 0, y+(i+10));
+ vec_xst(y6_fp64vec2, 0, y+(i+12));
+ vec_xst(y7_fp64vec2, 0, y+(i+14));
+ vec_xst(y8_fp64vec2, 0, y+(i+16));
+ vec_xst(y9_fp64vec2, 0, y+(i+18));
+ vec_xst(y10_fp64vec2, 0, y+(i+20));
+ vec_xst(y11_fp64vec2, 0, y+(i+22));
+ }
+ for (; i <= n-8; i += 8)
+ {
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ y1_fp64vec2 = vec_xl(0, y+(i+2 ));
+ y2_fp64vec2 = vec_xl(0, y+(i+4 ));
+ y3_fp64vec2 = vec_xl(0, y+(i+6 ));
+
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ x1_fp64vec2 = vec_xl(0, x+(i+2 ));
+ x2_fp64vec2 = vec_xl(0, x+(i+4 ));
+ x3_fp64vec2 = vec_xl(0, x+(i+6 ));
+
+ y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2);
+ y1_fp64vec2 = vec_mul(y1_fp64vec2, x1_fp64vec2);
+ y2_fp64vec2 = vec_mul(y2_fp64vec2, x2_fp64vec2);
+ y3_fp64vec2 = vec_mul(y3_fp64vec2, x3_fp64vec2);
+
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ vec_xst(y1_fp64vec2, 0, y+(i+2 ));
+ vec_xst(y2_fp64vec2, 0, y+(i+4 ));
+ vec_xst(y3_fp64vec2, 0, y+(i+6 ));
+ }
+ for (; i <= n-2; i += 2)
+ {
+ y0_fp64vec2 = vec_xl(0, y+(i ));
+ x0_fp64vec2 = vec_xl(0, x+(i ));
+ y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2);
+ vec_xst(y0_fp64vec2, 0, y+(i ));
+ }
+ for (; i < n; i++)
+ y[i] = y[i] * x[i];
+}
+
+
+static void THFloatVector_fill_VSX(float *x, const float c, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+
+ float val[4] = {c, c, c, c};
+ vector float fp32vec4 = vec_xl(0, val);
+
+ for (i = 0; i <= n-256; i += 256)
+ {
+ vec_xst(fp32vec4, 0, x+(i ));
+ vec_xst(fp32vec4, 0, x+(i+4 ));
+ vec_xst(fp32vec4, 0, x+(i+8 ));
+ vec_xst(fp32vec4, 0, x+(i+12 ));
+ vec_xst(fp32vec4, 0, x+(i+16 ));
+ vec_xst(fp32vec4, 0, x+(i+20 ));
+ vec_xst(fp32vec4, 0, x+(i+24 ));
+ vec_xst(fp32vec4, 0, x+(i+28 ));
+ vec_xst(fp32vec4, 0, x+(i+32 ));
+ vec_xst(fp32vec4, 0, x+(i+36 ));
+ vec_xst(fp32vec4, 0, x+(i+40 ));
+ vec_xst(fp32vec4, 0, x+(i+44 ));
+ vec_xst(fp32vec4, 0, x+(i+48 ));
+ vec_xst(fp32vec4, 0, x+(i+52 ));
+ vec_xst(fp32vec4, 0, x+(i+56 ));
+ vec_xst(fp32vec4, 0, x+(i+60 ));
+ vec_xst(fp32vec4, 0, x+(i+64 ));
+ vec_xst(fp32vec4, 0, x+(i+68 ));
+ vec_xst(fp32vec4, 0, x+(i+72 ));
+ vec_xst(fp32vec4, 0, x+(i+76 ));
+ vec_xst(fp32vec4, 0, x+(i+80 ));
+ vec_xst(fp32vec4, 0, x+(i+84 ));
+ vec_xst(fp32vec4, 0, x+(i+88 ));
+ vec_xst(fp32vec4, 0, x+(i+92 ));
+ vec_xst(fp32vec4, 0, x+(i+96 ));
+ vec_xst(fp32vec4, 0, x+(i+100));
+ vec_xst(fp32vec4, 0, x+(i+104));
+ vec_xst(fp32vec4, 0, x+(i+108));
+ vec_xst(fp32vec4, 0, x+(i+112));
+ vec_xst(fp32vec4, 0, x+(i+116));
+ vec_xst(fp32vec4, 0, x+(i+120));
+ vec_xst(fp32vec4, 0, x+(i+124));
+ vec_xst(fp32vec4, 0, x+(i+128));
+ vec_xst(fp32vec4, 0, x+(i+132));
+ vec_xst(fp32vec4, 0, x+(i+136));
+ vec_xst(fp32vec4, 0, x+(i+140));
+ vec_xst(fp32vec4, 0, x+(i+144));
+ vec_xst(fp32vec4, 0, x+(i+148));
+ vec_xst(fp32vec4, 0, x+(i+152));
+ vec_xst(fp32vec4, 0, x+(i+156));
+ vec_xst(fp32vec4, 0, x+(i+160));
+ vec_xst(fp32vec4, 0, x+(i+164));
+ vec_xst(fp32vec4, 0, x+(i+168));
+ vec_xst(fp32vec4, 0, x+(i+172));
+ vec_xst(fp32vec4, 0, x+(i+176));
+ vec_xst(fp32vec4, 0, x+(i+180));
+ vec_xst(fp32vec4, 0, x+(i+184));
+ vec_xst(fp32vec4, 0, x+(i+188));
+ vec_xst(fp32vec4, 0, x+(i+192));
+ vec_xst(fp32vec4, 0, x+(i+196));
+ vec_xst(fp32vec4, 0, x+(i+200));
+ vec_xst(fp32vec4, 0, x+(i+204));
+ vec_xst(fp32vec4, 0, x+(i+208));
+ vec_xst(fp32vec4, 0, x+(i+212));
+ vec_xst(fp32vec4, 0, x+(i+216));
+ vec_xst(fp32vec4, 0, x+(i+220));
+ vec_xst(fp32vec4, 0, x+(i+224));
+ vec_xst(fp32vec4, 0, x+(i+228));
+ vec_xst(fp32vec4, 0, x+(i+232));
+ vec_xst(fp32vec4, 0, x+(i+236));
+ vec_xst(fp32vec4, 0, x+(i+240));
+ vec_xst(fp32vec4, 0, x+(i+244));
+ vec_xst(fp32vec4, 0, x+(i+248));
+ vec_xst(fp32vec4, 0, x+(i+252));
+ }
+ for (; i <= n-32; i += 32)
+ {
+ vec_xst(fp32vec4, 0, x+(i ));
+ vec_xst(fp32vec4, 0, x+(i+4 ));
+ vec_xst(fp32vec4, 0, x+(i+8 ));
+ vec_xst(fp32vec4, 0, x+(i+12 ));
+ vec_xst(fp32vec4, 0, x+(i+16 ));
+ vec_xst(fp32vec4, 0, x+(i+20 ));
+ vec_xst(fp32vec4, 0, x+(i+24 ));
+ vec_xst(fp32vec4, 0, x+(i+28 ));
+ }
+ for (; i <= n-4; i += 4)
+ vec_xst(fp32vec4, 0, x+(i ));
+ for (; i < n; i++)
+ x[i] = c;
+}
+
+
+static void THFloatVector_add_VSX(float *y, const float *x, const float c, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+ vector float c_fp32vec4;
+ vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4;
+ vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4;
+ vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4;
+ vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4;
+
+ float val[4] = {c, c, c, c};
+ c_fp32vec4 = vec_xl(0, val);
+
+ for (i = 0; i <= n-48; i += 48)
+ {
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ x1_fp32vec4 = vec_xl(0, x+(i+4 ));
+ x2_fp32vec4 = vec_xl(0, x+(i+8 ));
+ x3_fp32vec4 = vec_xl(0, x+(i+12));
+ x4_fp32vec4 = vec_xl(0, x+(i+16));
+ x5_fp32vec4 = vec_xl(0, x+(i+20));
+ x6_fp32vec4 = vec_xl(0, x+(i+24));
+ x7_fp32vec4 = vec_xl(0, x+(i+28));
+ x8_fp32vec4 = vec_xl(0, x+(i+32));
+ x9_fp32vec4 = vec_xl(0, x+(i+36));
+ x10_fp32vec4 = vec_xl(0, x+(i+40));
+ x11_fp32vec4 = vec_xl(0, x+(i+44));
+
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y1_fp32vec4 = vec_xl(0, y+(i+4 ));
+ y2_fp32vec4 = vec_xl(0, y+(i+8 ));
+ y3_fp32vec4 = vec_xl(0, y+(i+12));
+ y4_fp32vec4 = vec_xl(0, y+(i+16));
+ y5_fp32vec4 = vec_xl(0, y+(i+20));
+ y6_fp32vec4 = vec_xl(0, y+(i+24));
+ y7_fp32vec4 = vec_xl(0, y+(i+28));
+ y8_fp32vec4 = vec_xl(0, y+(i+32));
+ y9_fp32vec4 = vec_xl(0, y+(i+36));
+ y10_fp32vec4 = vec_xl(0, y+(i+40));
+ y11_fp32vec4 = vec_xl(0, y+(i+44));
+
+ y0_fp32vec4 = vec_madd(c_fp32vec4, x0_fp32vec4, y0_fp32vec4 );
+ y1_fp32vec4 = vec_madd(c_fp32vec4, x1_fp32vec4, y1_fp32vec4 );
+ y2_fp32vec4 = vec_madd(c_fp32vec4, x2_fp32vec4, y2_fp32vec4 );
+ y3_fp32vec4 = vec_madd(c_fp32vec4, x3_fp32vec4, y3_fp32vec4 );
+ y4_fp32vec4 = vec_madd(c_fp32vec4, x4_fp32vec4, y4_fp32vec4 );
+ y5_fp32vec4 = vec_madd(c_fp32vec4, x5_fp32vec4, y5_fp32vec4 );
+ y6_fp32vec4 = vec_madd(c_fp32vec4, x6_fp32vec4, y6_fp32vec4 );
+ y7_fp32vec4 = vec_madd(c_fp32vec4, x7_fp32vec4, y7_fp32vec4 );
+ y8_fp32vec4 = vec_madd(c_fp32vec4, x8_fp32vec4, y8_fp32vec4 );
+ y9_fp32vec4 = vec_madd(c_fp32vec4, x9_fp32vec4, y9_fp32vec4 );
+ y10_fp32vec4 = vec_madd(c_fp32vec4, x10_fp32vec4, y10_fp32vec4);
+ y11_fp32vec4 = vec_madd(c_fp32vec4, x11_fp32vec4, y11_fp32vec4);
+
+ vec_xst(y0_fp32vec4, 0, y+(i ));
+ vec_xst(y1_fp32vec4, 0, y+(i+4 ));
+ vec_xst(y2_fp32vec4, 0, y+(i+8 ));
+ vec_xst(y3_fp32vec4, 0, y+(i+12));
+ vec_xst(y4_fp32vec4, 0, y+(i+16));
+ vec_xst(y5_fp32vec4, 0, y+(i+20));
+ vec_xst(y6_fp32vec4, 0, y+(i+24));
+ vec_xst(y7_fp32vec4, 0, y+(i+28));
+ vec_xst(y8_fp32vec4, 0, y+(i+32));
+ vec_xst(y9_fp32vec4, 0, y+(i+36));
+ vec_xst(y10_fp32vec4, 0, y+(i+40));
+ vec_xst(y11_fp32vec4, 0, y+(i+44));
+ }
+ for (; i <= n-16; i += 16)
+ {
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ x1_fp32vec4 = vec_xl(0, x+(i+4 ));
+ x2_fp32vec4 = vec_xl(0, x+(i+8 ));
+ x3_fp32vec4 = vec_xl(0, x+(i+12));
+
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y1_fp32vec4 = vec_xl(0, y+(i+4 ));
+ y2_fp32vec4 = vec_xl(0, y+(i+8 ));
+ y3_fp32vec4 = vec_xl(0, y+(i+12));
+
+ y0_fp32vec4 = vec_madd(c_fp32vec4, x0_fp32vec4, y0_fp32vec4 );
+ y1_fp32vec4 = vec_madd(c_fp32vec4, x1_fp32vec4, y1_fp32vec4 );
+ y2_fp32vec4 = vec_madd(c_fp32vec4, x2_fp32vec4, y2_fp32vec4 );
+ y3_fp32vec4 = vec_madd(c_fp32vec4, x3_fp32vec4, y3_fp32vec4 );
+
+ vec_xst(y0_fp32vec4, 0, y+(i ));
+ vec_xst(y1_fp32vec4, 0, y+(i+4 ));
+ vec_xst(y2_fp32vec4, 0, y+(i+8 ));
+ vec_xst(y3_fp32vec4, 0, y+(i+12));
+ }
+ for (; i <= n-4; i += 4)
+ {
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y0_fp32vec4 = vec_madd(c_fp32vec4, x0_fp32vec4, y0_fp32vec4 );
+ vec_xst(y0_fp32vec4, 0, y+(i ));
+ }
+ for (; i < n; i++)
+ y[i] = (c * x[i]) + y[i];
+}
+
+
+static void THFloatVector_diff_VSX(float *z, const float *x, const float *y, const ptrdiff_t n) {
+ ptrdiff_t i;
+
+ vector float xz0_fp32vec4, xz1_fp32vec4, xz2_fp32vec4, xz3_fp32vec4, xz4_fp32vec4, xz5_fp32vec4, xz6_fp32vec4, xz7_fp32vec4;
+ vector float xz8_fp32vec4, xz9_fp32vec4, xz10_fp32vec4, xz11_fp32vec4;
+ vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4;
+ vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4;
+
+ for (i = 0; i <= n-48; i += 48)
+ {
+ xz0_fp32vec4 = vec_xl(0, x+(i ));
+ xz1_fp32vec4 = vec_xl(0, x+(i+4 ));
+ xz2_fp32vec4 = vec_xl(0, x+(i+8 ));
+ xz3_fp32vec4 = vec_xl(0, x+(i+12));
+ xz4_fp32vec4 = vec_xl(0, x+(i+16));
+ xz5_fp32vec4 = vec_xl(0, x+(i+20));
+ xz6_fp32vec4 = vec_xl(0, x+(i+24));
+ xz7_fp32vec4 = vec_xl(0, x+(i+28));
+ xz8_fp32vec4 = vec_xl(0, x+(i+32));
+ xz9_fp32vec4 = vec_xl(0, x+(i+36));
+ xz10_fp32vec4 = vec_xl(0, x+(i+40));
+ xz11_fp32vec4 = vec_xl(0, x+(i+44));
+
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y1_fp32vec4 = vec_xl(0, y+(i+4 ));
+ y2_fp32vec4 = vec_xl(0, y+(i+8 ));
+ y3_fp32vec4 = vec_xl(0, y+(i+12));
+ y4_fp32vec4 = vec_xl(0, y+(i+16));
+ y5_fp32vec4 = vec_xl(0, y+(i+20));
+ y6_fp32vec4 = vec_xl(0, y+(i+24));
+ y7_fp32vec4 = vec_xl(0, y+(i+28));
+ y8_fp32vec4 = vec_xl(0, y+(i+32));
+ y9_fp32vec4 = vec_xl(0, y+(i+36));
+ y10_fp32vec4 = vec_xl(0, y+(i+40));
+ y11_fp32vec4 = vec_xl(0, y+(i+44));
+
+ xz0_fp32vec4 = vec_sub(xz0_fp32vec4, y0_fp32vec4 );
+ xz1_fp32vec4 = vec_sub(xz1_fp32vec4, y1_fp32vec4 );
+ xz2_fp32vec4 = vec_sub(xz2_fp32vec4, y2_fp32vec4 );
+ xz3_fp32vec4 = vec_sub(xz3_fp32vec4, y3_fp32vec4 );
+ xz4_fp32vec4 = vec_sub(xz4_fp32vec4, y4_fp32vec4 );
+ xz5_fp32vec4 = vec_sub(xz5_fp32vec4, y5_fp32vec4 );
+ xz6_fp32vec4 = vec_sub(xz6_fp32vec4, y6_fp32vec4 );
+ xz7_fp32vec4 = vec_sub(xz7_fp32vec4, y7_fp32vec4 );
+ xz8_fp32vec4 = vec_sub(xz8_fp32vec4, y8_fp32vec4 );
+ xz9_fp32vec4 = vec_sub(xz9_fp32vec4, y9_fp32vec4 );
+ xz10_fp32vec4 = vec_sub(xz10_fp32vec4, y10_fp32vec4);
+ xz11_fp32vec4 = vec_sub(xz11_fp32vec4, y11_fp32vec4);
+
+ vec_xst(xz0_fp32vec4, 0, z+(i ));
+ vec_xst(xz1_fp32vec4, 0, z+(i+4 ));
+ vec_xst(xz2_fp32vec4, 0, z+(i+8 ));
+ vec_xst(xz3_fp32vec4, 0, z+(i+12));
+ vec_xst(xz4_fp32vec4, 0, z+(i+16));
+ vec_xst(xz5_fp32vec4, 0, z+(i+20));
+ vec_xst(xz6_fp32vec4, 0, z+(i+24));
+ vec_xst(xz7_fp32vec4, 0, z+(i+28));
+ vec_xst(xz8_fp32vec4, 0, z+(i+32));
+ vec_xst(xz9_fp32vec4, 0, z+(i+36));
+ vec_xst(xz10_fp32vec4, 0, z+(i+40));
+ vec_xst(xz11_fp32vec4, 0, z+(i+44));
+ }
+ for (; i <= n-16; i += 16)
+ {
+ xz0_fp32vec4 = vec_xl(0, x+(i ));
+ xz1_fp32vec4 = vec_xl(0, x+(i+4 ));
+ xz2_fp32vec4 = vec_xl(0, x+(i+8 ));
+ xz3_fp32vec4 = vec_xl(0, x+(i+12));
+
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y1_fp32vec4 = vec_xl(0, y+(i+4 ));
+ y2_fp32vec4 = vec_xl(0, y+(i+8 ));
+ y3_fp32vec4 = vec_xl(0, y+(i+12));
+
+ xz0_fp32vec4 = vec_sub(xz0_fp32vec4, y0_fp32vec4 );
+ xz1_fp32vec4 = vec_sub(xz1_fp32vec4, y1_fp32vec4 );
+ xz2_fp32vec4 = vec_sub(xz2_fp32vec4, y2_fp32vec4 );
+ xz3_fp32vec4 = vec_sub(xz3_fp32vec4, y3_fp32vec4 );
+
+ vec_xst(xz0_fp32vec4, 0, z+(i ));
+ vec_xst(xz1_fp32vec4, 0, z+(i+4 ));
+ vec_xst(xz2_fp32vec4, 0, z+(i+8 ));
+ vec_xst(xz3_fp32vec4, 0, z+(i+12));
+ }
+ for (; i <= n-4; i += 4)
+ {
+ xz0_fp32vec4 = vec_xl(0, x+(i ));
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ xz0_fp32vec4 = vec_sub(xz0_fp32vec4, y0_fp32vec4 );
+ vec_xst(xz0_fp32vec4, 0, z+(i ));
+ }
+ for (; i < n; i++)
+ z[i] = x[i] - y[i];
+}
+
+
+static void THFloatVector_scale_VSX(float *y, const float c, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+
+ vector float c_fp32vec4;
+ float val[4] = {c, c, c, c};
+ c_fp32vec4 = vec_xl(0, val);
+
+ vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4;
+ vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4, y12_fp32vec4, y13_fp32vec4, y14_fp32vec4, y15_fp32vec4;
+
+ for (i = 0; i <= n-64; i += 64)
+ {
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y1_fp32vec4 = vec_xl(0, y+(i+4 ));
+ y2_fp32vec4 = vec_xl(0, y+(i+8 ));
+ y3_fp32vec4 = vec_xl(0, y+(i+12));
+ y4_fp32vec4 = vec_xl(0, y+(i+16));
+ y5_fp32vec4 = vec_xl(0, y+(i+20));
+ y6_fp32vec4 = vec_xl(0, y+(i+24));
+ y7_fp32vec4 = vec_xl(0, y+(i+28));
+ y8_fp32vec4 = vec_xl(0, y+(i+32));
+ y9_fp32vec4 = vec_xl(0, y+(i+36));
+ y10_fp32vec4 = vec_xl(0, y+(i+40));
+ y11_fp32vec4 = vec_xl(0, y+(i+44));
+ y12_fp32vec4 = vec_xl(0, y+(i+48));
+ y13_fp32vec4 = vec_xl(0, y+(i+52));
+ y14_fp32vec4 = vec_xl(0, y+(i+56));
+ y15_fp32vec4 = vec_xl(0, y+(i+60));
+
+ y0_fp32vec4 = vec_mul(y0_fp32vec4, c_fp32vec4);
+ y1_fp32vec4 = vec_mul(y1_fp32vec4, c_fp32vec4);
+ y2_fp32vec4 = vec_mul(y2_fp32vec4, c_fp32vec4);
+ y3_fp32vec4 = vec_mul(y3_fp32vec4, c_fp32vec4);
+ y4_fp32vec4 = vec_mul(y4_fp32vec4, c_fp32vec4);
+ y5_fp32vec4 = vec_mul(y5_fp32vec4, c_fp32vec4);
+ y6_fp32vec4 = vec_mul(y6_fp32vec4, c_fp32vec4);
+ y7_fp32vec4 = vec_mul(y7_fp32vec4, c_fp32vec4);
+ y8_fp32vec4 = vec_mul(y8_fp32vec4, c_fp32vec4);
+ y9_fp32vec4 = vec_mul(y9_fp32vec4, c_fp32vec4);
+ y10_fp32vec4 = vec_mul(y10_fp32vec4, c_fp32vec4);
+ y11_fp32vec4 = vec_mul(y11_fp32vec4, c_fp32vec4);
+ y12_fp32vec4 = vec_mul(y12_fp32vec4, c_fp32vec4);
+ y13_fp32vec4 = vec_mul(y13_fp32vec4, c_fp32vec4);
+ y14_fp32vec4 = vec_mul(y14_fp32vec4, c_fp32vec4);
+ y15_fp32vec4 = vec_mul(y15_fp32vec4, c_fp32vec4);
+
+ vec_xst(y0_fp32vec4, 0, y+(i ));
+ vec_xst(y1_fp32vec4, 0, y+(i+4 ));
+ vec_xst(y2_fp32vec4, 0, y+(i+8 ));
+ vec_xst(y3_fp32vec4, 0, y+(i+12));
+ vec_xst(y4_fp32vec4, 0, y+(i+16));
+ vec_xst(y5_fp32vec4, 0, y+(i+20));
+ vec_xst(y6_fp32vec4, 0, y+(i+24));
+ vec_xst(y7_fp32vec4, 0, y+(i+28));
+ vec_xst(y8_fp32vec4, 0, y+(i+32));
+ vec_xst(y9_fp32vec4, 0, y+(i+36));
+ vec_xst(y10_fp32vec4, 0, y+(i+40));
+ vec_xst(y11_fp32vec4, 0, y+(i+44));
+ vec_xst(y12_fp32vec4, 0, y+(i+48));
+ vec_xst(y13_fp32vec4, 0, y+(i+52));
+ vec_xst(y14_fp32vec4, 0, y+(i+56));
+ vec_xst(y15_fp32vec4, 0, y+(i+60));
+ }
+ for (; i <= n-16; i += 16)
+ {
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y1_fp32vec4 = vec_xl(0, y+(i+4 ));
+ y2_fp32vec4 = vec_xl(0, y+(i+8 ));
+ y3_fp32vec4 = vec_xl(0, y+(i+12));
+
+ y0_fp32vec4 = vec_mul(y0_fp32vec4, c_fp32vec4);
+ y1_fp32vec4 = vec_mul(y1_fp32vec4, c_fp32vec4);
+ y2_fp32vec4 = vec_mul(y2_fp32vec4, c_fp32vec4);
+ y3_fp32vec4 = vec_mul(y3_fp32vec4, c_fp32vec4);
+
+ vec_xst(y0_fp32vec4, 0, y+(i ));
+ vec_xst(y1_fp32vec4, 0, y+(i+4 ));
+ vec_xst(y2_fp32vec4, 0, y+(i+8 ));
+ vec_xst(y3_fp32vec4, 0, y+(i+12));
+ }
+ for (; i <= n-4; i += 4)
+ {
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y0_fp32vec4 = vec_mul(y0_fp32vec4, c_fp32vec4);
+ vec_xst(y0_fp32vec4, 0, y+(i ));
+ }
+ for (; i < n; i++)
+ y[i] = y[i] * c;
+}
+
+
+
+static void THFloatVector_mul_VSX(float *y, const float *x, const ptrdiff_t n)
+{
+ ptrdiff_t i;
+
+ vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4;
+ vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4;
+ vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4;
+ vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4;
+
+
+ for (i = 0; i <= n-48; i += 48)
+ {
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y1_fp32vec4 = vec_xl(0, y+(i+4 ));
+ y2_fp32vec4 = vec_xl(0, y+(i+8 ));
+ y3_fp32vec4 = vec_xl(0, y+(i+12));
+ y4_fp32vec4 = vec_xl(0, y+(i+16));
+ y5_fp32vec4 = vec_xl(0, y+(i+20));
+ y6_fp32vec4 = vec_xl(0, y+(i+24));
+ y7_fp32vec4 = vec_xl(0, y+(i+28));
+ y8_fp32vec4 = vec_xl(0, y+(i+32));
+ y9_fp32vec4 = vec_xl(0, y+(i+36));
+ y10_fp32vec4 = vec_xl(0, y+(i+40));
+ y11_fp32vec4 = vec_xl(0, y+(i+44));
+
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ x1_fp32vec4 = vec_xl(0, x+(i+4 ));
+ x2_fp32vec4 = vec_xl(0, x+(i+8 ));
+ x3_fp32vec4 = vec_xl(0, x+(i+12));
+ x4_fp32vec4 = vec_xl(0, x+(i+16));
+ x5_fp32vec4 = vec_xl(0, x+(i+20));
+ x6_fp32vec4 = vec_xl(0, x+(i+24));
+ x7_fp32vec4 = vec_xl(0, x+(i+28));
+ x8_fp32vec4 = vec_xl(0, x+(i+32));
+ x9_fp32vec4 = vec_xl(0, x+(i+36));
+ x10_fp32vec4 = vec_xl(0, x+(i+40));
+ x11_fp32vec4 = vec_xl(0, x+(i+44));
+
+ y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4);
+ y1_fp32vec4 = vec_mul(y1_fp32vec4, x1_fp32vec4);
+ y2_fp32vec4 = vec_mul(y2_fp32vec4, x2_fp32vec4);
+ y3_fp32vec4 = vec_mul(y3_fp32vec4, x3_fp32vec4);
+ y4_fp32vec4 = vec_mul(y4_fp32vec4, x4_fp32vec4);
+ y5_fp32vec4 = vec_mul(y5_fp32vec4, x5_fp32vec4);
+ y6_fp32vec4 = vec_mul(y6_fp32vec4, x6_fp32vec4);
+ y7_fp32vec4 = vec_mul(y7_fp32vec4, x7_fp32vec4);
+ y8_fp32vec4 = vec_mul(y8_fp32vec4, x8_fp32vec4);
+ y9_fp32vec4 = vec_mul(y9_fp32vec4, x9_fp32vec4);
+ y10_fp32vec4 = vec_mul(y10_fp32vec4, x10_fp32vec4);
+ y11_fp32vec4 = vec_mul(y11_fp32vec4, x11_fp32vec4);
+
+ vec_xst(y0_fp32vec4, 0, y+(i ));
+ vec_xst(y1_fp32vec4, 0, y+(i+4 ));
+ vec_xst(y2_fp32vec4, 0, y+(i+8 ));
+ vec_xst(y3_fp32vec4, 0, y+(i+12));
+ vec_xst(y4_fp32vec4, 0, y+(i+16));
+ vec_xst(y5_fp32vec4, 0, y+(i+20));
+ vec_xst(y6_fp32vec4, 0, y+(i+24));
+ vec_xst(y7_fp32vec4, 0, y+(i+28));
+ vec_xst(y8_fp32vec4, 0, y+(i+32));
+ vec_xst(y9_fp32vec4, 0, y+(i+36));
+ vec_xst(y10_fp32vec4, 0, y+(i+40));
+ vec_xst(y11_fp32vec4, 0, y+(i+44));
+ }
+ for (; i <= n-16; i += 16)
+ {
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ y1_fp32vec4 = vec_xl(0, y+(i+4 ));
+ y2_fp32vec4 = vec_xl(0, y+(i+8 ));
+ y3_fp32vec4 = vec_xl(0, y+(i+12));
+
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ x1_fp32vec4 = vec_xl(0, x+(i+4 ));
+ x2_fp32vec4 = vec_xl(0, x+(i+8 ));
+ x3_fp32vec4 = vec_xl(0, x+(i+12));
+
+ y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4);
+ y1_fp32vec4 = vec_mul(y1_fp32vec4, x1_fp32vec4);
+ y2_fp32vec4 = vec_mul(y2_fp32vec4, x2_fp32vec4);
+ y3_fp32vec4 = vec_mul(y3_fp32vec4, x3_fp32vec4);
+
+ vec_xst(y0_fp32vec4, 0, y+(i ));
+ vec_xst(y1_fp32vec4, 0, y+(i+4 ));
+ vec_xst(y2_fp32vec4, 0, y+(i+8 ));
+ vec_xst(y3_fp32vec4, 0, y+(i+12));
+ }
+ for (; i <= n-4; i += 4)
+ {
+ y0_fp32vec4 = vec_xl(0, y+(i ));
+ x0_fp32vec4 = vec_xl(0, x+(i ));
+ y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4);
+ vec_xst(y0_fp32vec4, 0, y+(i ));
+ }
+ for (; i < n; i++)
+ y[i] = y[i] * x[i];
+}
+
+
+//------------------------------------------------
+//
+// Testing for correctness and performance
+//
+// If you want to run these tests, compile this
+// file with -DRUN_VSX_TESTS on a Power machine,
+// and then run the executable that is generated.
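+// Each test times a plain scalar reference against its VSX kernel over a
+// large buffer, then checks correctness at several shifted offsets so the
+// unaligned and tail-loop code paths are exercised.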
+//
+//------------------------------------------------
+//
+// Example passing run (from a Power8 machine):
+//
+// $ gcc VSX.c -O2 -D RUN_VSX_TESTS -o vsxtest
+// $ ./vsxtest
+//
+// standardDouble_fill() test took 0.34604 seconds
+// THDoubleVector_fill_VSX() test took 0.15663 seconds
+// All assertions PASSED for THDoubleVector_fill_VSX() test.
+//
+// standardFloat_fill() test took 0.32901 seconds
+// THFloatVector_fill_VSX() test took 0.07830 seconds
+// All assertions PASSED for THFloatVector_fill_VSX() test.
+//
+// standardDouble_add() test took 0.51602 seconds
+// THDoubleVector_add_VSX() test took 0.31384 seconds
+// All assertions PASSED for THDoubleVector_add_VSX() test.
+//
+// standardFloat_add() test took 0.39845 seconds
+// THFloatVector_add_VSX() test took 0.14544 seconds
+// All assertions PASSED for THFloatVector_add_VSX() test.
+//
+// standardDouble_diff() test took 0.48219 seconds
+// THDoubleVector_diff_VSX() test took 0.31708 seconds
+// All assertions PASSED for THDoubleVector_diff_VSX() test.
+//
+// standardFloat_diff() test took 0.60340 seconds
+// THFloatVector_diff_VSX() test took 0.17083 seconds
+// All assertions PASSED for THFloatVector_diff_VSX() test.
+//
+// standardDouble_scale() test took 0.33157 seconds
+// THDoubleVector_scale_VSX() test took 0.19075 seconds
+// All assertions PASSED for THDoubleVector_scale_VSX() test.
+//
+// standardFloat_scale() test took 0.33008 seconds
+// THFloatVector_scale_VSX() test took 0.09741 seconds
+// All assertions PASSED for THFloatVector_scale_VSX() test.
+//
+// standardDouble_mul() test took 0.50986 seconds
+// THDoubleVector_mul_VSX() test took 0.30939 seconds
+// All assertions PASSED for THDoubleVector_mul_VSX() test.
+//
+// standardFloat_mul() test took 0.40241 seconds
+// THFloatVector_mul_VSX() test took 0.14346 seconds
+// All assertions PASSED for THFloatVector_mul_VSX() test.
+//
+// Finished running all tests. All tests PASSED.
+//
+//------------------------------------------------
+#ifdef RUN_VSX_TESTS
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+#include <math.h>
+
+#define VSX_PERF_NUM_TEST_ELEMENTS 100000000
+#define VSX_FUNC_NUM_TEST_ELEMENTS 2507
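+// 2507 is odd and not a multiple of the vector width, so the correctness
+// runs exercise every tail loop, including the scalar remainder.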
+
+void test_THDoubleVector_fill_VSX();
+
+static void standardDouble_fill(double *x, const double c, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ x[i] = c;
+}
+
+static void standardFloat_fill(float *x, const float c, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ x[i] = c;
+}
+
+static void standardDouble_add(double *y, const double *x, const double c, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ y[i] += c * x[i];
+}
+
+static void standardFloat_add(float *y, const float *x, const float c, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ y[i] += c * x[i];
+}
+
+static void standardDouble_diff(double *z, const double *x, const double *y, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ z[i] = x[i] - y[i];
+}
+
+static void standardFloat_diff(float *z, const float *x, const float *y, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ z[i] = x[i] - y[i];
+}
+
+static void standardDouble_scale(double *y, const double c, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ y[i] *= c;
+}
+
+static void standardFloat_scale(float *y, const float c, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ y[i] *= c;
+}
+
+static void standardDouble_mul(double *y, const double *x, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ y[i] *= x[i];
+}
+
+static void standardFloat_mul(float *y, const float *x, const ptrdiff_t n)
+{
+ for (ptrdiff_t i = 0; i < n; i++)
+ y[i] *= x[i];
+}
+
+double randDouble()
+{
+    // The +1 keeps the divisor nonzero; rand()%100 alone could divide by zero.
+    return (double)(rand()%100)/(double)(rand()%100 + 1) * (rand()%2 ? -1.0 : 1.0);
+}
+
+int near(double a, double b)
+{
+ int aClass = fpclassify(a);
+ int bClass = fpclassify(b);
+
+ if(aClass != bClass) // i.e. is it NAN, infinite, or finite...?
+ return 0;
+
+ if(aClass == FP_INFINITE) // if it is infinite, the sign must be the same, i.e. positive infinity is not near negative infinity
+ return (signbit(a) == signbit(b));
+ else if(aClass == FP_NORMAL) // if it is a normal number then check the magnitude of the difference between the numbers
+ return fabs(a - b) < 0.001;
+    else // if both numbers are of the same class and that class is any other (e.g. NAN), they are considered near each other.
+ return 1;
+}
+
+void test_THDoubleVector_fill_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ double *x_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *x_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+
+ double yVal0 = 17.2;
+ double yVal1 = 8.2;
+ double yVal2 = 5.1;
+ double yVal3 = -0.9;
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardDouble_fill(x_standard, yVal0, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardDouble_fill(x_standard, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardDouble_fill(x_standard, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardDouble_fill(x_standard, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardDouble_fill() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THDoubleVector_fill_VSX(x_optimized, yVal0, VSX_PERF_NUM_TEST_ELEMENTS );
+ THDoubleVector_fill_VSX(x_optimized, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THDoubleVector_fill_VSX(x_optimized, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_fill_VSX(x_optimized, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THDoubleVector_fill_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ yVal0 += 1.0;
+ yVal1 += 1.0;
+ yVal2 += 1.0;
+ yVal3 -= 1.0;
+
+ standardDouble_fill( x_standard, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS);
+ THDoubleVector_fill_VSX(x_optimized, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS);
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ assert(x_optimized[i] == yVal0);
+
+ standardDouble_fill( x_standard+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_fill_VSX(x_optimized+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardDouble_fill( x_standard+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THDoubleVector_fill_VSX(x_optimized+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardDouble_fill( x_standard+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THDoubleVector_fill_VSX(x_optimized+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardDouble_fill( x_standard+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THDoubleVector_fill_VSX(x_optimized+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardDouble_fill( x_standard+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THDoubleVector_fill_VSX(x_optimized+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ assert(x_optimized[i] == x_standard[i]);
+ printf("All assertions PASSED for THDoubleVector_fill_VSX() test.\n\n");
+
+
+ free(x_standard);
+ free(x_optimized);
+}
+
+
+void test_THFloatVector_fill_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ float *x_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *x_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+
+ float yVal0 = 17.2;
+ float yVal1 = 8.2;
+ float yVal2 = 5.1;
+ float yVal3 = -0.9;
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardFloat_fill(x_standard, yVal0, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardFloat_fill(x_standard, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardFloat_fill(x_standard, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardFloat_fill(x_standard, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardFloat_fill() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THFloatVector_fill_VSX(x_optimized, yVal0, VSX_PERF_NUM_TEST_ELEMENTS );
+ THFloatVector_fill_VSX(x_optimized, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THFloatVector_fill_VSX(x_optimized, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THFloatVector_fill_VSX(x_optimized, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THFloatVector_fill_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ yVal0 += 1.0;
+ yVal1 += 1.0;
+ yVal2 += 1.0;
+ yVal3 -= 1.0;
+
+ standardFloat_fill( x_standard, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS);
+ THFloatVector_fill_VSX(x_optimized, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS);
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ assert(x_optimized[i] == yVal0);
+
+ standardFloat_fill( x_standard+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THFloatVector_fill_VSX(x_optimized+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardFloat_fill( x_standard+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THFloatVector_fill_VSX(x_optimized+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardFloat_fill( x_standard+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THFloatVector_fill_VSX(x_optimized+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardFloat_fill( x_standard+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THFloatVector_fill_VSX(x_optimized+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardFloat_fill( x_standard+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THFloatVector_fill_VSX(x_optimized+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ assert(x_optimized[i] == x_standard[i]);
+ printf("All assertions PASSED for THFloatVector_fill_VSX() test.\n\n");
+
+
+ free(x_standard);
+ free(x_optimized);
+}
+
+void test_THDoubleVector_add_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double c = (double)randDouble();
+
+ // Initialize randomly
+ for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
+ {
+ x[i] = randDouble();
+ double yVal = randDouble();
+ y_standard[i] = yVal;
+ y_optimized[i] = yVal;
+ }
+
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardDouble_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardDouble_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardDouble_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardDouble_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardDouble_add() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THDoubleVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ THDoubleVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THDoubleVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THDoubleVector_add_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ standardDouble_add( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_add_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardDouble_add( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THDoubleVector_add_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardDouble_add( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THDoubleVector_add_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardDouble_add( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THDoubleVector_add_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardDouble_add( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THDoubleVector_add_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ {
+ if(!near(y_optimized[i], y_standard[i]))
+ printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
+ assert(near(y_optimized[i], y_standard[i]));
+ }
+ printf("All assertions PASSED for THDoubleVector_add_VSX() test.\n\n");
+
+
+ free(y_standard);
+ free(y_optimized);
+ free(x);
+}
+
+
+void test_THFloatVector_add_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float c = (float)randDouble();
+
+ // Initialize randomly
+ for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
+ {
+ x[i] = (float)randDouble();
+ float yVal = (float)randDouble();
+ y_standard[i] = yVal;
+ y_optimized[i] = yVal;
+ }
+
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardFloat_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardFloat_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardFloat_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardFloat_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardFloat_add() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THFloatVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ THFloatVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THFloatVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THFloatVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THFloatVector_add_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ standardFloat_add( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THFloatVector_add_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardFloat_add( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THFloatVector_add_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardFloat_add( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THFloatVector_add_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardFloat_add( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THFloatVector_add_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardFloat_add( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THFloatVector_add_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ {
+ if(!near(y_optimized[i], y_standard[i]))
+ printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
+ assert(near(y_optimized[i], y_standard[i]));
+ }
+ printf("All assertions PASSED for THFloatVector_add_VSX() test.\n\n");
+
+
+ free(y_standard);
+ free(y_optimized);
+ free(x);
+}
+
+void test_THDoubleVector_diff_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ double *z_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *z_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *y = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+
+ // Initialize randomly
+ for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
+ {
+ x[i] = randDouble();
+ y[i] = randDouble();
+ double zVal = randDouble();
+ z_standard[i] = zVal;
+ z_optimized[i] = zVal;
+ }
+
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardDouble_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardDouble_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardDouble_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardDouble_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardDouble_diff() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THDoubleVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS );
+ THDoubleVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THDoubleVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THDoubleVector_diff_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ standardDouble_diff( z_standard+1, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_diff_VSX(z_optimized+1, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardDouble_diff( z_standard+2, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THDoubleVector_diff_VSX(z_optimized+2, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardDouble_diff( z_standard+3, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THDoubleVector_diff_VSX(z_optimized+3, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardDouble_diff( z_standard+517, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THDoubleVector_diff_VSX(z_optimized+517, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardDouble_diff( z_standard+517+r, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THDoubleVector_diff_VSX(z_optimized+517+r, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ {
+ if(!near(z_optimized[i], z_standard[i]))
+ printf("%d %f %f\n", i, z_optimized[i], z_standard[i]);
+ assert(near(z_optimized[i], z_standard[i]));
+ }
+ printf("All assertions PASSED for THDoubleVector_diff_VSX() test.\n\n");
+
+
+ free(z_standard);
+ free(z_optimized);
+ free(y);
+ free(x);
+}
+
+
+void test_THFloatVector_diff_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ float *z_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *z_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *y = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+
+ // Initialize randomly
+ for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
+ {
+ x[i] = (float)randDouble();
+ y[i] = (float)randDouble();
+ float zVal = (float)randDouble();
+ z_standard[i] = zVal;
+ z_optimized[i] = zVal;
+ }
+
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardFloat_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardFloat_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardFloat_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardFloat_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardFloat_diff() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THFloatVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS );
+ THFloatVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THFloatVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THFloatVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THFloatVector_diff_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ standardFloat_diff( z_standard+1, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THFloatVector_diff_VSX(z_optimized+1, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardFloat_diff( z_standard+2, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THFloatVector_diff_VSX(z_optimized+2, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardFloat_diff( z_standard+3, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THFloatVector_diff_VSX(z_optimized+3, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardFloat_diff( z_standard+517, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THFloatVector_diff_VSX(z_optimized+517, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardFloat_diff( z_standard+517+r, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THFloatVector_diff_VSX(z_optimized+517+r, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ {
+ if(!near(z_optimized[i], z_standard[i]))
+ printf("%d %f %f\n", i, z_optimized[i], z_standard[i]);
+ assert(near(z_optimized[i], z_standard[i]));
+ }
+ printf("All assertions PASSED for THFloatVector_diff_VSX() test.\n\n");
+
+
+ free(z_standard);
+ free(z_optimized);
+ free(y);
+ free(x);
+}
+
+
+void test_THDoubleVector_scale_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double c = randDouble();
+
+ // Initialize randomly
+ for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
+ {
+ double yVal = randDouble();
+ y_standard[i] = yVal;
+ y_optimized[i] = yVal;
+ }
+
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardDouble_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardDouble_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardDouble_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardDouble_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardDouble_scale() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THDoubleVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ THDoubleVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THDoubleVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THDoubleVector_scale_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ standardDouble_scale( y_standard+1, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_scale_VSX(y_optimized+1, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardDouble_scale( y_standard+2, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THDoubleVector_scale_VSX(y_optimized+2, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardDouble_scale( y_standard+3, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THDoubleVector_scale_VSX(y_optimized+3, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardDouble_scale( y_standard+517, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THDoubleVector_scale_VSX(y_optimized+517, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardDouble_scale( y_standard+517+r, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THDoubleVector_scale_VSX(y_optimized+517+r, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ {
+ if(!near(y_optimized[i], y_standard[i]))
+ printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
+ assert(near(y_optimized[i], y_standard[i]));
+ }
+ printf("All assertions PASSED for THDoubleVector_scale_VSX() test.\n\n");
+
+
+ free(y_standard);
+ free(y_optimized);
+}
+
+
+void test_THFloatVector_scale_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float c = (float)randDouble();
+
+ // Initialize randomly
+ for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
+ {
+ float yVal = (float)randDouble();
+ y_standard[i] = yVal;
+ y_optimized[i] = yVal;
+ }
+
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardFloat_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardFloat_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardFloat_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardFloat_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardFloat_scale() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THFloatVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS );
+ THFloatVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THFloatVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THFloatVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THFloatVector_scale_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ standardFloat_scale( y_standard+1, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THFloatVector_scale_VSX(y_optimized+1, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardFloat_scale( y_standard+2, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THFloatVector_scale_VSX(y_optimized+2, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardFloat_scale( y_standard+3, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THFloatVector_scale_VSX(y_optimized+3, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardFloat_scale( y_standard+517, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THFloatVector_scale_VSX(y_optimized+517, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardFloat_scale( y_standard+517+r, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THFloatVector_scale_VSX(y_optimized+517+r, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ {
+ if(!near(y_optimized[i], y_standard[i]))
+ printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
+ assert(near(y_optimized[i], y_standard[i]));
+ }
+ printf("All assertions PASSED for THFloatVector_scale_VSX() test.\n\n");
+
+
+ free(y_standard);
+ free(y_optimized);
+}
+
+void test_THDoubleVector_mul_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+ double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
+
+ // Initialize randomly
+ for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
+ {
+ x[i] = randDouble();
+ double yVal = randDouble();
+ y_standard[i] = yVal;
+ y_optimized[i] = yVal;
+ }
+
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardDouble_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardDouble_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardDouble_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardDouble_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardDouble_mul() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THDoubleVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS );
+ THDoubleVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THDoubleVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THDoubleVector_mul_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ standardDouble_mul( y_standard+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THDoubleVector_mul_VSX(y_optimized+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardDouble_mul( y_standard+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THDoubleVector_mul_VSX(y_optimized+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardDouble_mul( y_standard+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THDoubleVector_mul_VSX(y_optimized+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardDouble_mul( y_standard+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THDoubleVector_mul_VSX(y_optimized+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardDouble_mul( y_standard+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THDoubleVector_mul_VSX(y_optimized+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ {
+ if(!near(y_optimized[i], y_standard[i]))
+ printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
+ assert(near(y_optimized[i], y_standard[i]));
+ }
+ printf("All assertions PASSED for THDoubleVector_mul_VSX() test.\n\n");
+
+
+ free(y_standard);
+ free(y_optimized);
+ free(x);
+}
+
+
+void test_THFloatVector_mul_VSX()
+{
+ clock_t start, end;
+ double elapsedSeconds_optimized, elapsedSeconds_standard;
+
+ float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+ float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
+
+ // Initialize randomly
+ for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
+ {
+ x[i] = (float)randDouble();
+ float yVal = (float)randDouble();
+ y_standard[i] = yVal;
+ y_optimized[i] = yVal;
+ }
+
+
+ //-------------------------------------------------
+ // Performance Test
+ //-------------------------------------------------
+ start = clock();
+ standardFloat_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS );
+ standardFloat_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ standardFloat_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ standardFloat_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("standardFloat_mul() test took %.5lf seconds\n", elapsedSeconds_standard);
+
+ start = clock();
+ THFloatVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS );
+ THFloatVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-1);
+ THFloatVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-2);
+ THFloatVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-3);
+ end = clock();
+
+ elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
+ printf("THFloatVector_mul_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
+
+
+ //-------------------------------------------------
+ // Correctness Test
+ //-------------------------------------------------
+ standardFloat_mul( y_standard+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ THFloatVector_mul_VSX(y_optimized+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2);
+ standardFloat_mul( y_standard+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ THFloatVector_mul_VSX(y_optimized+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4);
+ standardFloat_mul( y_standard+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ THFloatVector_mul_VSX(y_optimized+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6);
+ standardFloat_mul( y_standard+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ THFloatVector_mul_VSX(y_optimized+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
+ int r = rand() % 258;
+ standardFloat_mul( y_standard+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ THFloatVector_mul_VSX(y_optimized+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
+ for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
+ {
+ if(!near(y_optimized[i], y_standard[i]))
+ printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
+ assert(near(y_optimized[i], y_standard[i]));
+ }
+ printf("All assertions PASSED for THFloatVector_mul_VSX() test.\n\n");
+
+
+ free(y_standard);
+ free(y_optimized);
+ free(x);
+}
+
+
+
+int main()
+{
+ printf("\n");
+
+
+ // First test utility functions
+
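+  // near() is expected to treat NaN==NaN and same-signed infinities as
+  // equal, and to reject any mixed NaN/Inf/finite comparison.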
+ assert(!near(0.1, -0.1));
+ assert(!near(0.1f, -0.1f));
+ assert(!near(9, 10));
+ assert(near(0.1, 0.1000001));
+ assert(near(0.1f, 0.1000001f));
+ assert(near(100.764, 100.764));
+ assert(!near(NAN, 0.0));
+ assert(!near(-9.5, NAN));
+ assert(!near(NAN, 100));
+ assert(!near(-0.0, NAN));
+ assert(near(NAN, NAN));
+ assert(near(INFINITY, INFINITY));
+ assert(near(-INFINITY, -INFINITY));
+ assert(!near(INFINITY, NAN));
+ assert(!near(0, INFINITY));
+ assert(!near(-999.4324, INFINITY));
+ assert(!near(INFINITY, 982374.1));
+ assert(!near(-INFINITY, INFINITY));
+
+
+
+ // Then test each vectorized function
+
+ test_THDoubleVector_fill_VSX();
+ test_THFloatVector_fill_VSX();
+
+ test_THDoubleVector_add_VSX();
+ test_THFloatVector_add_VSX();
+
+ test_THDoubleVector_diff_VSX();
+ test_THFloatVector_diff_VSX();
+
+ test_THDoubleVector_scale_VSX();
+ test_THFloatVector_scale_VSX();
+
+ test_THDoubleVector_mul_VSX();
+ test_THFloatVector_mul_VSX();
+
+
+ printf("Finished runnning all tests. All tests PASSED.\n");
+ return 0;
+}
+
+
+#endif // defined RUN_VSX_TESTS
+
+#endif // defined __PPC64__
+
diff --git a/test/test.lua b/test/test.lua
index eb7cf0a..e7e26e4 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -19,6 +19,35 @@ local function maxdiff(x,y)
end
end
+-- workarounds for functions not yet implemented for HalfTensor:
+-- route through a full-precision copy via real(), then back to half()
+function torch.HalfTensor:__sub(other)
+ return (self:real() - other:real()):half()
+end
+
+function torch.HalfTensor:mean(dim)
+ return self:real():mean(dim):half()
+end
+
+function torch.HalfTensor:abs()
+ return self:real():abs():half()
+end
+
+function torch.HalfTensor:max()
+ return self:real():max()
+end
+
+function torch.HalfTensor:add(a, b)
+ return (self:real():add(a, b:real())):half()
+end
+
+function torch.HalfTensor:reshape(a, b)
+ return (self:real():reshape(a, b)):half()
+end
+
+function torch.HalfTensor:fill(a)
+ return self:real():fill(a):half()
+end
+
function torchtest.dot()
local types = {
['torch.DoubleTensor'] = 1e-8, -- for ddot
@@ -1355,6 +1384,18 @@ function torchtest.histc()
local z = torch.Tensor{ 0, 3, 0, 2, 1 }
mytester:assertTensorEq(y,z,precision,'error in torch.histc')
end
+function torchtest.bhistc()
+ local x = torch.Tensor(3, 6)
+ x[1] = torch.Tensor{ 2, 4, 2, 2, 5, 4 }
+ x[2] = torch.Tensor{ 3, 5, 1, 5, 3, 5 }
+ x[3] = torch.Tensor{ 3, 4, 2, 5, 5, 1 }
+ local y = torch.bhistc(x, 5, 1, 5) -- nbins, min, max
+ local z = torch.Tensor(3, 5)
+ z[1] = torch.Tensor{ 0, 3, 0, 2, 1 }
+ z[2] = torch.Tensor{ 1, 0, 2, 0, 3 }
+ z[3] = torch.Tensor{ 1, 1, 1, 1, 2 }
+ mytester:assertTensorEq(y,z,precision,'error in torch.bhistc in last dimension')
+end
function torchtest.ones()
local mx = torch.ones(msize,msize)
local mxx = torch.Tensor()
@@ -3103,7 +3144,13 @@ function torchtest.isTypeOfPattern()
end
function torchtest.isTensor()
- local t = torch.randn(3,4)
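+  -- Run the body with both the 'real' and 'half' conversion methods taken
+  -- from the default Tensor metatable, so each test covers HalfTensor too.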
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_isTensor(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_isTensor(func)
+ local t = func(torch.randn(3,4))
mytester:assert(torch.isTensor(t), 'error in isTensor')
mytester:assert(torch.isTensor(t[1]), 'error in isTensor for subTensor')
mytester:assert(not torch.isTensor(t[1][2]), 'false positive in isTensor')
@@ -3111,14 +3158,26 @@ function torchtest.isTensor()
end
function torchtest.isStorage()
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_isStorage(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_isStorage(func)
local t = torch.randn(3,4)
mytester:assert(torch.isStorage(t:storage()), 'error in isStorage')
mytester:assert(not torch.isStorage(t), 'false positive in isStorage')
end
function torchtest.view()
- local tensor = torch.rand(15)
- local template = torch.rand(3,5)
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_view(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_view(func)
+ local tensor = func(torch.rand(15))
+ local template = func(torch.rand(3,5))
local target = template:size():totable()
mytester:assertTableEq(tensor:viewAs(template):size():totable(), target, 'Error in viewAs')
mytester:assertTableEq(tensor:view(3,5):size():totable(), target, 'Error in view')
@@ -3129,7 +3188,7 @@ function torchtest.view()
tensor_view:fill(torch.rand(1)[1])
mytester:asserteq((tensor_view-tensor):abs():max(), 0, 'Error in view')
- local target_tensor = torch.Tensor()
+ local target_tensor = func(torch.Tensor())
mytester:assertTableEq(target_tensor:viewAs(tensor, template):size():totable(), target, 'Error in viewAs')
mytester:assertTableEq(target_tensor:view(tensor, 3,5):size():totable(), target, 'Error in view')
mytester:assertTableEq(target_tensor:view(tensor, torch.LongStorage{3,5}):size():totable(), target, 'Error in view using LongStorage')
@@ -3140,9 +3199,15 @@ function torchtest.view()
end
function torchtest.expand()
- local result = torch.Tensor()
- local tensor = torch.rand(8,1)
- local template = torch.rand(8,5)
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_expand(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_expand(func)
+ local result = func(torch.Tensor())
+ local tensor = func(torch.rand(8,1))
+ local template = func(torch.rand(8,5))
local target = template:size():totable()
mytester:assertTableEq(tensor:expandAs(template):size():totable(), target, 'Error in expandAs')
mytester:assertTableEq(tensor:expand(8,5):size():totable(), target, 'Error in expand')
@@ -3157,8 +3222,14 @@ function torchtest.expand()
end
function torchtest.repeatTensor()
- local result = torch.Tensor()
- local tensor = torch.rand(8,4)
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_repeatTensor(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_repeatTensor(func)
+ local result = func(torch.Tensor())
+ local tensor = func(torch.rand(8,4))
local size = {3,1,1}
local sizeStorage = torch.LongStorage(size)
local target = {3,8,4}
@@ -3172,10 +3243,16 @@ function torchtest.repeatTensor()
end
function torchtest.isSameSizeAs()
- local t1 = torch.Tensor(3, 4, 9, 10)
- local t2 = torch.Tensor(3, 4)
- local t3 = torch.Tensor(1, 9, 3, 3)
- local t4 = torch.Tensor(3, 4, 9, 10)
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_isSameSizeAs(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_isSameSizeAs(func)
+ local t1 = func(torch.Tensor(3, 4, 9, 10))
+ local t2 = func(torch.Tensor(3, 4))
+ local t3 = func(torch.Tensor(1, 9, 3, 3))
+ local t4 = func(torch.Tensor(3, 4, 9, 10))
mytester:assert(t1:isSameSizeAs(t2) == false, "wrong answer ")
mytester:assert(t1:isSameSizeAs(t3) == false, "wrong answer ")
@@ -3183,15 +3260,21 @@ function torchtest.isSameSizeAs()
end
function torchtest.isSetTo()
- local t1 = torch.Tensor(3, 4, 9, 10)
- local t2 = torch.Tensor(3, 4, 9, 10)
- local t3 = torch.Tensor():set(t1)
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_isSetTo(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_isSetTo(func)
+ local t1 = func(torch.Tensor(3, 4, 9, 10))
+ local t2 = func(torch.Tensor(3, 4, 9, 10))
+ local t3 = func(torch.Tensor()):set(t1)
local t4 = t3:reshape(12, 90)
mytester:assert(t1:isSetTo(t2) == false, "tensors do not share storage")
mytester:assert(t1:isSetTo(t3) == true, "tensor is set to other")
mytester:assert(t3:isSetTo(t1) == true, "isSetTo should be symmetric")
mytester:assert(t1:isSetTo(t4) == false, "tensors have different view")
- mytester:assert(not torch.Tensor():isSetTo(torch.Tensor()),
+ mytester:assert(not func(torch.Tensor()):isSetTo(func(torch.Tensor())),
"Tensors with no storages should not appear to be set " ..
"to each other")
end
@@ -3229,7 +3312,13 @@ function torchtest.equal()
end
function torchtest.isSize()
- local t1 = torch.Tensor(3, 4, 5)
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_isSize(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_isSize(func)
+ local t1 = func(torch.Tensor(3, 4, 5))
local s1 = torch.LongStorage({3, 4, 5})
local s2 = torch.LongStorage({5, 4, 3})
@@ -3246,6 +3335,7 @@ function torchtest.elementSize()
local long = torch.LongStorage():elementSize()
local float = torch.FloatStorage():elementSize()
local double = torch.DoubleStorage():elementSize()
+ local half = torch.HalfStorage():elementSize()
mytester:asserteq(byte, torch.ByteTensor():elementSize())
mytester:asserteq(char, torch.CharTensor():elementSize())
@@ -3254,6 +3344,7 @@ function torchtest.elementSize()
mytester:asserteq(long, torch.LongTensor():elementSize())
mytester:asserteq(float, torch.FloatTensor():elementSize())
mytester:asserteq(double, torch.DoubleTensor():elementSize())
+ mytester:asserteq(half, torch.HalfTensor():elementSize())
mytester:assertne(byte, 0)
mytester:assertne(char, 0)
@@ -3262,6 +3353,7 @@ function torchtest.elementSize()
mytester:assertne(long, 0)
mytester:assertne(float, 0)
mytester:assertne(double, 0)
+ mytester:assertne(half, 0)
-- These tests are portable, not necessarily strict for your system.
mytester:asserteq(byte, 1)
@@ -3272,11 +3364,18 @@ function torchtest.elementSize()
mytester:assert(long >= 4)
mytester:assert(long >= int)
mytester:assert(double >= float)
+ mytester:assert(half <= float)
end
function torchtest.split()
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_split(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_split(func)
local result = {}
- local tensor = torch.rand(7,4)
+ local tensor = func(torch.rand(7,4))
local splitSize = 3
local targetSize = {{3,4},{3,4},{1,4}}
local dim = 1
@@ -3301,8 +3400,14 @@ function torchtest.split()
end
function torchtest.chunk()
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_chunk(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_chunk(func)
local result = {}
- local tensor = torch.rand(4,7)
+ local tensor = func(torch.rand(4,7))
local nChunk = 3
local targetSize = {{4,3},{4,3},{4,1}}
local dim = 2
@@ -3322,24 +3427,34 @@ function torchtest.chunk()
end
end
-function torchtest.totable()
+function torchtest.table()
+ local convStorage = {
+ ['real'] = 'FloatStorage',
+ ['half'] = 'HalfStorage'
+ }
+  for k,v in pairs(convStorage) do
+ torchtest_totable(torch.getmetatable(torch.Tensor():type())[k], v)
+ end
+end
+
+function torchtest_totable(func, storageType)
local table0D = {}
- local tensor0D = torch.Tensor(table0D)
+ local tensor0D = func(torch.Tensor(table0D))
mytester:assertTableEq(torch.totable(tensor0D), table0D, 'tensor0D:totable incorrect')
local table1D = {1, 2, 3}
- local tensor1D = torch.Tensor(table1D)
- local storage = torch.Storage(table1D)
+ local tensor1D = func(torch.Tensor(table1D))
+ local storage = torch[storageType](table1D)
mytester:assertTableEq(tensor1D:totable(), table1D, 'tensor1D:totable incorrect')
mytester:assertTableEq(storage:totable(), table1D, 'storage:totable incorrect')
mytester:assertTableEq(torch.totable(tensor1D), table1D, 'torch.totable incorrect for Tensors')
mytester:assertTableEq(torch.totable(storage), table1D, 'torch.totable incorrect for Storages')
local table2D = {{1, 2}, {3, 4}}
- local tensor2D = torch.Tensor(table2D)
+ local tensor2D = func(torch.Tensor(table2D))
mytester:assertTableEq(tensor2D:totable(), table2D, 'tensor2D:totable incorrect')
- local tensor3D = torch.Tensor({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})
+ local tensor3D = func(torch.Tensor({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}))
local tensorNonContig = tensor3D:select(2, 2)
mytester:assert(not tensorNonContig:isContiguous(), 'invalid test')
mytester:assertTableEq(tensorNonContig:totable(), {{3, 4}, {7, 8}},
@@ -3347,6 +3462,12 @@ function torchtest.totable()
end
function torchtest.permute()
+ for k,v in ipairs({"real", "half"}) do
+ torchtest_permute(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function torchtest_permute(func)
local orig = {1,2,3,4,5,6,7}
local perm = torch.randperm(7):totable()
local x = torch.Tensor(unpack(orig)):fill(0)
diff --git a/test/test_half.lua b/test/test_half.lua
new file mode 100644
index 0000000..bf3830b
--- /dev/null
+++ b/test/test_half.lua
@@ -0,0 +1,55 @@
+local mytester
+local torchtest = torch.TestSuite()
+
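+-- Smoke tests for torch.HalfTensor: contiguity and stride bookkeeping,
+-- transpose, resize+copy, narrow vs. sub, and select+clone semantics.
+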
+-- Lua 5.2 compatibility
+local loadstring = loadstring or load
+local unpack = unpack or table.unpack
+
+function torchtest.easy()
+ local x=torch.randn(5, 6):half()
+ mytester:assert(x:isContiguous(), 'x should be contiguous')
+ mytester:assert(x:dim() == 2, 'x should have dim of 2')
+ mytester:assert(x:nDimension() == 2, 'x should have nDimension of 2')
+ mytester:assert(x:nElement() == 5 * 6, 'x should have 30 elements')
+ local stride = x:stride()
+ local expectedStride = torch.LongStorage{6,1}
+ for i=1,stride:size() do
+ mytester:assert(stride[i] == expectedStride[i], "stride is wrong")
+ end
+
+ x=x:t()
+ mytester:assert(not x:isContiguous(), 'x transpose should not be contiguous')
+ x=x:transpose(1,2)
+ mytester:assert(x:isContiguous(), 'x should be contiguous after 2 transposes')
+
+ local y=torch.HalfTensor()
+ y:resizeAs(x:t()):copy(x:t())
+ mytester:assert(x:isContiguous(), 'after resize and copy, x should be contiguous')
+ mytester:assertTensorEq(y, x:t(), 0.001, 'copy broken after resizeAs')
+ local z=torch.HalfTensor()
+ z:resize(6, 5):copy(x:t())
+  mytester:assertTensorEq(z, x:t(), 0.001, 'copy broken after resize')
+end
+
+function torchtest.narrowSub()
+ local x = torch.randn(5, 6):half()
+ local narrow = x:narrow(1, 2, 3)
+ local sub = x:sub(2, 4)
+ mytester:assertTensorEq(narrow, sub, 0.001, 'narrow not equal to sub')
+end
+
+function torchtest.selectClone()
+ local x = torch.zeros(5, 6)
+ x:select(1,2):fill(2)
+ x=x:half()
+ local y=x:clone()
+ mytester:assertTensorEq(x, y, 0.001, 'not equal after select and clone')
+ x:select(1,1):fill(3)
+ mytester:assert(y[1][1] == 0, 'clone broken')
+end
+
+torch.setheaptracking(true)
+math.randomseed(os.time())
+mytester = torch.Tester()
+mytester:add(torchtest)
+mytester:run()
diff --git a/test/test_writeObject.lua b/test/test_writeObject.lua
index 1013a96..52bcb71 100644
--- a/test/test_writeObject.lua
+++ b/test/test_writeObject.lua
@@ -4,6 +4,9 @@ local myTester = torch.Tester()
local tests = torch.TestSuite()
+function torch.HalfTensor:norm()
+ return self:real():norm()
+end
-- checks that an object can be written and unwritten
-- returns false if an error occurs
@@ -66,7 +69,13 @@ function tests.test_a_recursive_closure()
end
function tests.test_a_tensor()
- local x = torch.rand(5, 10)
+ for k,v in ipairs({"real", "half"}) do
+ tests_test_a_tensor(torch.getmetatable(torch.Tensor():type())[v])
+ end
+end
+
+function tests_test_a_tensor(func)
+ local x = func(torch.rand(5, 10))
local xcopy = serializeAndDeserialize(x)
myTester:assert(x:norm() == xcopy:norm(), 'tensors should be the same')
end
diff --git a/torchcwrap.lua b/torchcwrap.lua
index ab0df43..551bd05 100644
--- a/torchcwrap.lua
+++ b/torchcwrap.lua
@@ -202,7 +202,7 @@ types.IndexTensor = {
}
for _,typename in ipairs({"ByteTensor", "CharTensor", "ShortTensor", "IntTensor", "LongTensor",
- "FloatTensor", "DoubleTensor"}) do
+ "FloatTensor", "HalfTensor", "DoubleTensor"}) do
types[typename] = {
@@ -460,3 +460,56 @@ types.charoption = {
postcall = function(arg)
end
}
+
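+-- cwrap type handlers for ptrdiff_t and size_t: each field returns the C
+-- fragment the wrapper generator splices in to declare, check, read, and
+-- push the numeric argument.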
+for _,typename in ipairs({"ptrdiff_t", "size_t"}) do
+ types[typename] = {
+
+ helpname = function(arg)
+ return typename
+ end,
+
+ declare = function(arg)
+ -- if it is a number we initialize here
+ local default = tonumber(tostring(arg.default)) or 0
+ return string.format("%s arg%d = %g;", typename, arg.i, default)
+ end,
+
+ check = function(arg, idx)
+ return string.format("lua_isnumber(L, %d)", idx)
+ end,
+
+ read = function(arg, idx)
+ return string.format("arg%d = (%s)lua_tonumber(L, %d);", arg.i, typename, idx)
+ end,
+
+ init = function(arg)
+ -- otherwise do it here
+ if arg.default then
+ local default = tostring(arg.default)
+ if not tonumber(default) then
+ return string.format("arg%d = %s;", arg.i, default)
+ end
+ end
+ end,
+
+ carg = function(arg)
+ return string.format('arg%d', arg.i)
+ end,
+
+ creturn = function(arg)
+ return string.format('arg%d', arg.i)
+ end,
+
+ precall = function(arg)
+ if arg.returned then
+ return string.format('lua_pushnumber(L, (lua_Number)arg%d);', arg.i)
+ end
+ end,
+
+ postcall = function(arg)
+ if arg.creturned then
+ return string.format('lua_pushnumber(L, (lua_Number)arg%d);', arg.i)
+ end
+ end
+ }
+end