require "torch"
paths.require("libcutorch")

-- Give every CUDA storage/tensor class the __tostring__ of its CPU
-- counterpart. The unprefixed CudaStorage/CudaTensor pair borrows from the
-- Float classes; the Half pair is only wired up when cutorch.hasHalf is set.
do
   -- Copies __tostring__ from torch.<hostPrefix>{Storage,Tensor} onto
   -- torch.<cudaPrefix>{Storage,Tensor}, Storage first.
   local function borrowTostring(cudaPrefix, hostPrefix)
      torch[cudaPrefix .. 'Storage'].__tostring__ = torch[hostPrefix .. 'Storage'].__tostring__
      torch[cudaPrefix .. 'Tensor'].__tostring__ = torch[hostPrefix .. 'Tensor'].__tostring__
   end

   borrowTostring('CudaByte', 'Byte')
   borrowTostring('CudaChar', 'Char')
   borrowTostring('CudaShort', 'Short')
   borrowTostring('CudaInt', 'Int')
   borrowTostring('CudaLong', 'Long')
   borrowTostring('Cuda', 'Float')
   borrowTostring('CudaDouble', 'Double')
   if cutorch.hasHalf then
      borrowTostring('CudaHalf', 'Half')
   end
end

require('cutorch.Tensor')
require('cutorch.FFI')
require('cutorch.test')

-- `unpack` is a global in Lua 5.1 / LuaJIT and lives in `table` from 5.2 on.
local unpack = unpack or table.unpack

-- Packs its arguments into a table and also reports how many there were.
-- The explicit count is needed because `#` is not reliable on a table that
-- contains nil values.
local function packWithCount(...)
   return select('#', ...), {...}
end

-- Runs `closure` with `newDeviceID` selected as the current device, then
-- restores the device that was current on entry, whether or not `closure`
-- raised an error.
--
-- @param newDeviceID  device id handed to cutorch.setDevice
-- @param closure      function invoked with no arguments
-- @return every value returned by `closure`, nil values included
--
-- If `closure` raises, the error value is re-raised after the original
-- device has been restored.
function cutorch.withDevice(newDeviceID, closure)
   local curDeviceID = cutorch.getDevice()
   cutorch.setDevice(newDeviceID)

   -- vals[1] is pcall's status flag; vals[2..count] are the closure's
   -- results (or the error value when the status is false).
   local count, vals = packWithCount(pcall(closure))

   -- Restore the device before propagating either outcome.
   cutorch.setDevice(curDeviceID)

   if vals[1] then
      -- Pass the upper bound explicitly so nil results are not dropped.
      return unpack(vals, 2, count)
   end
   error(vals[2])
end

-- Turns a size specification into a torch.LongTensor.
--   * first argument nil or false (including no arguments): LongTensor{0}
--   * first argument is a torch storage: the arguments are forwarded
--     unchanged to the torch.LongTensor constructor
--   * anything else: the arguments are collected into a table and passed
--     to torch.LongTensor as its contents
local function longTensorSize(...)
   local first = ...
   if not first then
      return torch.LongTensor{0}
   end
   if torch.isStorage(...) then
      return torch.LongTensor(...)
   end
   return torch.LongTensor{...}
end

-- Element types that get a generated cutorch.createCudaHost<Type>Tensor
-- constructor. 'Half' is only included when cutorch.hasHalf is set.
local hostTypes = {'Float', 'Double', 'Int', 'Long', 'Byte'}
if cutorch.hasHalf then
   hostTypes[#hostTypes + 1] = 'Half'
end

for i = 1, #hostTypes do
   local typeName = hostTypes[i]
   local storageName = typeName .. 'Storage'
   local tensorName = typeName .. 'Tensor'

   -- Builds a torch.<Type>Tensor whose storage is allocated through
   -- cutorch.CudaHostAllocator.
   -- Accepts either a LongStorage or a sequence of numbers as the size.
   cutorch['createCudaHost' .. typeName .. 'Tensor'] = function(...)
      local dims = longTensorSize(...)
      local hostStorage = torch[storageName](cutorch.CudaHostAllocator, dims:prod())
      -- Storage offset 1, with the requested sizes applied to the tensor.
      return torch[tensorName](hostStorage, 1, dims:storage())
   end
end

-- The untyped name is an alias for the Float variant.
cutorch.createCudaHostTensor = cutorch.createCudaHostFloatTensor

-- Allocates a torch.FloatStorage through cutorch.CudaUVAAllocator and returns
-- a torch.FloatTensor built on top of it.
-- Accepts either a LongStorage or a sequence of numbers.
-- NOTE(review): the tensor is constructed from the storage alone, so the
-- requested sizes are only used for the element count (their product) --
-- confirm whether the shape was meant to be applied as well.
local function _createUVATensor(...)
   local dims = longTensorSize(...)

   -- Why the explicit synchronize below: see CUDA_C_Programming_guide.pdf.
   --
   -- Section J:
   -- "It is worth a comment on the synchronization between host and device.
   -- Notice how in the non-managed example, the synchronous cudaMemcpy()
   -- routine is used both to synchronize the kernel (that is, to wait for it
   -- to finish running), and to transfer the data to the host. The Unified
   -- Memory examples do not call cudaMemcpy() and so require an explicit
   -- cudaDeviceSynchronize() before the host program can safely use the
   -- output from the GPU."
   --
   -- Section J.2.2.1:
   -- "Note that if memory is dynamically allocated with cudaMallocManaged()
   -- or cuMemAllocManaged() while the GPU is active, the behavior of the
   -- memory is unspecified until additional work is launched or the GPU is
   -- synchronized. Attempting to access the memory on the CPU during this
   -- time may or may not cause a segmentation fault."
   cutorch.synchronize()

   local numel = dims:prod()
   local uvaStorage = torch.FloatStorage(cutorch.CudaUVAAllocator, numel)
   return torch.FloatTensor(uvaStorage)
end

-- Public entry point for UVA-backed FloatTensors: forwards every argument to
-- _createUVATensor and returns whatever it returns.
cutorch.createFloatUVATensor = function(...)
   return _createUVATensor(...)
end

-- Creates a CudaTensor whose storage points at memory obtained through the
-- CudaUVAAllocator.
-- Accepts either a LongStorage or a sequence of numbers.
-- Allocation and synchronization are delegated to _createUVATensor, which
-- yields a FloatTensor; its storage size and raw data pointer are then used
-- to construct the CudaStorage.
-- NOTE(review): nothing here keeps the intermediate FloatTensor referenced
-- after the call returns -- confirm that the memory stays valid for the
-- lifetime of the returned CudaTensor.
function cutorch.createCudaUVATensor(...)
   local floatView = _createUVATensor(...)
   local floatStorage = floatView:storage()
   local cudaStorage = torch.CudaStorage(
      floatStorage:size(),
      tonumber(torch.data(floatStorage, true))
   )
   return torch.CudaTensor(cudaStorage)
end

-- UVA storage is a single memory location backed by virtual addressing, so
-- converting between the CPU and GPU tensor types is done by passing the raw
-- data pointer rather than by copying.
-- Only the Float <-> Cuda pair is supported at the moment.
--
-- Returns a torch.FloatTensor over a FloatStorage that reuses the size and
-- data pointer of `t`'s storage.
-- Raises an error if `t` is not a tensor, and asserts that the resulting
-- storage is managed memory according to cutorch.isManaged.
function cutorch.toFloatUVATensor(t)
   if torch.isTensor(t) then
      local source = t:storage()
      local floatStorage = torch.FloatStorage(
         source:size(),
         tonumber(torch.data(source, true))
      )
      assert(cutorch.isManaged(floatStorage))
      return torch.FloatTensor(floatStorage)
   end
   error('Must use a tensor, got ' .. torch.type(t))
end

-- Returns a torch.CudaTensor over a CudaStorage that reuses the size and raw
-- data pointer of `t`'s storage (no copy is made here).
-- Raises an error if `t` is not a tensor, and asserts that the resulting
-- storage is managed memory according to cutorch.isManaged.
function cutorch.toCudaUVATensor(t)
   if torch.isTensor(t) then
      local source = t:storage()
      local cudaStorage = torch.CudaStorage(
         source:size(),
         tonumber(torch.data(source, true))
      )
      assert(cutorch.isManaged(cudaStorage))
      return torch.CudaTensor(cudaStorage)
   end
   error('Must use a tensor, got ' .. torch.type(t))
end

-- Reports whether the data pointer of a tensor or storage is managed memory,
-- as decided by cutorch.isManagedPtr.
-- Raises an error when `t` is neither a tensor nor a storage.
function cutorch.isManaged(t)
   local accepted = torch.isTensor(t) or torch.isStorage(t)
   if not accepted then
      error('Usage: cutorch.isManaged(Tensor|Storage), got ' .. torch.type(t))
   end
   local address = tonumber(torch.data(t, true))
   return cutorch.isManagedPtr(address)
end

-- Heap tracking is switched on as soon as this module is loaded.
-- Remove this line to disable automatic cutorch heap-tracking
-- for garbage collection.
cutorch.setHeapTracking(true)


-- Prepares the two-element state table consumed by torch.multinomialAlias.
-- When `state` is already a table, its two slots are passed to
-- torch.multinomialAliasSetup_ and overwritten with the two results, so the
-- same table is returned. Otherwise a fresh table is created and filled from
-- torch.multinomialAliasSetup_(probs).
function torch.multinomialAliasSetup(probs, state)
   if torch.type(state) ~= 'table' then
      state = {}
      state[1], state[2] = torch.multinomialAliasSetup_(probs)
      return state
   end
   state[1], state[2] = torch.multinomialAliasSetup_(probs, state[1], state[2])
   return state
end

-- Calls torch.CudaTensor.multinomialAlias_ with `output` and the two entries
-- of `state` (as produced by torch.multinomialAliasSetup), then returns
-- `output`.
function torch.multinomialAlias(output, state)
   local drawInto = torch.CudaTensor.multinomialAlias_
   drawInto(output, state[1], state[2])
   return output
end
-- Module result: the cutorch table.
return cutorch