#include "utils.h" #include "luaT.h" #include "THCGeneral.h" #include "THCCachingAllocator.h" #include "THCCachingHostAllocator.h" #include "THCSleep.h" #include "THCTensorRandom.h" #include "THCHalf.h" // for CUDA_HALF_TENSOR extern void cutorch_CudaByteStorage_init(lua_State* L); extern void cutorch_CudaCharStorage_init(lua_State* L); extern void cutorch_CudaShortStorage_init(lua_State* L); extern void cutorch_CudaIntStorage_init(lua_State* L); extern void cutorch_CudaLongStorage_init(lua_State* L); extern void cutorch_CudaStorage_init(lua_State* L); extern void cutorch_CudaDoubleStorage_init(lua_State* L); #ifdef CUDA_HALF_TENSOR extern void cutorch_CudaHalfStorage_init(lua_State* L); #else extern void cutorch_HalfStorageCopy_init(lua_State *L); #endif extern void cutorch_CudaByteTensor_init(lua_State* L); extern void cutorch_CudaCharTensor_init(lua_State* L); extern void cutorch_CudaShortTensor_init(lua_State* L); extern void cutorch_CudaIntTensor_init(lua_State* L); extern void cutorch_CudaLongTensor_init(lua_State* L); extern void cutorch_CudaTensor_init(lua_State* L); extern void cutorch_CudaDoubleTensor_init(lua_State* L); #ifdef CUDA_HALF_TENSOR extern void cutorch_CudaHalfTensor_init(lua_State* L); #else extern void cutorch_HalfTensorCopy_init(lua_State *L); #endif extern void cutorch_CudaByteTensorOperator_init(lua_State* L); extern void cutorch_CudaCharTensorOperator_init(lua_State* L); extern void cutorch_CudaShortTensorOperator_init(lua_State* L); extern void cutorch_CudaIntTensorOperator_init(lua_State* L); extern void cutorch_CudaLongTensorOperator_init(lua_State* L); extern void cutorch_CudaTensorOperator_init(lua_State* L); extern void cutorch_CudaDoubleTensorOperator_init(lua_State* L); #ifdef CUDA_HALF_TENSOR extern void cutorch_CudaHalfTensorOperator_init(lua_State* L); #endif extern void cutorch_CudaByteTensorMath_init(lua_State* L); extern void cutorch_CudaCharTensorMath_init(lua_State* L); extern void cutorch_CudaShortTensorMath_init(lua_State* L); extern void cutorch_CudaIntTensorMath_init(lua_State* L); extern void cutorch_CudaLongTensorMath_init(lua_State* L); extern void cutorch_CudaTensorMath_init(lua_State* L); extern void cutorch_CudaDoubleTensorMath_init(lua_State* L); #ifdef CUDA_HALF_TENSOR extern void cutorch_CudaHalfTensorMath_init(lua_State* L); #endif /* Iteration utilities for lists of streams and lists of gpus with streams */ int checkAndCountListOfStreams(lua_State *L, THCState *state, int arg, int device) { if (!lua_istable(L, arg)) { THError("expecting array of device streams"); } /* Push table to top */ lua_pushvalue(L, arg); /* Check that all values in the table are numeric and in bounds */ int streams = 0; lua_pushnil(L); while (lua_next(L, -2)) { if (!lua_isnumber(L, -2)) { THError("expected array of streams, not table"); } if (!lua_isnumber(L, -1)) { THError("array of stream ids must contain numeric ids"); } int streamId = (int) lua_tonumber(L, -1); /* This will error out if the stream is not in bounds */ THCState_getDeviceStream(state, device, streamId); ++streams; lua_pop(L, 1); } /* Pop table from top */ lua_pop(L, 1); return streams; } void checkAndCountListOfGPUStreamPairs(lua_State *L, THCState *state, int arg, int* gpus, int* streams) { if (!lua_istable(L, arg)) { THError("expecting table of gpu={streams...}"); } /* Push table to top */ lua_pushvalue(L, arg); /* Check that all values in the table are tables of numeric and in bounds */ *gpus = 0; *streams = 0; lua_pushnil(L); while (lua_next(L, -2)) { /* -2 is key (device), -1 is value, 
in the form device={streams...} */ if (!lua_isnumber(L, -2) || !lua_istable(L, -1)) { THError("expecting table of gpu={streams...}"); } int device = (int) lua_tonumber(L, -2) - 1; /* Verify device is in range */ if (device < 0 || device >= THCState_getNumDevices(state)) { THError("%d is not a device", device + 1); } /* Verify that the list is a list of streams */ *streams += checkAndCountListOfStreams(L, state, -1, device); ++(*gpus); lua_pop(L, 1); } /* Pop table from top */ lua_pop(L, 1); } int createSingleDeviceEvents(lua_State *L, THCState *state, int arg, int device, cudaEvent_t* event) { /* Push table to top */ lua_pushvalue(L, arg); /* Record events */ lua_pushnil(L); int i = 0; while (lua_next(L, -2)) { int streamId = (int) lua_tonumber(L, -1); cudaStream_t streamWaitingOn = THCState_getDeviceStream(state, device, streamId); THCudaCheck(cudaEventCreateWithFlags(&event[i], cudaEventDisableTiming)); THCudaCheck(cudaEventRecord(event[i], streamWaitingOn)); lua_pop(L, 1); i++; } /* Pop table from top */ lua_pop(L, 1); return i; } void createMultiDeviceEvents(lua_State *L, THCState *state, int arg, cudaEvent_t* events) { /* Push {gpu={streams...}} table */ lua_pushvalue(L, arg); /* Create and record events per each GPU */ int gpu = 0; lua_pushnil(L); while (lua_next(L, -2)) { int device = (int) lua_tonumber(L, -2) - 1; THCudaCheck(cudaSetDevice(device)); events += createSingleDeviceEvents(L, state, -1, device, events); ++gpu; lua_pop(L, 1); } /* Pop {gpu={streams...}} table */ lua_pop(L, 1); } void waitSingleDeviceEvents(lua_State *L, THCState *state, int arg, int device, cudaEvent_t * event, int numEvents) { /* Push table to top */ lua_pushvalue(L, arg); /* Then, wait on the events. Each stream is actually waiting on itself here too, but that's harmless and isn't worth weeding out. */ lua_pushnil(L); while (lua_next(L, -2)) { int streamId = (int) lua_tonumber(L, -1); cudaStream_t stream = THCState_getDeviceStream(state, device, streamId); for (int i = 0; i < numEvents; i++) { THCudaCheck(cudaStreamWaitEvent(stream, event[i], 0)); } lua_pop(L, 1); } /* Pop table from top */ lua_pop(L, 1); } void waitMultiDeviceEvents(lua_State *L, THCState *state, int arg, cudaEvent_t* events, int streams) { /* Push {gpu={streams...}} table */ lua_pushvalue(L, arg); /* Then, wait on the events. Each stream is actually waiting on itself here too, but that's harmless and isn't worth weeding out. 
*/ lua_pushnil(L); while (lua_next(L, -2)) { int device = (int) lua_tonumber(L, -2) - 1; THCudaCheck(cudaSetDevice(device)); /* Push stream table */ lua_pushvalue(L, -1); lua_pushnil(L); while (lua_next(L, -2)) { int streamId = (int) lua_tonumber(L, -1); cudaStream_t stream = THCState_getDeviceStream(state, device, streamId); /* Each stream waits on all events */ for (int i = 0; i < streams; ++i) { THCudaCheck(cudaStreamWaitEvent(stream, events[i], 0)); } lua_pop(L, 1); } /* Pop stream table and GPU entry */ lua_pop(L, 2); } /* Pop {gpu={streams...}} table */ lua_pop(L, 1); } /* Synchronizes the host with respect to the current device */ static int cutorch_synchronize(lua_State *L) { THCudaCheck(cudaDeviceSynchronize()); return 0; } /* Synchronizes the host with respect to all devices */ static int cutorch_synchronizeAll(lua_State *L) { int prevDev = -1; THCudaCheck(cudaGetDevice(&prevDev)); int devices = -1; THCudaCheck(cudaGetDeviceCount(&devices)); for (int i = 0; i < devices; ++i) { THCudaCheck(cudaSetDevice(i)); THCudaCheck(cudaDeviceSynchronize()); } THCudaCheck(cudaSetDevice(prevDev)); return 0; } /* Usage: cutorch.reserveStreams(n) Allocates n user streams for every device present. If fewer than n streams are currently allocated, an additional number will be added. If more than n streams are currently allocated, does nothing. The default CUDA stream is assumed to be stream 0 and is always present; the allocated streams are user streams on top of the CUDA streams (thus, reserveStreams(1) will create 1 user stream with two being available, the default stream 0 and the user stream 1, on each device). */ static int cutorch_reserveStreams(lua_State *L) { THCState *state = cutorch_getstate(L); int numStreams = (int) luaL_checknumber(L, 1); int nonBlocking = lua_toboolean(L, 2); THCState_reserveStreams(state, numStreams, nonBlocking); return 0; } /* Usage: cutorch.reserveBlasHandles(n) Allocates n blasHandles for every device present. If fewer than n blasHandles are currently allocated, an additional number will be added. If more than n blasHandles are currently allocated, does nothing. Unlike for streams, there is no default blasHandle. */ static int cutorch_reserveBlasHandles(lua_State *L) { THCState *state = cutorch_getstate(L); int numHandles = (int) luaL_checknumber(L, 1); THCState_reserveBlasHandles(state, numHandles); return 0; } /* Usage: n = cutorch.getNumStreams() Returns the number of user streams allocated for every device present. By default, is 0. */ static int cutorch_getNumStreams(lua_State *L) { THCState *state = cutorch_getstate(L); lua_pushnumber(L, THCState_getNumStreams(state)); return 1; } /* Usage: n = cutorch.getNumBlasHandles() Returns the number of user blasHandles allocated for every device present. By default, is 1. */ static int cutorch_getNumBlasHandles(lua_State *L) { THCState *state = cutorch_getstate(L); lua_pushnumber(L, THCState_getNumBlasHandles(state)); return 1; } /* Usage: cutorch.setStream(n) For all devices, sets the current user stream in use to the index specified. e.g., --- cutorch.setDevice(1) cutorch.setStream(3) -- device 1 stream 3 in use here cutorch.setDevice(2) -- device 2 stream 3 in use here --- 0 is the default stream on the device. */ static int cutorch_setStream(lua_State *L) { THCState *state = cutorch_getstate(L); int stream = (int) luaL_checknumber(L, 1); THCState_setCurrentStreamIndex(state, stream); return 0; } /* Usage: cutorch.setBlasHandle(n) For all devices, sets the current blasHandle in use to the index specified. 
   e.g.,
   ---
   cutorch.setDevice(1)
   cutorch.setBlasHandle(3)
   -- device 1 blasHandle 3 in use here
   cutorch.setDevice(2)
   -- device 2 blasHandle 3 in use here
   ---
*/
static int cutorch_setBlasHandle(lua_State *L) {
  THCState *state = cutorch_getstate(L);
  int handle = (int) luaL_checknumber(L, 1);
  THCState_setCurrentBlasHandleIndex(state, handle);
  return 0;
}

/*
   Usage:
   n = cutorch.getStream()
   Returns the current user stream for all devices in use (as previously set
   via cutorch.setStream(n)). 0 is the default stream on the device and is its
   initial value.
*/
static int cutorch_getStream(lua_State *L) {
  THCState *state = cutorch_getstate(L);
  lua_pushnumber(L, THCState_getCurrentStreamIndex(state));
  return 1;
}

/*
   Usage:
   n = cutorch.getBlasHandle()
   Returns the current blasHandle for all devices in use (as previously set
   via cutorch.setBlasHandle(n)).
*/
static int cutorch_getBlasHandle(lua_State *L) {
  THCState *state = cutorch_getstate(L);
  lua_pushnumber(L, THCState_getCurrentBlasHandleIndex(state));
  return 1;
}

/*
   Usage:
   cutorch.setDefaultStream()
   Equivalent to cutorch.setStream(0).
*/
static int cutorch_setDefaultStream(lua_State *L) {
  THCState *state = cutorch_getstate(L);
  THCState_setStream(state, NULL);
  return 0;
}

/*
   Usage:
   cutorch.streamWaitFor(waiterStream, {waitForStream1, ..., waitForStreamN})
   for streams on the current device. Creates a one-way barrier where
   waiterStream waits for waitForStream1-N to reach the current point.
*/
static int cutorch_streamWaitFor(lua_State *L) {
  THCState *state = cutorch_getstate(L);

  int curDev = -1;
  THCudaCheck(cudaGetDevice(&curDev));

  /* Check that the waiting stream is in bounds; this will error out if not */
  int waitingId = (int) luaL_checknumber(L, 1);
  cudaStream_t streamWaiting =
    THCState_getDeviceStream(state, curDev, waitingId);

  /* Validate the streams that we are waiting on */
  int streams = checkAndCountListOfStreams(L, state, 2, curDev);

  if (streams < 1) {
    /* nothing to synchronize */
    return 0;
  }

  /* One-way dependency; streamWaiting will wait for the list of streams to
     wait on to complete execution of pending scheduled kernels/events */
  cudaEvent_t * events = (cudaEvent_t*)malloc(sizeof(cudaEvent_t) * streams);
  createSingleDeviceEvents(L, state, 2, curDev, events);

  /* Then, wait on them */
  for (int i = 0; i < streams; i++) {
    THCudaCheck(cudaStreamWaitEvent(streamWaiting, events[i], 0));
    THCudaCheck(cudaEventDestroy(events[i]));
  }

  free(events);
  return 0;
}

/*
   Usage:
   cutorch.streamWaitForMultiDevice(gpuWaiter, streamWaiter,
                                    {[gpu1]={stream1_1, ..., stream1_N},
                                     [gpuK]={streamK_1, ..., streamK_M}})
   with a specified GPU for each list of streams. Stream (gpuWaiter, streamWaiter)
   will wait on all of the other streams (gpu1, stream1_1), ..., (gpu1, stream1_N),
   ..., (gpuK, streamK_1), ..., (gpuK, streamK_M) to complete fully, as a one-way
   barrier only (only streamWaiter is blocked). The streams to wait on are
   bucketed per device. Equivalent to streamWaitFor() if only one GPU's streams
   are listed.
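   An illustrative sketch (the device and stream indices below are made up, and
   it assumes user streams were already reserved with cutorch.reserveStreams):
   make stream 1 on device 1 wait for streams 1 and 2 on device 2:
   ---
   cutorch.reserveStreams(2)
   cutorch.streamWaitForMultiDevice(1, 1, {[2]={1, 2}})
   ---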
*/ static int cutorch_streamWaitForMultiDevice(lua_State *L) { THCState *state = cutorch_getstate(L); int prevDev = -1; THCudaCheck(cudaGetDevice(&prevDev)); /* Validate waiting (gpu, stream); this will error out if not */ int gpuWaiter = (int) luaL_checknumber(L, 1) - 1; int streamWaiter = (int) luaL_checknumber(L, 2); cudaStream_t streamWaiting = THCState_getDeviceStream(state, gpuWaiter, streamWaiter); /* Validate and count set of {gpu={streams...}} we are waiting on */ int gpus = 0; int streams = 0; checkAndCountListOfGPUStreamPairs(L, state, 3, &gpus, &streams); if (streams < 1) { /* nothing to synchronize together */ return 0; } /* Events can only be recorded on the same device on which they are created. -For each GPU, create and record event per each stream given for that GPU. -For (gpuWaiter, streamWaiter), wait on all of the above events. */ cudaEvent_t* events = (cudaEvent_t*) malloc(sizeof(cudaEvent_t) * streams); /* First, create an event per GPU and record events for the specified stream on that GPU */ createMultiDeviceEvents(L, state, 3, events); /* Then, wait on the events */ THCudaCheck(cudaSetDevice(gpuWaiter)); for (int i = 0; i < streams; ++i) { THCudaCheck(cudaStreamWaitEvent(streamWaiting, events[i], 0)); } /* Clean up events */ for (int i = 0; i < streams; ++i) { THCudaCheck(cudaEventDestroy(events[i])); } free(events); THCudaCheck(cudaSetDevice(prevDev)); return 0; } /* Usage: cutorch.streamBarrier({stream1, stream2, ..., streamN}) applies to streams for the current device. Creates a N-way barrier to synchronize all of the streams given */ static int cutorch_streamBarrier(lua_State *L) { THCState *state = cutorch_getstate(L); int curDev = -1; THCudaCheck(cudaGetDevice(&curDev)); int streams = checkAndCountListOfStreams(L, state, 1, curDev); if (streams < 2) { /* nothing to synchronize together */ return 0; } /* Multi-way dependency (barrier); all streams must complete execution of pending scheduled kernels/events */ cudaEvent_t * events = (cudaEvent_t*)malloc(sizeof(cudaEvent_t) * streams); /* First, create an event and record them for all streams */ int eventsCreated = createSingleDeviceEvents(L, state, 1, curDev, events); /* Then, wait on the event. Each stream is actually waiting on itself here too, but that's harmless and isn't worth weeding out. */ waitSingleDeviceEvents(L, state, 1, curDev, events, eventsCreated); for (int i = 0; i < eventsCreated; i++) THCudaCheck(cudaEventDestroy(events[i])); free(events); return 0; } /* usage: cutorch.streamBarrierMultiDevice({[gpu1]={stream1_1, ..., stream1_N}, [gpuK]={streamK_1, ..., streamK_M}}) with a specified GPU per each list of streams. Each stream (gpu1, stream1_1), ..., (gpu1, stream1_N), ..., (gpuK, streamK_1), ..., (gpuK, streamK_M) will wait for all others to complete fully. Streams are bucketed per device. Equivalent to streamBarrier() if only one GPU is specified. */ static int cutorch_streamBarrierMultiDevice(lua_State *L) { THCState *state = cutorch_getstate(L); int prevDev = -1; THCudaCheck(cudaGetDevice(&prevDev)); /* Validate and count set of {gpu={streams...}} that are mutually waiting */ int gpus = 0; int streams = 0; checkAndCountListOfGPUStreamPairs(L, state, 1, &gpus, &streams); if (streams < 2) { /* nothing to synchronize together */ return 0; } /* Events can only be recorded on the same device on which they are created. -For each GPU, create an event, and record that event on each stream given for that GPU. -For each GPU, for each stream, wait on the event created by each other GPU. 
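     As an illustration only (hypothetical indices, assuming the streams were
     reserved beforehand), a call such as
       cutorch.streamBarrierMultiDevice({[1]={1}, [2]={1, 2}})
     makes device 1/stream 1 and device 2/streams 1 and 2 all wait for each
     other before proceeding.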
*/
  cudaEvent_t* events = (cudaEvent_t*) malloc(sizeof(cudaEvent_t) * streams);

  /* First, create an event per GPU and record events for the specified stream
     on that GPU */
  createMultiDeviceEvents(L, state, 1, events);

  /* Then, wait on the events. Each stream is actually waiting on itself here
     too, but that's harmless and isn't worth weeding out. */
  waitMultiDeviceEvents(L, state, 1, events, streams);

  /* Clean up events */
  for (int i = 0; i < streams; ++i) {
    THCudaCheck(cudaEventDestroy(events[i]));
  }
  free(events);

  THCudaCheck(cudaSetDevice(prevDev));
  return 0;
}

/*
   Usage:
   cutorch.streamSynchronize(n)
   For the current device, synchronizes with the given stream only
   (cudaStreamSynchronize). 0 is the default stream on the device.
*/
static int cutorch_streamSynchronize(lua_State *L) {
  THCState *state = cutorch_getstate(L);
  int streamId = (int) luaL_checknumber(L, 1);

  int curDev = -1;
  THCudaCheck(cudaGetDevice(&curDev));

  /* This also validates the stream */
  cudaStream_t stream = THCState_getDeviceStream(state, curDev, streamId);
  THCudaCheck(cudaStreamSynchronize(stream));

  return 0;
}

static int cutorch_getDevice(lua_State *L) {
  int device;
  THCudaCheck(cudaGetDevice(&device));
  device++;
  lua_pushnumber(L, device);
  return 1;
}

static int cutorch_deviceReset(lua_State *L) {
  printf("WARNING: cutorch.deviceReset has been deprecated."
         " Just remove the call from your code.\n");
  return 0;
}

static int cutorch_getDeviceCount(lua_State *L) {
  int ndevice;
  THCudaCheck(cudaGetDeviceCount(&ndevice));
  lua_pushnumber(L, ndevice);
  return 1;
}

static int cutorch_getPeerToPeerAccess(lua_State *L) {
  THCState *state = cutorch_getstate(L);
  int dev = (int) luaL_checknumber(L, 1) - 1;
  int devToAccess = (int) luaL_checknumber(L, 2) - 1;

  /* device bounds checking is performed within */
  int enabled = THCState_getPeerToPeerAccess(state, dev, devToAccess);
  lua_pushboolean(L, enabled);

  return 1;
}

static int cutorch_setPeerToPeerAccess(lua_State *L) {
  THCState *state = cutorch_getstate(L);
  int dev = (int) luaL_checknumber(L, 1) - 1;
  int devToAccess = (int) luaL_checknumber(L, 2) - 1;
  int enable = lua_toboolean(L, 3);

  /* device bounds checking is performed within */
  THCState_setPeerToPeerAccess(state, dev, devToAccess, enable);

  return 0;
}

static int cutorch_getKernelPeerToPeerAccess(lua_State *L) {
  THCState *state = cutorch_getstate(L);
  lua_pushboolean(L, THCState_getKernelPeerToPeerAccessEnabled(state));
  return 1;
}

static int cutorch_setKernelPeerToPeerAccess(lua_State *L) {
  THCState *state = cutorch_getstate(L);
  int val = lua_toboolean(L, -1);
  THCState_setKernelPeerToPeerAccessEnabled(state, val);
  return 0;
}

static int cutorch_isCachingAllocatorEnabled(lua_State *L) {
  THCState *state = cutorch_getstate(L);
  lua_pushboolean(L, THCState_isCachingAllocatorEnabled(state));
  return 1;
}

static int cutorch_getMemoryUsage(lua_State *L) {
  size_t freeBytes = 0;
  size_t totalBytes = 0;
  int curDevice;
  THCudaCheck(cudaGetDevice(&curDevice));

  THCState *state = cutorch_getstate(L);
  int device = luaL_optint(L, 1, -10);

  if (device == -10) { /* no argument passed, current device mem usage */
    THCudaCheck(THCudaMemGetInfo(state, &freeBytes, &totalBytes));
  } else { /* argument was given, particular device's memory usage */
    THCudaCheck(cudaSetDevice(device-1)); /* zero indexed */
    THCudaCheck(THCudaMemGetInfo(state, &freeBytes, &totalBytes));
    THCudaCheck(cudaSetDevice(curDevice));
  }

  lua_pushnumber(L, freeBytes);
  lua_pushnumber(L, totalBytes);
  return 2;
}

static int cutorch_setDevice(lua_State *L) {
  THCState *state = cutorch_getstate(L);
  int
device = (int)luaL_checknumber(L, 1)-1; THCudaCheck(cudaSetDevice(device)); return 0; } #define SET_DEVN_PROP(NAME) \ lua_pushnumber(L, prop.NAME); \ lua_setfield(L, -2, #NAME); static int cutorch_getDeviceProperties(lua_State *L) { int device = (int)luaL_checknumber(L, 1)-1; // switch context to given device so the call to cudaMemGetInfo is for the correct device int oldDevice; THCudaCheck(cudaGetDevice(&oldDevice)); THCudaCheck(cudaSetDevice(device)); struct cudaDeviceProp prop; THCudaCheck(cudaGetDeviceProperties(&prop, device)); lua_newtable(L); SET_DEVN_PROP(canMapHostMemory); SET_DEVN_PROP(clockRate); SET_DEVN_PROP(computeMode); SET_DEVN_PROP(deviceOverlap); SET_DEVN_PROP(integrated); SET_DEVN_PROP(kernelExecTimeoutEnabled); SET_DEVN_PROP(major); SET_DEVN_PROP(maxThreadsPerBlock); SET_DEVN_PROP(memPitch); SET_DEVN_PROP(minor); SET_DEVN_PROP(multiProcessorCount); SET_DEVN_PROP(regsPerBlock); SET_DEVN_PROP(sharedMemPerBlock); SET_DEVN_PROP(textureAlignment); SET_DEVN_PROP(totalConstMem); SET_DEVN_PROP(totalGlobalMem); SET_DEVN_PROP(warpSize); SET_DEVN_PROP(pciBusID); SET_DEVN_PROP(pciDeviceID); SET_DEVN_PROP(pciDomainID); SET_DEVN_PROP(maxTexture1D); SET_DEVN_PROP(maxTexture1DLinear); size_t freeMem; THCudaCheck(cudaMemGetInfo (&freeMem, NULL)); lua_pushnumber(L, freeMem); lua_setfield(L, -2, "freeGlobalMem"); lua_pushstring(L, prop.name); lua_setfield(L, -2, "name"); // restore context THCudaCheck(cudaSetDevice(oldDevice)); return 1; } static int cutorch_getRuntimeVersion(lua_State *L) { int version; THCudaCheck(cudaRuntimeGetVersion(&version)); lua_pushnumber(L, version); return 1; } static int cutorch_getDriverVersion(lua_State *L) { int version; THCudaCheck(cudaDriverGetVersion(&version)); lua_pushnumber(L, version); return 1; } static int cutorch_seed(lua_State *L) { unsigned long long seed = THCRandom_seed(cutorch_getstate(L)); lua_pushnumber(L, seed); return 1; } static int cutorch_seedAll(lua_State *L) { unsigned long long seed = THCRandom_seedAll(cutorch_getstate(L)); lua_pushnumber(L, seed); return 1; } static int cutorch_initialSeed(lua_State *L) { unsigned long long seed = THCRandom_initialSeed(cutorch_getstate(L)); lua_pushnumber(L, seed); return 1; } static int cutorch_manualSeed(lua_State *L) { unsigned long long seed = luaL_checknumber(L, 1); THCRandom_manualSeed(cutorch_getstate(L), seed); return 0; } static int cutorch_manualSeedAll(lua_State* L) { unsigned long long seed = luaL_checknumber(L, 1); THCRandom_manualSeedAll(cutorch_getstate(L), seed); return 0; } static int cutorch_getRNGState(lua_State *L) { THByteTensor* t = THByteTensor_new(); THCRandom_getRNGState(cutorch_getstate(L), t); luaT_pushudata(L, t, "torch.ByteTensor"); return 1; } static int cutorch_setRNGState(lua_State *L) { THByteTensor* t = luaT_checkudata(L, 1, "torch.ByteTensor"); THCRandom_setRNGState(cutorch_getstate(L), t); return 0; } static int cutorch_getState(lua_State *L) { lua_getglobal(L, "cutorch"); lua_getfield(L, -1, "_state"); lua_remove(L, -2); return 1; } static int cutorch_Event_new(lua_State *L) { cudaEvent_t *event = luaT_alloc(L, sizeof(cudaEvent_t)); THCudaCheck(cudaEventCreate(event)); THCState *state = cutorch_getstate(L); THCudaCheck(cudaEventRecord(*event, THCState_getCurrentStream(state))); luaT_pushudata(L, event, "cutorch.Event"); return 1; } static int cutorch_Event_free(lua_State *L) { cudaEvent_t *event = luaT_checkudata(L, 1, "cutorch.Event"); THCudaCheck(cudaEventDestroy(*event)); luaT_free(L, event); return 0; } static int cutorch_Event_waitOn(lua_State *L) { 
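  /* Makes the current stream wait on the given event. A rough Lua usage
     sketch; the cutorch.Event() constructor is assumed to follow the
     luaT_newmetatable registration in cutorch_Event_init, and the stream
     indices are illustrative:
     ---
     cutorch.setStream(1)
     local e = cutorch.Event()   -- records an event on the current stream
     cutorch.setStream(2)
     e:waitOn()                  -- stream 2 now waits for that event
     ---
  */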
cudaEvent_t *event = luaT_checkudata(L, 1, "cutorch.Event"); THCState *state = cutorch_getstate(L); THCudaCheck(cudaStreamWaitEvent(THCState_getCurrentStream(state), *event, 0)); return 0; } static const struct luaL_Reg cutorch_Event__[] = { {"waitOn", cutorch_Event_waitOn}, {NULL, NULL} }; static void cutorch_Event_init(lua_State *L) { luaT_newmetatable(L, "cutorch.Event", NULL, cutorch_Event_new, cutorch_Event_free, NULL); luaT_setfuncs(L, cutorch_Event__, 0); lua_pop(L, 1); } static void luaCutorchGCFunction(void *data) { lua_State *L = data; lua_gc(L, LUA_GCCOLLECT, 0); } static int cutorch_setHeapTracking(lua_State *L) { THCState *state = cutorch_getstate(L); int enabled = luaT_checkboolean(L,1); if(enabled) { THCSetGCHandler(state, luaCutorchGCFunction, L); } else { THCSetGCHandler(state, NULL, NULL); } return 0; } static int cutorch_isManagedPtr(lua_State *L) { THCState *state = cutorch_getstate(L); if(lua_type(L, 1) != LUA_TNUMBER) { THError("Must receive a ptr cast as a number"); } void* ptr = (void* )luaL_optinteger(L, 1, 0); struct cudaPointerAttributes attributes; cudaError_t res = cudaPointerGetAttributes(&attributes, ptr); if (res == cudaErrorInvalidValue) { lua_pushboolean(L, 0); } else { THCudaCheck(res); lua_pushboolean(L, attributes.isManaged); } return 1; } static int cutorch_shutdown(lua_State *L) { THCState **state = (THCState **) lua_topointer(L, 1); THCudaShutdown(*state); THCState_free(*state); return 0; } static int cutorch_hasHalfInstructions(lua_State *L) { THCState *state = cutorch_getstate(L); #ifdef CUDA_HALF_TENSOR lua_pushboolean(L, THC_nativeHalfInstructions(state)); #else lua_pushboolean(L, 0); #endif return 1; } static int cutorch_hasFastHalfInstructions(lua_State *L) { THCState *state = cutorch_getstate(L); #ifdef CUDA_HALF_TENSOR lua_pushboolean(L, THC_fastHalfInstructions(state)); #else lua_pushboolean(L, 0); #endif return 1; } static int cutorch_sleep(lua_State *L) { THCState *state = cutorch_getstate(L); if (!luaT_checklong(L, 1)) { THError("expected number 'cycles'"); } THC_sleep(state, luaT_tolong(L, 1)); return 0; } static const struct luaL_Reg cutorch_stuff__ [] = { {"synchronize", cutorch_synchronize}, {"synchronizeAll", cutorch_synchronizeAll}, {"reserveBlasHandles", cutorch_reserveBlasHandles}, {"getNumBlasHandles", cutorch_getNumBlasHandles}, {"setBlasHandle", cutorch_setBlasHandle}, {"getBlasHandle", cutorch_getBlasHandle}, {"reserveStreams", cutorch_reserveStreams}, {"getNumStreams", cutorch_getNumStreams}, {"setStream", cutorch_setStream}, {"getStream", cutorch_getStream}, {"setDefaultStream", cutorch_setDefaultStream}, {"streamWaitFor", cutorch_streamWaitFor}, {"streamWaitForMultiDevice", cutorch_streamWaitForMultiDevice}, {"streamBarrier", cutorch_streamBarrier}, {"streamBarrierMultiDevice", cutorch_streamBarrierMultiDevice}, {"streamSynchronize", cutorch_streamSynchronize}, {"getDevice", cutorch_getDevice}, {"deviceReset", cutorch_deviceReset}, {"getDeviceCount", cutorch_getDeviceCount}, {"getPeerToPeerAccess", cutorch_getPeerToPeerAccess}, {"setPeerToPeerAccess", cutorch_setPeerToPeerAccess}, {"setKernelPeerToPeerAccess", cutorch_setKernelPeerToPeerAccess}, {"getKernelPeerToPeerAccess", cutorch_getKernelPeerToPeerAccess}, {"isCachingAllocatorEnabled", cutorch_isCachingAllocatorEnabled}, {"getDeviceProperties", cutorch_getDeviceProperties}, {"getRuntimeVersion", cutorch_getRuntimeVersion}, {"getDriverVersion", cutorch_getDriverVersion}, {"getMemoryUsage", cutorch_getMemoryUsage}, {"hasHalfInstructions", cutorch_hasHalfInstructions}, 
{"hasFastHalfInstructions", cutorch_hasFastHalfInstructions}, {"setDevice", cutorch_setDevice}, {"seed", cutorch_seed}, {"seedAll", cutorch_seedAll}, {"initialSeed", cutorch_initialSeed}, {"manualSeed", cutorch_manualSeed}, {"manualSeedAll", cutorch_manualSeedAll}, {"_sleep", cutorch_sleep}, {"getRNGState", cutorch_getRNGState}, {"setRNGState", cutorch_setRNGState}, {"getState", cutorch_getState}, {"setHeapTracking", cutorch_setHeapTracking}, {"isManagedPtr", cutorch_isManagedPtr}, {NULL, NULL} }; LUA_EXTERNC DLL_EXPORT int luaopen_libcutorch(lua_State *L); int luaopen_libcutorch(lua_State *L) { lua_newtable(L); lua_pushvalue(L, -1); lua_setglobal(L, "cutorch"); luaL_setfuncs(L, cutorch_stuff__, 0); THCState* state = THCState_alloc(); /* Enable the caching allocator unless THC_CACHING_ALLOCATOR=0 */ char* thc_caching_allocator = getenv("THC_CACHING_ALLOCATOR"); if (!thc_caching_allocator || strcmp(thc_caching_allocator, "0") != 0) { THCState_setDeviceAllocator(state, THCCachingAllocator_get()); state->cudaHostAllocator = &THCCachingHostAllocator; } THCudaInit(state); /* Register torch.CudaHostAllocator. */ luaT_pushudata(L, THCState_getCudaHostAllocator(state), "torch.Allocator"); lua_setfield(L, -2, "CudaHostAllocator"); /* Register torch.CudaUVAHostAllocator. */ luaT_pushudata(L, THCState_getCudaUVAAllocator(state), "torch.Allocator"); lua_setfield(L, -2, "CudaUVAAllocator"); #ifdef USE_MAGMA THCMagma_init(state); lua_pushboolean(L, 1); lua_setfield(L, -2, "magma"); #endif cutorch_CudaByteStorage_init(L); cutorch_CudaCharStorage_init(L); cutorch_CudaShortStorage_init(L); cutorch_CudaIntStorage_init(L); cutorch_CudaLongStorage_init(L); cutorch_CudaStorage_init(L); cutorch_CudaDoubleStorage_init(L); #ifdef CUDA_HALF_TENSOR cutorch_CudaHalfStorage_init(L); #else cutorch_HalfStorageCopy_init(L); #endif cutorch_CudaByteTensor_init(L); cutorch_CudaCharTensor_init(L); cutorch_CudaShortTensor_init(L); cutorch_CudaIntTensor_init(L); cutorch_CudaLongTensor_init(L); cutorch_CudaTensor_init(L); cutorch_CudaDoubleTensor_init(L); #ifdef CUDA_HALF_TENSOR cutorch_CudaHalfTensor_init(L); #else cutorch_HalfTensorCopy_init(L); #endif cutorch_CudaByteTensorOperator_init(L); cutorch_CudaCharTensorOperator_init(L); cutorch_CudaShortTensorOperator_init(L); cutorch_CudaIntTensorOperator_init(L); cutorch_CudaLongTensorOperator_init(L); cutorch_CudaTensorOperator_init(L); cutorch_CudaDoubleTensorOperator_init(L); #ifdef CUDA_HALF_TENSOR cutorch_CudaHalfTensorOperator_init(L); #endif cutorch_CudaByteTensorMath_init(L); cutorch_CudaCharTensorMath_init(L); cutorch_CudaShortTensorMath_init(L); cutorch_CudaIntTensorMath_init(L); cutorch_CudaLongTensorMath_init(L); cutorch_CudaTensorMath_init(L); cutorch_CudaDoubleTensorMath_init(L); #ifdef CUDA_HALF_TENSOR cutorch_CudaHalfTensorMath_init(L); #endif cutorch_Event_init(L); /* Store state in cutorch table. */ lua_pushlightuserdata(L, state); lua_setfield(L, -2, "_state"); #ifdef CUDA_HALF_TENSOR lua_pushboolean(L, 1); #else lua_pushboolean(L, 0); #endif lua_setfield(L, -2, "hasHalf"); /* store gpu driver version in field */ int driverVersion; THCudaCheck(cudaDriverGetVersion(&driverVersion)); lua_pushinteger(L, driverVersion); lua_setfield(L, -2, "driverVersion"); /* when cutorch goes out of scope, we need to make sure THCState is properly shut down (so that memory doesn not leak. Since _state is a lightuserdata we cannot associate an __gc method with it. 
Hence, create a userdata, and associate a metatable with it, which has an __gc method which properly calls THCudaShutdown. */ /* create a new userdata type which is a pointer to a pointer */ THCState **thc_pointer = (THCState**)lua_newuserdata(L, sizeof(void*)); /* set the state pointer */ *thc_pointer = state; /* create a table that will be used as the metatable */ lua_newtable(L); /* push the gc function onto the stack */ lua_pushcfunction(L, &cutorch_shutdown); /* set the __gc field in the table to the function (function is popped) */ lua_setfield(L, -2, "__gc"); /* now the table is on the top of the stack, and the userdata below it, setmetatable on the userdata with the table. table is popped */ lua_setmetatable(L, -2); /* now the userdata is on top, with the cutorch table below it, set the field cutorch.__stategc to this userdata. userdata is popped, leaving cutorch table on top of the stack */ lua_setfield(L, -2, "_stategc"); return 1; }
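
/* Informal usage sketch of the stream API registered above, from Lua (the
   indices are illustrative only):
   ---
   cutorch.reserveStreams(2)     -- 2 user streams per device, plus stream 0
   cutorch.setStream(1)
   -- ... launch kernels on stream 1 ...
   cutorch.setStream(2)
   -- ... launch kernels on stream 2 ...
   cutorch.streamBarrier({1, 2}) -- streams 1 and 2 wait for each other
   cutorch.streamSynchronize(2)  -- host waits for stream 2 only
   cutorch.setDefaultStream()
   ---
*/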