diff options
author | Martin Raison <raison@fb.com> | 2017-03-12 20:20:49 +0300 |
---|---|---|
committer | soumith <soumith@fb.com> | 2017-04-21 00:30:14 +0300 |
commit | c3f2db3e217d875c3625ed85261460159dcbee49 (patch) | |
tree | f37a6e3d36551aca848876783d2778a161d8ceb9 | |
parent | 76dc265a754c066b53aeb78f52598338846b72aa (diff) |
create and expose handles for cusparse
-rw-r--r-- | lib/THC/CMakeLists.txt | 1 | ||||
-rw-r--r-- | lib/THC/THCGeneral.c | 140 | ||||
-rw-r--r-- | lib/THC/THCGeneral.h.in | 25 |
3 files changed, 163 insertions, 3 deletions
diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt index 0117b02..29d3bf6 100644 --- a/lib/THC/CMakeLists.txt +++ b/lib/THC/CMakeLists.txt @@ -289,6 +289,7 @@ INSTALL(FILES THCNumerics.cuh THCTensorSort.cuh THCTensorInfo.cuh + THCTensorMathPointwise.cuh THCTensorTypeUtils.cuh THCTensorRandom.cuh THCTensorMathMagma.cuh diff --git a/lib/THC/THCGeneral.c b/lib/THC/THCGeneral.c index 09bb43f..8b72b5b 100644 --- a/lib/THC/THCGeneral.c +++ b/lib/THC/THCGeneral.c @@ -75,6 +75,7 @@ void THCudaInit(THCState* state) state->currentStreams[i] = THCThreadLocal_alloc(); } state->currentPerDeviceBlasHandle = THCThreadLocal_alloc(); + state->currentPerDeviceSparseHandle = THCThreadLocal_alloc(); state->resourcesPerDevice = (THCCudaResourcesPerDevice*) malloc(numDevices * sizeof(THCCudaResourcesPerDevice)); @@ -131,6 +132,7 @@ void THCudaInit(THCState* state) // cuBLAS handle is the first user BLAS handle. Note that the actual BLAS // handles are created lazily. state->numUserBlasHandles = 1; + state->numUserSparseHandles = 1; state->heapSoftmax = 3e8; // 300MB, adjusted upward dynamically state->heapDelta = 0; @@ -166,6 +168,10 @@ void THCudaShutdown(THCState* state) for (int i = 0; i < res->numBlasHandles; ++i) { THCublasCheck(cublasDestroy(res->blasHandles[i])); } + /* Free user defined sparse handles */ + for (int i = 0; i < res->numSparseHandles; ++i) { + THCusparseCheck(cusparseDestroy(res->sparseHandles[i])); + } /* Free per-stream scratch space; starts at 0 because there is space for the default stream as well*/ if (res->devScratchSpacePerStream) { @@ -176,6 +182,7 @@ void THCudaShutdown(THCState* state) free(res->streams); free(res->blasHandles); + free(res->sparseHandles); free(res->devScratchSpacePerStream); THCStream_free((THCStream*)THCThreadLocal_get(state->currentStreams[dev])); THCThreadLocal_free(state->currentStreams[dev]); @@ -392,6 +399,29 @@ void THCState_reserveDeviceBlasHandles(THCState* state, int device, int numBlasH THCudaCheck(cudaSetDevice(prevDev)); } +void THCState_reserveDeviceSparseHandles(THCState* state, int device, int numSparseHandles) +{ + int prevDev = -1; + THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device); + if (numSparseHandles <= res->numSparseHandles) { + return; + } + + THCudaCheck(cudaGetDevice(&prevDev)); + THCudaCheck(cudaSetDevice(device)); + + size_t size = numSparseHandles * sizeof(cusparseHandle_t); + cusparseHandle_t* handles = (cusparseHandle_t*) realloc(res->sparseHandles, size); + for (int i = res->numSparseHandles; i < numSparseHandles; ++i) { + handles[i] = NULL; + THCusparseCheck(cusparseCreate(&handles[i])); + } + res->sparseHandles = handles; + res->numSparseHandles = numSparseHandles; + + THCudaCheck(cudaSetDevice(prevDev)); +} + void THCState_reserveBlasHandles(THCState* state, int numBlasHandles) { // cuBLAS handles are created lazily from THCState_getDeviceBlasHandle @@ -402,6 +432,16 @@ void THCState_reserveBlasHandles(THCState* state, int numBlasHandles) } } +void THCState_reserveSparseHandles(THCState* state, int numSparseHandles) +{ + // cuBLAS handles are created lazily from THCState_getDeviceSparseHandle + // to avoid initializing unused devices + if (numSparseHandles > state->numUserSparseHandles) + { + state->numUserSparseHandles = numSparseHandles; + } +} + int THCState_getNumStreams(THCState* state) { return state->numUserStreams; @@ -412,6 +452,11 @@ int THCState_getNumBlasHandles(THCState* state) return state->numUserBlasHandles; } +int THCState_getNumSparseHandles(THCState* state) +{ + return state->numUserSparseHandles; +} + THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr( THCState *state, int device) { @@ -446,6 +491,17 @@ cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int han return res->blasHandles[handle - 1]; } +cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle) +{ + if (handle <= 0 || handle > state->numUserSparseHandles) { + THError("%d is not a valid handle, valid range is: (1, %d)", + handle, state->numUserSparseHandles); + } + THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device); + THCState_reserveDeviceSparseHandles(state, device, handle); + return res->sparseHandles[handle - 1]; +} + static THCStream* THCState_getStreamOnDevice(THCState* state, int device) { THCThreadLocal local = state->currentStreams[device]; @@ -509,6 +565,22 @@ cublasHandle_t THCState_getCurrentBlasHandle(THCState *state) return NULL; } +cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state) +{ + /* This is called at the point of kernel execution. + For some debugging code or improperly instrumented kernels, + `state` is null */ + if (state) { + int device; + THCudaCheck(cudaGetDevice(&device)); + + int handle = THCState_getCurrentSparseHandleIndex(state); + return THCState_getDeviceSparseHandle(state, device, handle); + } + THError("THCState and sparseHandles must be set as there is no default sparseHandle"); + return NULL; +} + int THCState_getCurrentStreamIndex(THCState *state) { THCStream* stream = THCState_getStream(state); @@ -534,6 +606,15 @@ int THCState_getCurrentBlasHandleIndex(THCState *state) return (int) (intptr_t) value; } +int THCState_getCurrentSparseHandleIndex(THCState *state) +{ + void* value = THCThreadLocal_get(state->currentPerDeviceSparseHandle); + if (value == NULL) { + return 1; + } + return (int) (intptr_t) value; +} + THCStream* THCState_getStream(THCState *state) { int device; @@ -572,6 +653,16 @@ void THCState_setCurrentBlasHandleIndex(THCState *state, int handle) THCThreadLocal_set(state->currentPerDeviceBlasHandle, (void*)(intptr_t)handle); } +void THCState_setCurrentSparseHandleIndex(THCState *state, int handle) +{ + if (handle > state->numUserSparseHandles || handle <= 0) + { + THError("%d is not a valid handle, valid range is: (1, %d)", + handle, state->numUserSparseHandles); + } + THCThreadLocal_set(state->currentPerDeviceSparseHandle, (void*)(intptr_t)handle); +} + void* THCState_getCurrentDeviceScratchSpace(THCState* state) { int device = -1; @@ -676,6 +767,55 @@ void __THCublasCheck(cublasStatus_t status, const char *file, const int line) } } +void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line) +{ + if(status != CUSPARSE_STATUS_SUCCESS) + { + const char* errmsg = NULL; + + switch(status) + { + case CUSPARSE_STATUS_NOT_INITIALIZED: + errmsg = "library not initialized"; + break; + + case CUSPARSE_STATUS_ALLOC_FAILED: + errmsg = "resource allocation failed"; + break; + + case CUSPARSE_STATUS_INVALID_VALUE: + errmsg = "an invalid numeric value was used as an argument"; + break; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + errmsg = "an absent device architectural feature is required"; + break; + + case CUSPARSE_STATUS_MAPPING_ERROR: + errmsg = "an access to GPU memory space failed"; + break; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + errmsg = "the GPU program failed to execute"; + break; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + errmsg = "an internal operation failed"; + break; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + errmsg = "the matrix type is not supported by this function"; + break; + + default: + errmsg = "unknown error"; + break; + } + + _THError(file, line, "cusparse runtime error : %s", errmsg); + } +} + static ptrdiff_t heapSize = 0; // not thread-local static const ptrdiff_t heapMaxDelta = (ptrdiff_t)1e6; static const ptrdiff_t heapMinDelta = (ptrdiff_t)-1e6; diff --git a/lib/THC/THCGeneral.h.in b/lib/THC/THCGeneral.h.in index 06930cf..d718f7e 100644 --- a/lib/THC/THCGeneral.h.in +++ b/lib/THC/THCGeneral.h.in @@ -9,6 +9,7 @@ #include "cuda.h" #include "cuda_runtime.h" #include "cublas_v2.h" +#include "cusparse.h" #cmakedefine USE_MAGMA @@ -57,8 +58,12 @@ typedef struct _THCCudaResourcesPerDevice { THCStream** streams; /* Number of materialized cuBLAS handles */ int numBlasHandles; + /* Number of materialized cuSparse handles */ + int numSparseHandles; /* cuBLAS handes are lazily initialized */ cublasHandle_t* blasHandles; + /* cuSparse handes are lazily initialized */ + cusparseHandle_t* sparseHandles; /* Size of scratch space per each stream on this device available */ size_t scratchSpacePerStream; /* Device-resident scratch space per stream, used for global memory @@ -72,9 +77,9 @@ struct THCState { struct THCRNGState* rngState; struct cudaDeviceProp* deviceProperties; /* Set of all allocated resources. resourcePerDevice[dev]->streams[0] is NULL, - which specifies the per-device default stream. blasHandles do not have a - default and must be explicitly initialized. We always initialize 1 - blasHandle but we can use more. + which specifies the per-device default stream. blasHandles and + sparseHandles do not have a default and must be explicitly initialized. + We always initialize 1 blasHandle and 1 sparseHandle but we can use more. */ THCCudaResourcesPerDevice* resourcesPerDevice; /* Captured number of devices upon startup; convenience for bounds checking */ @@ -82,6 +87,7 @@ struct THCState { /* Number of Torch defined resources available, indices 1 ... numStreams */ int numUserStreams; int numUserBlasHandles; + int numUserSparseHandles; /* Allocator using cudaMallocHost. */ THAllocator* cudaHostAllocator; @@ -91,6 +97,9 @@ struct THCState { /* Index of the current selected BLAS handle. The actual BLAS handle used depends on the current device. */ THCThreadLocal/*<int>*/ currentPerDeviceBlasHandle; + /* Index of the current selected sparse handle. The actual sparse handle used + depends on the current device. */ + THCThreadLocal/*<int>*/ currentPerDeviceSparseHandle; /* Array of thread locals containing the current stream for each device */ THCThreadLocal* currentStreams; @@ -163,11 +172,19 @@ THC_API void THCState_setCurrentStreamIndex(THCState *state, int stream); THC_API void THCState_reserveBlasHandles(THCState* state, int numHandles); THC_API int THCState_getNumBlasHandles(THCState* state); +THC_API void THCState_reserveSparseHandles(THCState* state, int numHandles); +THC_API int THCState_getNumSparseHandles(THCState* state); + THC_API cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int handle); THC_API cublasHandle_t THCState_getCurrentBlasHandle(THCState *state); THC_API int THCState_getCurrentBlasHandleIndex(THCState *state); THC_API void THCState_setCurrentBlasHandleIndex(THCState *state, int handle); +THC_API cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle); +THC_API cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state); +THC_API int THCState_getCurrentSparseHandleIndex(THCState *state); +THC_API void THCState_setCurrentSparseHandleIndex(THCState *state, int handle); + /* For the current device and stream, returns the allocated scratch space */ THC_API void* THCState_getCurrentDeviceScratchSpace(THCState* state); THC_API void* THCState_getDeviceScratchSpace(THCState* state, int device, int stream); @@ -178,10 +195,12 @@ THC_API size_t THCState_getDeviceScratchSpaceSize(THCState* state, int device); #define THCudaCheck(err) __THCudaCheck(err, __FILE__, __LINE__) #define THCudaCheckWarn(err) __THCudaCheckWarn(err, __FILE__, __LINE__) #define THCublasCheck(err) __THCublasCheck(err, __FILE__, __LINE__) +#define THCusparseCheck(err) __THCusparseCheck(err, __FILE__, __LINE__) THC_API void __THCudaCheck(cudaError_t err, const char *file, const int line); THC_API void __THCudaCheckWarn(cudaError_t err, const char *file, const int line); THC_API void __THCublasCheck(cublasStatus_t status, const char *file, const int line); +THC_API void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line); THC_API cudaError_t THCudaMalloc(THCState *state, void **ptr, size_t size); THC_API cudaError_t THCudaFree(THCState *state, void *ptr); |