Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/torch/cutorch.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMartin Raison <raison@fb.com>2017-03-12 20:20:49 +0300
committersoumith <soumith@fb.com>2017-04-21 00:30:14 +0300
commitc3f2db3e217d875c3625ed85261460159dcbee49 (patch)
treef37a6e3d36551aca848876783d2778a161d8ceb9
parent76dc265a754c066b53aeb78f52598338846b72aa (diff)
create and expose handles for cusparse
-rw-r--r--lib/THC/CMakeLists.txt1
-rw-r--r--lib/THC/THCGeneral.c140
-rw-r--r--lib/THC/THCGeneral.h.in25
3 files changed, 163 insertions, 3 deletions
diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt
index 0117b02..29d3bf6 100644
--- a/lib/THC/CMakeLists.txt
+++ b/lib/THC/CMakeLists.txt
@@ -289,6 +289,7 @@ INSTALL(FILES
THCNumerics.cuh
THCTensorSort.cuh
THCTensorInfo.cuh
+ THCTensorMathPointwise.cuh
THCTensorTypeUtils.cuh
THCTensorRandom.cuh
THCTensorMathMagma.cuh
diff --git a/lib/THC/THCGeneral.c b/lib/THC/THCGeneral.c
index 09bb43f..8b72b5b 100644
--- a/lib/THC/THCGeneral.c
+++ b/lib/THC/THCGeneral.c
@@ -75,6 +75,7 @@ void THCudaInit(THCState* state)
state->currentStreams[i] = THCThreadLocal_alloc();
}
state->currentPerDeviceBlasHandle = THCThreadLocal_alloc();
+ state->currentPerDeviceSparseHandle = THCThreadLocal_alloc();
state->resourcesPerDevice = (THCCudaResourcesPerDevice*)
malloc(numDevices * sizeof(THCCudaResourcesPerDevice));
@@ -131,6 +132,7 @@ void THCudaInit(THCState* state)
// cuBLAS handle is the first user BLAS handle. Note that the actual BLAS
// handles are created lazily.
state->numUserBlasHandles = 1;
+ state->numUserSparseHandles = 1;
state->heapSoftmax = 3e8; // 300MB, adjusted upward dynamically
state->heapDelta = 0;
@@ -166,6 +168,10 @@ void THCudaShutdown(THCState* state)
for (int i = 0; i < res->numBlasHandles; ++i) {
THCublasCheck(cublasDestroy(res->blasHandles[i]));
}
+ /* Free user defined sparse handles */
+ for (int i = 0; i < res->numSparseHandles; ++i) {
+ THCusparseCheck(cusparseDestroy(res->sparseHandles[i]));
+ }
/* Free per-stream scratch space; starts at 0 because there is space for
the default stream as well*/
if (res->devScratchSpacePerStream) {
@@ -176,6 +182,7 @@ void THCudaShutdown(THCState* state)
free(res->streams);
free(res->blasHandles);
+ free(res->sparseHandles);
free(res->devScratchSpacePerStream);
THCStream_free((THCStream*)THCThreadLocal_get(state->currentStreams[dev]));
THCThreadLocal_free(state->currentStreams[dev]);
@@ -392,6 +399,29 @@ void THCState_reserveDeviceBlasHandles(THCState* state, int device, int numBlasH
THCudaCheck(cudaSetDevice(prevDev));
}
+void THCState_reserveDeviceSparseHandles(THCState* state, int device, int numSparseHandles)
+{
+ int prevDev = -1;
+ THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
+ if (numSparseHandles <= res->numSparseHandles) {
+ return;
+ }
+
+ THCudaCheck(cudaGetDevice(&prevDev));
+ THCudaCheck(cudaSetDevice(device));
+
+ size_t size = numSparseHandles * sizeof(cusparseHandle_t);
+ cusparseHandle_t* handles = (cusparseHandle_t*) realloc(res->sparseHandles, size);
+ for (int i = res->numSparseHandles; i < numSparseHandles; ++i) {
+ handles[i] = NULL;
+ THCusparseCheck(cusparseCreate(&handles[i]));
+ }
+ res->sparseHandles = handles;
+ res->numSparseHandles = numSparseHandles;
+
+ THCudaCheck(cudaSetDevice(prevDev));
+}
+
void THCState_reserveBlasHandles(THCState* state, int numBlasHandles)
{
// cuBLAS handles are created lazily from THCState_getDeviceBlasHandle
@@ -402,6 +432,16 @@ void THCState_reserveBlasHandles(THCState* state, int numBlasHandles)
}
}
+void THCState_reserveSparseHandles(THCState* state, int numSparseHandles)
+{
+ // cuBLAS handles are created lazily from THCState_getDeviceSparseHandle
+ // to avoid initializing unused devices
+ if (numSparseHandles > state->numUserSparseHandles)
+ {
+ state->numUserSparseHandles = numSparseHandles;
+ }
+}
+
int THCState_getNumStreams(THCState* state)
{
return state->numUserStreams;
@@ -412,6 +452,11 @@ int THCState_getNumBlasHandles(THCState* state)
return state->numUserBlasHandles;
}
+int THCState_getNumSparseHandles(THCState* state)
+{
+ return state->numUserSparseHandles;
+}
+
THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr(
THCState *state, int device)
{
@@ -446,6 +491,17 @@ cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int han
return res->blasHandles[handle - 1];
}
+cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle)
+{
+ if (handle <= 0 || handle > state->numUserSparseHandles) {
+ THError("%d is not a valid handle, valid range is: (1, %d)",
+ handle, state->numUserSparseHandles);
+ }
+ THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
+ THCState_reserveDeviceSparseHandles(state, device, handle);
+ return res->sparseHandles[handle - 1];
+}
+
static THCStream* THCState_getStreamOnDevice(THCState* state, int device)
{
THCThreadLocal local = state->currentStreams[device];
@@ -509,6 +565,22 @@ cublasHandle_t THCState_getCurrentBlasHandle(THCState *state)
return NULL;
}
+cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state)
+{
+ /* This is called at the point of kernel execution.
+ For some debugging code or improperly instrumented kernels,
+ `state` is null */
+ if (state) {
+ int device;
+ THCudaCheck(cudaGetDevice(&device));
+
+ int handle = THCState_getCurrentSparseHandleIndex(state);
+ return THCState_getDeviceSparseHandle(state, device, handle);
+ }
+ THError("THCState and sparseHandles must be set as there is no default sparseHandle");
+ return NULL;
+}
+
int THCState_getCurrentStreamIndex(THCState *state)
{
THCStream* stream = THCState_getStream(state);
@@ -534,6 +606,15 @@ int THCState_getCurrentBlasHandleIndex(THCState *state)
return (int) (intptr_t) value;
}
+int THCState_getCurrentSparseHandleIndex(THCState *state)
+{
+ void* value = THCThreadLocal_get(state->currentPerDeviceSparseHandle);
+ if (value == NULL) {
+ return 1;
+ }
+ return (int) (intptr_t) value;
+}
+
THCStream* THCState_getStream(THCState *state)
{
int device;
@@ -572,6 +653,16 @@ void THCState_setCurrentBlasHandleIndex(THCState *state, int handle)
THCThreadLocal_set(state->currentPerDeviceBlasHandle, (void*)(intptr_t)handle);
}
+void THCState_setCurrentSparseHandleIndex(THCState *state, int handle)
+{
+ if (handle > state->numUserSparseHandles || handle <= 0)
+ {
+ THError("%d is not a valid handle, valid range is: (1, %d)",
+ handle, state->numUserSparseHandles);
+ }
+ THCThreadLocal_set(state->currentPerDeviceSparseHandle, (void*)(intptr_t)handle);
+}
+
void* THCState_getCurrentDeviceScratchSpace(THCState* state)
{
int device = -1;
@@ -676,6 +767,55 @@ void __THCublasCheck(cublasStatus_t status, const char *file, const int line)
}
}
+void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line)
+{
+ if(status != CUSPARSE_STATUS_SUCCESS)
+ {
+ const char* errmsg = NULL;
+
+ switch(status)
+ {
+ case CUSPARSE_STATUS_NOT_INITIALIZED:
+ errmsg = "library not initialized";
+ break;
+
+ case CUSPARSE_STATUS_ALLOC_FAILED:
+ errmsg = "resource allocation failed";
+ break;
+
+ case CUSPARSE_STATUS_INVALID_VALUE:
+ errmsg = "an invalid numeric value was used as an argument";
+ break;
+
+ case CUSPARSE_STATUS_ARCH_MISMATCH:
+ errmsg = "an absent device architectural feature is required";
+ break;
+
+ case CUSPARSE_STATUS_MAPPING_ERROR:
+ errmsg = "an access to GPU memory space failed";
+ break;
+
+ case CUSPARSE_STATUS_EXECUTION_FAILED:
+ errmsg = "the GPU program failed to execute";
+ break;
+
+ case CUSPARSE_STATUS_INTERNAL_ERROR:
+ errmsg = "an internal operation failed";
+ break;
+
+ case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+ errmsg = "the matrix type is not supported by this function";
+ break;
+
+ default:
+ errmsg = "unknown error";
+ break;
+ }
+
+ _THError(file, line, "cusparse runtime error : %s", errmsg);
+ }
+}
+
static ptrdiff_t heapSize = 0; // not thread-local
static const ptrdiff_t heapMaxDelta = (ptrdiff_t)1e6;
static const ptrdiff_t heapMinDelta = (ptrdiff_t)-1e6;
diff --git a/lib/THC/THCGeneral.h.in b/lib/THC/THCGeneral.h.in
index 06930cf..d718f7e 100644
--- a/lib/THC/THCGeneral.h.in
+++ b/lib/THC/THCGeneral.h.in
@@ -9,6 +9,7 @@
#include "cuda.h"
#include "cuda_runtime.h"
#include "cublas_v2.h"
+#include "cusparse.h"
#cmakedefine USE_MAGMA
@@ -57,8 +58,12 @@ typedef struct _THCCudaResourcesPerDevice {
THCStream** streams;
/* Number of materialized cuBLAS handles */
int numBlasHandles;
+ /* Number of materialized cuSparse handles */
+ int numSparseHandles;
/* cuBLAS handes are lazily initialized */
cublasHandle_t* blasHandles;
+ /* cuSparse handes are lazily initialized */
+ cusparseHandle_t* sparseHandles;
/* Size of scratch space per each stream on this device available */
size_t scratchSpacePerStream;
/* Device-resident scratch space per stream, used for global memory
@@ -72,9 +77,9 @@ struct THCState {
struct THCRNGState* rngState;
struct cudaDeviceProp* deviceProperties;
/* Set of all allocated resources. resourcePerDevice[dev]->streams[0] is NULL,
- which specifies the per-device default stream. blasHandles do not have a
- default and must be explicitly initialized. We always initialize 1
- blasHandle but we can use more.
+ which specifies the per-device default stream. blasHandles and
+ sparseHandles do not have a default and must be explicitly initialized.
+ We always initialize 1 blasHandle and 1 sparseHandle but we can use more.
*/
THCCudaResourcesPerDevice* resourcesPerDevice;
/* Captured number of devices upon startup; convenience for bounds checking */
@@ -82,6 +87,7 @@ struct THCState {
/* Number of Torch defined resources available, indices 1 ... numStreams */
int numUserStreams;
int numUserBlasHandles;
+ int numUserSparseHandles;
/* Allocator using cudaMallocHost. */
THAllocator* cudaHostAllocator;
@@ -91,6 +97,9 @@ struct THCState {
/* Index of the current selected BLAS handle. The actual BLAS handle used
depends on the current device. */
THCThreadLocal/*<int>*/ currentPerDeviceBlasHandle;
+ /* Index of the current selected sparse handle. The actual sparse handle used
+ depends on the current device. */
+ THCThreadLocal/*<int>*/ currentPerDeviceSparseHandle;
/* Array of thread locals containing the current stream for each device */
THCThreadLocal* currentStreams;
@@ -163,11 +172,19 @@ THC_API void THCState_setCurrentStreamIndex(THCState *state, int stream);
THC_API void THCState_reserveBlasHandles(THCState* state, int numHandles);
THC_API int THCState_getNumBlasHandles(THCState* state);
+THC_API void THCState_reserveSparseHandles(THCState* state, int numHandles);
+THC_API int THCState_getNumSparseHandles(THCState* state);
+
THC_API cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int handle);
THC_API cublasHandle_t THCState_getCurrentBlasHandle(THCState *state);
THC_API int THCState_getCurrentBlasHandleIndex(THCState *state);
THC_API void THCState_setCurrentBlasHandleIndex(THCState *state, int handle);
+THC_API cusparseHandle_t THCState_getDeviceSparseHandle(THCState *state, int device, int handle);
+THC_API cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state);
+THC_API int THCState_getCurrentSparseHandleIndex(THCState *state);
+THC_API void THCState_setCurrentSparseHandleIndex(THCState *state, int handle);
+
/* For the current device and stream, returns the allocated scratch space */
THC_API void* THCState_getCurrentDeviceScratchSpace(THCState* state);
THC_API void* THCState_getDeviceScratchSpace(THCState* state, int device, int stream);
@@ -178,10 +195,12 @@ THC_API size_t THCState_getDeviceScratchSpaceSize(THCState* state, int device);
#define THCudaCheck(err) __THCudaCheck(err, __FILE__, __LINE__)
#define THCudaCheckWarn(err) __THCudaCheckWarn(err, __FILE__, __LINE__)
#define THCublasCheck(err) __THCublasCheck(err, __FILE__, __LINE__)
+#define THCusparseCheck(err) __THCusparseCheck(err, __FILE__, __LINE__)
THC_API void __THCudaCheck(cudaError_t err, const char *file, const int line);
THC_API void __THCudaCheckWarn(cudaError_t err, const char *file, const int line);
THC_API void __THCublasCheck(cublasStatus_t status, const char *file, const int line);
+THC_API void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line);
THC_API cudaError_t THCudaMalloc(THCState *state, void **ptr, size_t size);
THC_API cudaError_t THCudaFree(THCState *state, void *ptr);