diff options
author | Sam Gross <sgross@fb.com> | 2016-10-14 20:08:56 +0300 |
---|---|---|
committer | Sam Gross <sgross@fb.com> | 2016-10-14 20:08:56 +0300 |
commit | fa5739ea0ee26d6ecfc7b3ab1647b32f6fa91a39 (patch) | |
tree | 9091d630f884637277a2b94bb2ad01f4a4617db4 | |
parent | 073ba88337ff7a584abc6ee11a0f06a0696d8a69 (diff) |
Fix caching allocator when used from multiple Lua threads
Use a single, global THCCachingAllocator instance.
Previously, each Lua thread had its own THCCachingAllocator instance.
However, threads can share storages, which means a segment could be
allocated from one THCCachingAllocator and freed on another, which
breaks.
Fixes #539
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | init.c | 2 | ||||
-rw-r--r-- | lib/THC/THCCachingAllocator.cpp | 57 | ||||
-rw-r--r-- | lib/THC/THCCachingAllocator.h | 2 | ||||
-rw-r--r-- | lib/THC/THCGeneral.c | 58 | ||||
-rw-r--r-- | lib/THC/THCGeneral.h.in | 4 |
6 files changed, 72 insertions, 53 deletions
@@ -32,6 +32,8 @@ Set the environment variable `THC_CACHING_ALLOCATOR=1` to enable the caching CUD By default, cutorch calls `cudaMalloc` and `cudaFree` when CUDA tensors are allocated and freed. This is expensive because `cudaFree` synchronizes the CPU with the GPU. Setting `THC_CACHING_ALLOCATOR=1` will cause cutorch to cache and re-use CUDA allocations to avoid synchronizations. +With the caching memory allocator, allocations and frees should logically be considered "usages" of the memory segment associated with streams, just like kernel launches. The programmer must insert the proper synchronization if memory segments are used from multiple streams. + ###`cutorch.*` API - `cutorch.synchronize()` : All of the CUDA API is asynchronous (barring a few functions), which means that you can queue up operations. To wait for the operations to finish, you can issue `cutorch.synchronize()` in your code, when the code waits for all GPU operations on the current GPU to finish. WARNING: synchronizes the CPU host with respect to the current device (as per `cutorch.getDevice()`) only. - `cutorch.synchronizeAll()` : Same as `cutorch.synchronize()` except synchronizes the CPU host with all visible GPU devices in the system. Equivalent to calling `cutorch.synchronize()` once per each device. @@ -966,7 +966,7 @@ int luaopen_libcutorch(lua_State *L) char* thc_caching_allocator = getenv("THC_CACHING_ALLOCATOR"); if (thc_caching_allocator && strcmp(thc_caching_allocator, "1") == 0) { - THCCachingAllocator_init(THCState_getDeviceAllocator(state)); + THCState_setDeviceAllocator(state, THCCachingAllocator_get()); } THCudaInit(state); diff --git a/lib/THC/THCCachingAllocator.cpp b/lib/THC/THCCachingAllocator.cpp index 93f8327..73b81f6 100644 --- a/lib/THC/THCCachingAllocator.cpp +++ b/lib/THC/THCCachingAllocator.cpp @@ -20,6 +20,13 @@ // - Large (>1MB) and small allocation requestss are handled separately. Large // allocation requests can be filled by a cudaMalloc call of the exact size. 
// Small requests will allocate and split a 1MB buffer, if necessary. +// +// With this allocator, allocations and frees should logically be considered +// "usages" of the memory segment associated with streams, just like kernel +// launches. The programmer must insert the proper synchronization if memory +// segments are used from multiple streams. +// + namespace { @@ -78,6 +85,7 @@ struct THCCachingAllocator large_blocks(BlockComparator), small_blocks(BlockComparator) {} + /** allocates a block which is safe to use from the provided stream */ cudaError_t malloc(void** devPtr, size_t size, cudaStream_t stream) { std::lock_guard<std::mutex> lock(mutex); @@ -160,6 +168,22 @@ struct THCCachingAllocator return cudaSuccess; } + /** returns cached blocks to the system allocator */ + cudaError_t emptyCache() + { + std::lock_guard<std::mutex> lock(mutex); + cudaError_t err = free_blocks(large_blocks, large_blocks.begin(), large_blocks.end()); + if (err != cudaSuccess) { + return err; + } + err = free_blocks(small_blocks, small_blocks.begin(), small_blocks.end()); + if (err != cudaSuccess) { + return err; + } + return cudaSuccess; + } + + /** combine previously split blocks */ void try_merge_blocks(Block* dst, Block* src, FreeBlocks& free_blocks) { if (!src || src->allocated) { @@ -196,6 +220,8 @@ struct THCCachingAllocator cudaError_t cuda_malloc_retry(int device, void** devPtr, size_t size) { + // Try cudaMalloc. If cudaMalloc fails, frees all non-split cached blocks + // and retries. 
cudaError_t err = cudaMalloc(devPtr, size); if (err != cudaSuccess) { cudaGetLastError(); @@ -233,6 +259,7 @@ struct THCCachingAllocator cudaError_t free_blocks(FreeBlocks& blocks, FreeBlocks::iterator it, FreeBlocks::iterator end) { + // Frees all non-split blocks between `it` and `end` while (it != end) { Block* block = *it; if (!block->prev && !block->next) { @@ -264,27 +291,21 @@ static cudaError_t THCCachingAllocator_free(void* ctx, void* ptr) return a->free(ptr); } -static cudaError_t THCCachingAllocator_shutdown(void* ctx) +static cudaError_t THCCachingAllocator_emptyCache(void* ctx) { - cudaError_t err; THCCachingAllocator* a = (THCCachingAllocator*) ctx; - err = a->free_blocks(a->large_blocks, a->large_blocks.begin(), a->large_blocks.end()); - if (err != cudaSuccess) { - return err; - } - err = a->free_blocks(a->small_blocks, a->small_blocks.begin(), a->small_blocks.end()); - if (err != cudaSuccess) { - return err; - } - delete a; - return cudaSuccess; + return a->emptyCache(); } -THC_API void THCCachingAllocator_init(THCDeviceAllocator* alloc) +static THCCachingAllocator caching_allocator; +static THCDeviceAllocator device_allocator = { + &THCCachingAllocator_malloc, + &THCCachingAllocator_free, + &THCCachingAllocator_emptyCache, + &caching_allocator +}; + +THC_API THCDeviceAllocator* THCCachingAllocator_get() { - THCCachingAllocator* allocator = new THCCachingAllocator(); - alloc->state = allocator; - alloc->malloc = &THCCachingAllocator_malloc; - alloc->free = &THCCachingAllocator_free; - alloc->shutdown = &THCCachingAllocator_shutdown; + return &device_allocator; } diff --git a/lib/THC/THCCachingAllocator.h b/lib/THC/THCCachingAllocator.h index 60ff52c..711b1da 100644 --- a/lib/THC/THCCachingAllocator.h +++ b/lib/THC/THCCachingAllocator.h @@ -3,6 +3,6 @@ #include "THCGeneral.h" -THC_API void THCCachingAllocator_init(THCDeviceAllocator* alloc); +THC_API THCDeviceAllocator* THCCachingAllocator_get(); #endif diff --git a/lib/THC/THCGeneral.c 
b/lib/THC/THCGeneral.c index 5bcce19..7cc7818 100644 --- a/lib/THC/THCGeneral.c +++ b/lib/THC/THCGeneral.c @@ -38,7 +38,7 @@ struct THCState { /* Allocator using cudaMallocHost. */ THAllocator* cudaHostAllocator; - THCDeviceAllocator cudaDeviceAllocator; + THCDeviceAllocator* cudaDeviceAllocator; /* Index of the current selected per-device resource. Actual CUDA resource changes based on the current device, since resources are per-device */ @@ -67,8 +67,6 @@ struct THCState { THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr( THCState *state, int device); -static void THCState_initDefaultDeviceAllocator(THCDeviceAllocator* a); - THCState* THCState_alloc() { THCState* state = (THCState*) malloc(sizeof(THCState)); @@ -81,10 +79,27 @@ void THCState_free(THCState* state) free(state); } +static cudaError_t cudaMallocWrapper(void* ctx, void** devPtr, size_t size, cudaStream_t stream) +{ + return cudaMalloc(devPtr, size); +} + +static cudaError_t cudaFreeWrapper(void* ctx, void* devPtr) +{ + return cudaFree(devPtr); +} + +static THCDeviceAllocator defaultDeviceAllocator = { + &cudaMallocWrapper, + &cudaFreeWrapper, + NULL, + NULL +}; + void THCudaInit(THCState* state) { - if (!state->cudaDeviceAllocator.malloc) { - THCState_initDefaultDeviceAllocator(&state->cudaDeviceAllocator); + if (!state->cudaDeviceAllocator) { + state->cudaDeviceAllocator = &defaultDeviceAllocator; } int numDevices = 0; @@ -188,7 +203,9 @@ void THCudaShutdown(THCState* state) free(state->resourcesPerDevice[dev].devScratchSpacePerStream); } free(state->resourcesPerDevice); - state->cudaDeviceAllocator.shutdown(state->cudaDeviceAllocator.state); + if (state->cudaDeviceAllocator->emptyCache) { + state->cudaDeviceAllocator->emptyCache(state->cudaDeviceAllocator->state); + } THCThreadLocal_free(state->currentPerDeviceStream); THCThreadLocal_free(state->currentPerDeviceBlasHandle); @@ -329,12 +346,11 @@ THAllocator* THCState_getCudaHostAllocator(THCState* state) return state->cudaHostAllocator; } 
-THCDeviceAllocator* THCState_getDeviceAllocator(THCState* state) +void THCState_setDeviceAllocator(THCState* state, THCDeviceAllocator* allocator) { - return &state->cudaDeviceAllocator; + state->cudaDeviceAllocator = allocator; } - int THCState_getNumDevices(THCState *state) { return state->numDevices; @@ -652,31 +668,11 @@ void THCSetGCHandler(THCState *state, void (*cutorchGCFunction_)(void *data), vo state->cutorchGCData = data; } -static cudaError_t cudaMallocWrapper(void* ctx, void** devPtr, size_t size, cudaStream_t stream) -{ - return cudaMalloc(devPtr, size); -} - -static cudaError_t cudaFreeWrapper(void* ctx, void* devPtr) -{ - return cudaFree(devPtr); -} - -static cudaError_t noop(void* ctx) { return cudaSuccess; } - -static void THCState_initDefaultDeviceAllocator(THCDeviceAllocator* a) -{ - a->malloc = &cudaMallocWrapper; - a->free = &cudaFreeWrapper; - a->shutdown = &noop; - a->state = NULL; -} - cudaError_t THCudaMalloc(THCState *state, void** ptr, size_t size) { THCudaCheck(cudaGetLastError()); cudaStream_t stream = THCState_getCurrentStream(state); - THCDeviceAllocator* allocator = &state->cudaDeviceAllocator; + THCDeviceAllocator* allocator = state->cudaDeviceAllocator; cudaError_t err = allocator->malloc(allocator->state, ptr, size, stream); if (state->cutorchGCFunction != NULL && err != cudaSuccess) { cudaGetLastError(); // reset OOM error @@ -688,7 +684,7 @@ cudaError_t THCudaMalloc(THCState *state, void** ptr, size_t size) cudaError_t THCudaFree(THCState *state, void *ptr) { - THCDeviceAllocator* allocator = &state->cudaDeviceAllocator; + THCDeviceAllocator* allocator = state->cudaDeviceAllocator; return allocator->free(allocator->state, ptr); } diff --git a/lib/THC/THCGeneral.h.in b/lib/THC/THCGeneral.h.in index ce3ffc5..23b00af 100644 --- a/lib/THC/THCGeneral.h.in +++ b/lib/THC/THCGeneral.h.in @@ -41,7 +41,7 @@ struct THCRNGState; /* Random number generator state. 
*/ typedef struct _THCDeviceAllocator { cudaError_t (*malloc)(void*, void**, size_t, cudaStream_t); cudaError_t (*free)(void*, void*); - cudaError_t (*shutdown)(void*); + cudaError_t (*emptyCache)(void*); void* state; } THCDeviceAllocator; @@ -75,7 +75,7 @@ THC_API struct cudaDeviceProp* THCState_getCurrentDeviceProperties(THCState* sta THC_API struct THCRNGState* THCState_getRngState(THCState* state); THC_API THAllocator* THCState_getCudaHostAllocator(THCState* state); -THC_API THCDeviceAllocator* THCState_getDeviceAllocator(THCState* state); +THC_API void THCState_setDeviceAllocator(THCState* state, THCDeviceAllocator* allocator); THC_API void THCMagma_init(THCState *state); |