diff options
-rw-r--r-- | extern/cuew/include/cuew.h | 74 | ||||
-rw-r--r-- | extern/cuew/src/cuew.c | 146 | ||||
-rw-r--r-- | intern/cycles/app/cycles_cubin_cc.cpp | 4 | ||||
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 2 | ||||
-rw-r--r-- | intern/opensubdiv/opensubdiv_device_context_cuda.cc | 2 |
5 files changed, 177 insertions, 51 deletions
diff --git a/extern/cuew/include/cuew.h b/extern/cuew/include/cuew.h index f5009d4f2c7..fa334678e54 100644 --- a/extern/cuew/include/cuew.h +++ b/extern/cuew/include/cuew.h @@ -24,10 +24,10 @@ extern "C" { #include <stdlib.h> /* Defines. */ -#define CUEW_VERSION_MAJOR 1 -#define CUEW_VERSION_MINOR 2 +#define CUEW_VERSION_MAJOR 2 +#define CUEW_VERSION_MINOR 0 -#define CUDA_VERSION 8000 +#define CUDA_VERSION 9010 #define CU_IPC_HANDLE_SIZE 64 #define CU_STREAM_LEGACY ((CUstream)0x1) #define CU_STREAM_PER_THREAD ((CUstream)0x2) @@ -37,6 +37,8 @@ extern "C" { #define CU_MEMHOSTREGISTER_PORTABLE 0x01 #define CU_MEMHOSTREGISTER_DEVICEMAP 0x02 #define CU_MEMHOSTREGISTER_IOMEMORY 0x04 +#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC 0x01 +#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC 0x02 #define CUDA_ARRAY3D_LAYERED 0x01 #define CUDA_ARRAY3D_2DARRAY 0x01 #define CUDA_ARRAY3D_SURFACE_LDST 0x02 @@ -204,6 +206,7 @@ typedef enum CUstreamWaitValue_flags_enum { CU_STREAM_WAIT_VALUE_GEQ = 0x0, CU_STREAM_WAIT_VALUE_EQ = 0x1, CU_STREAM_WAIT_VALUE_AND = 0x2, + CU_STREAM_WAIT_VALUE_NOR = 0x3, CU_STREAM_WAIT_VALUE_FLUSH = (1 << 30), } CUstreamWaitValue_flags; @@ -215,6 +218,8 @@ typedef enum CUstreamWriteValue_flags_enum { typedef enum CUstreamBatchMemOpType_enum { CU_STREAM_MEM_OP_WAIT_VALUE_32 = 1, CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2, + CU_STREAM_MEM_OP_WAIT_VALUE_64 = 4, + CU_STREAM_MEM_OP_WRITE_VALUE_64 = 5, CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3, } CUstreamBatchMemOpType; @@ -225,7 +230,7 @@ typedef union CUstreamBatchMemOpParams_union { CUdeviceptr address; union { cuuint32_t value; - cuuint64_t pad; + cuuint64_t value64; }; unsigned int flags; CUdeviceptr alias; @@ -235,7 +240,7 @@ typedef union CUstreamBatchMemOpParams_union { CUdeviceptr address; union { cuuint32_t value; - cuuint64_t pad; + cuuint64_t value64; }; unsigned int flags; CUdeviceptr alias; @@ -372,6 +377,12 @@ typedef enum CUdevice_attribute_enum { CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, + CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, + CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, + CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, CU_DEVICE_ATTRIBUTE_MAX, } CUdevice_attribute; @@ -408,6 +419,8 @@ typedef enum CUfunction_attribute_enum { CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, + CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, CU_FUNC_ATTRIBUTE_MAX, } CUfunction_attribute; @@ -424,6 +437,12 @@ typedef enum CUsharedconfig_enum { CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02, } CUsharedconfig; +typedef enum CUshared_carveout_enum { + CU_SHAREDMEM_CARVEOUT_DEFAULT, + CU_SHAREDMEM_CARVEOUT_MAX_SHARED = 100, + CU_SHAREDMEM_CARVEOUT_MAX_L1 = 0, +} CUshared_carveout; + typedef enum CUmemorytype_enum { CU_MEMORYTYPE_HOST = 0x01, CU_MEMORYTYPE_DEVICE = 0x02, @@ -475,10 +494,6 @@ typedef enum CUjit_option_enum { } CUjit_option; typedef enum CUjit_target_enum { - CU_TARGET_COMPUTE_10 = 10, - CU_TARGET_COMPUTE_11 = 11, - CU_TARGET_COMPUTE_12 = 12, - CU_TARGET_COMPUTE_13 = 13, CU_TARGET_COMPUTE_20 = 20, CU_TARGET_COMPUTE_21 = 21, CU_TARGET_COMPUTE_30 = 30, @@ -491,6 +506,9 @@ typedef enum CUjit_target_enum { CU_TARGET_COMPUTE_60 = 60, CU_TARGET_COMPUTE_61 = 61, CU_TARGET_COMPUTE_62 = 62, + CU_TARGET_COMPUTE_70 = 70, + CU_TARGET_COMPUTE_73 = 73, + CU_TARGET_COMPUTE_75 = 75, } CUjit_target; typedef enum CUjit_fallback_enum { @@ -585,6 +603,7 @@ typedef enum cudaError_enum { CUDA_ERROR_INVALID_PTX = 218, CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219, CUDA_ERROR_NVLINK_UNCORRECTABLE = 220, + CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221, CUDA_ERROR_INVALID_SOURCE = 300, CUDA_ERROR_FILE_NOT_FOUND = 301, CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, @@ -611,6 +630,7 @@ typedef enum cudaError_enum { CUDA_ERROR_INVALID_ADDRESS_SPACE = 717, CUDA_ERROR_INVALID_PC = 718, CUDA_ERROR_LAUNCH_FAILED = 719, + CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720, CUDA_ERROR_NOT_PERMITTED = 800, CUDA_ERROR_NOT_SUPPORTED = 801, CUDA_ERROR_UNKNOWN = 999, @@ -813,6 +833,19 @@ typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st { unsigned long long p2pToken; unsigned int vaSpaceToken; } CUDA_POINTER_ATTRIBUTE_P2P_TOKENS; + +typedef struct CUDA_LAUNCH_PARAMS_st { + CUfunction function; + unsigned int gridDimX; + unsigned int gridDimY; + unsigned int gridDimZ; + unsigned int blockDimX; + unsigned int blockDimY; + unsigned int blockDimZ; + unsigned int sharedMemBytes; + CUstream hStream; + void** kernelParams; +} CUDA_LAUNCH_PARAMS; typedef unsigned int GLenum; typedef unsigned int GLuint; typedef int GLint; @@ -845,6 +878,8 @@ typedef enum { } nvrtcResult; typedef struct _nvrtcProgram* nvrtcProgram; + + /* Function types. */ typedef CUresult CUDAAPI tcuGetErrorString(CUresult error, const char** pStr); typedef CUresult CUDAAPI tcuGetErrorName(CUresult error, const char** pStr); @@ -983,12 +1018,17 @@ typedef CUresult CUDAAPI tcuEventSynchronize(CUevent hEvent); typedef CUresult CUDAAPI tcuEventDestroy_v2(CUevent hEvent); typedef CUresult CUDAAPI tcuEventElapsedTime(float* pMilliseconds, CUevent hStart, CUevent hEnd); typedef CUresult CUDAAPI tcuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); +typedef CUresult CUDAAPI tcuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); typedef CUresult CUDAAPI tcuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); +typedef CUresult CUDAAPI tcuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); typedef CUresult CUDAAPI tcuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams* paramArray, unsigned int flags); typedef CUresult CUDAAPI tcuFuncGetAttribute(int* pi, CUfunction_attribute attrib, CUfunction hfunc); +typedef CUresult CUDAAPI tcuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value); typedef CUresult CUDAAPI tcuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config); typedef CUresult CUDAAPI tcuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config); typedef CUresult CUDAAPI tcuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void** kernelParams, void** extra); +typedef CUresult CUDAAPI tcuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void** kernelParams); +typedef CUresult CUDAAPI tcuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS* launchParamsList, unsigned int numDevices, unsigned int flags); typedef CUresult CUDAAPI tcuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); typedef CUresult CUDAAPI tcuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); typedef CUresult CUDAAPI tcuParamSetSize(CUfunction hfunc, unsigned int numbytes); @@ -1041,9 +1081,9 @@ typedef CUresult CUDAAPI tcuSurfObjectCreate(CUsurfObject* pSurfObject, const CU typedef CUresult CUDAAPI tcuSurfObjectDestroy(CUsurfObject surfObject); typedef CUresult CUDAAPI tcuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC* pResDesc, CUsurfObject surfObject); typedef CUresult CUDAAPI tcuDeviceCanAccessPeer(int* canAccessPeer, CUdevice dev, CUdevice peerDev); -typedef CUresult CUDAAPI tcuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice); typedef CUresult CUDAAPI tcuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags); typedef CUresult CUDAAPI tcuCtxDisablePeerAccess(CUcontext peerContext); +typedef CUresult CUDAAPI tcuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice); typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource); typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray* pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel); typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray* pMipmappedArray, CUgraphicsResource resource); @@ -1217,12 +1257,17 @@ extern tcuEventSynchronize *cuEventSynchronize; extern tcuEventDestroy_v2 *cuEventDestroy_v2; extern tcuEventElapsedTime *cuEventElapsedTime; extern tcuStreamWaitValue32 *cuStreamWaitValue32; +extern tcuStreamWaitValue64 *cuStreamWaitValue64; extern tcuStreamWriteValue32 *cuStreamWriteValue32; +extern tcuStreamWriteValue64 *cuStreamWriteValue64; extern tcuStreamBatchMemOp *cuStreamBatchMemOp; extern tcuFuncGetAttribute *cuFuncGetAttribute; +extern tcuFuncSetAttribute *cuFuncSetAttribute; extern tcuFuncSetCacheConfig *cuFuncSetCacheConfig; extern tcuFuncSetSharedMemConfig *cuFuncSetSharedMemConfig; extern tcuLaunchKernel *cuLaunchKernel; +extern tcuLaunchCooperativeKernel *cuLaunchCooperativeKernel; +extern tcuLaunchCooperativeKernelMultiDevice *cuLaunchCooperativeKernelMultiDevice; extern tcuFuncSetBlockShape *cuFuncSetBlockShape; extern tcuFuncSetSharedSize *cuFuncSetSharedSize; extern tcuParamSetSize *cuParamSetSize; @@ -1275,9 +1320,9 @@ extern tcuSurfObjectCreate *cuSurfObjectCreate; extern tcuSurfObjectDestroy *cuSurfObjectDestroy; extern tcuSurfObjectGetResourceDesc *cuSurfObjectGetResourceDesc; extern tcuDeviceCanAccessPeer *cuDeviceCanAccessPeer; -extern tcuDeviceGetP2PAttribute *cuDeviceGetP2PAttribute; extern tcuCtxEnablePeerAccess *cuCtxEnablePeerAccess; extern tcuCtxDisablePeerAccess *cuCtxDisablePeerAccess; +extern tcuDeviceGetP2PAttribute *cuDeviceGetP2PAttribute; extern tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource; extern tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray; extern tcuGraphicsResourceGetMappedMipmappedArray *cuGraphicsResourceGetMappedMipmappedArray; @@ -1319,7 +1364,12 @@ enum { CUEW_ERROR_ATEXIT_FAILED = -2, }; -int cuewInit(void); +enum { + CUEW_INIT_CUDA = 1, + CUEW_INIT_NVRTC = 2 +}; + +int cuewInit(cuuint32_t flags); const char *cuewErrorString(CUresult result); const char *cuewCompilerPath(void); int cuewCompilerVersion(void); diff --git a/extern/cuew/src/cuew.c b/extern/cuew/src/cuew.c index b68dc597049..329dfbad3aa 100644 --- a/extern/cuew/src/cuew.c +++ b/extern/cuew/src/cuew.c @@ -207,12 +207,17 @@ tcuEventSynchronize *cuEventSynchronize; tcuEventDestroy_v2 *cuEventDestroy_v2; tcuEventElapsedTime *cuEventElapsedTime; tcuStreamWaitValue32 *cuStreamWaitValue32; +tcuStreamWaitValue64 *cuStreamWaitValue64; tcuStreamWriteValue32 *cuStreamWriteValue32; +tcuStreamWriteValue64 *cuStreamWriteValue64; tcuStreamBatchMemOp *cuStreamBatchMemOp; tcuFuncGetAttribute *cuFuncGetAttribute; +tcuFuncSetAttribute *cuFuncSetAttribute; tcuFuncSetCacheConfig *cuFuncSetCacheConfig; tcuFuncSetSharedMemConfig *cuFuncSetSharedMemConfig; tcuLaunchKernel *cuLaunchKernel; +tcuLaunchCooperativeKernel *cuLaunchCooperativeKernel; +tcuLaunchCooperativeKernelMultiDevice *cuLaunchCooperativeKernelMultiDevice; tcuFuncSetBlockShape *cuFuncSetBlockShape; tcuFuncSetSharedSize *cuFuncSetSharedSize; tcuParamSetSize *cuParamSetSize; @@ -265,9 +270,9 @@ tcuSurfObjectCreate *cuSurfObjectCreate; tcuSurfObjectDestroy *cuSurfObjectDestroy; tcuSurfObjectGetResourceDesc *cuSurfObjectGetResourceDesc; tcuDeviceCanAccessPeer *cuDeviceCanAccessPeer; -tcuDeviceGetP2PAttribute *cuDeviceGetP2PAttribute; tcuCtxEnablePeerAccess *cuCtxEnablePeerAccess; tcuCtxDisablePeerAccess *cuCtxDisablePeerAccess; +tcuDeviceGetP2PAttribute *cuDeviceGetP2PAttribute; tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource; tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray; tcuGraphicsResourceGetMappedMipmappedArray *cuGraphicsResourceGetMappedMipmappedArray; @@ -315,34 +320,25 @@ static DynamicLibrary dynamic_library_open_find(const char **paths) { return NULL; } -static void cuewExit(void) { - if(cuda_lib != NULL) { +/* Implementation function. */ +static void cuewCudaExit(void) { + if (cuda_lib != NULL) { /* Ignore errors. */ dynamic_library_close(cuda_lib); cuda_lib = NULL; } } -/* Implementation function. */ -int cuewInit(void) { +static int cuewCudaInit(void) { /* Library paths. */ #ifdef _WIN32 /* Expected in c:/windows/system or similar, no path needed. */ const char *cuda_paths[] = {"nvcuda.dll", NULL}; - const char *nvrtc_paths[] = {"nvrtc64_80.dll", "nvrtc64_90.dll", "nvrtc64_91.dll", NULL}; #elif defined(__APPLE__) /* Default installation path. */ const char *cuda_paths[] = {"/usr/local/cuda/lib/libcuda.dylib", NULL}; - const char *nvrtc_paths[] = {"/usr/local/cuda/lib/libnvrtc.dylib", NULL}; #else const char *cuda_paths[] = {"libcuda.so", NULL}; - const char *nvrtc_paths[] = {"libnvrtc.so", -# if defined(__x86_64__) || defined(_M_X64) - "/usr/local/cuda/lib64/libnvrtc.so", -#else - "/usr/local/cuda/lib/libnvrtc.so", -#endif - NULL}; #endif static int initialized = 0; static int result = 0; @@ -354,7 +350,7 @@ int cuewInit(void) { initialized = 1; - error = atexit(cuewExit); + error = atexit(cuewCudaExit); if (error) { result = CUEW_ERROR_ATEXIT_FAILED; return result; @@ -362,9 +358,7 @@ int cuewInit(void) { /* Load library. */ cuda_lib = dynamic_library_open_find(cuda_paths); - nvrtc_lib = dynamic_library_open_find(nvrtc_paths); - /* CUDA library is mandatory to have, while nvrtc might be missing. */ if (cuda_lib == NULL) { result = CUEW_ERROR_OPEN_FAILED; return result; @@ -521,12 +515,17 @@ int cuewInit(void) { CUDA_LIBRARY_FIND(cuEventDestroy_v2); CUDA_LIBRARY_FIND(cuEventElapsedTime); CUDA_LIBRARY_FIND(cuStreamWaitValue32); + CUDA_LIBRARY_FIND(cuStreamWaitValue64); CUDA_LIBRARY_FIND(cuStreamWriteValue32); + CUDA_LIBRARY_FIND(cuStreamWriteValue64); CUDA_LIBRARY_FIND(cuStreamBatchMemOp); CUDA_LIBRARY_FIND(cuFuncGetAttribute); + CUDA_LIBRARY_FIND(cuFuncSetAttribute); CUDA_LIBRARY_FIND(cuFuncSetCacheConfig); CUDA_LIBRARY_FIND(cuFuncSetSharedMemConfig); CUDA_LIBRARY_FIND(cuLaunchKernel); + CUDA_LIBRARY_FIND(cuLaunchCooperativeKernel); + CUDA_LIBRARY_FIND(cuLaunchCooperativeKernelMultiDevice); CUDA_LIBRARY_FIND(cuFuncSetBlockShape); CUDA_LIBRARY_FIND(cuFuncSetSharedSize); CUDA_LIBRARY_FIND(cuParamSetSize); @@ -579,9 +578,9 @@ int cuewInit(void) { CUDA_LIBRARY_FIND(cuSurfObjectDestroy); CUDA_LIBRARY_FIND(cuSurfObjectGetResourceDesc); CUDA_LIBRARY_FIND(cuDeviceCanAccessPeer); - CUDA_LIBRARY_FIND(cuDeviceGetP2PAttribute); CUDA_LIBRARY_FIND(cuCtxEnablePeerAccess); CUDA_LIBRARY_FIND(cuCtxDisablePeerAccess); + CUDA_LIBRARY_FIND(cuDeviceGetP2PAttribute); CUDA_LIBRARY_FIND(cuGraphicsUnregisterResource); CUDA_LIBRARY_FIND(cuGraphicsSubResourceGetMappedArray); CUDA_LIBRARY_FIND(cuGraphicsResourceGetMappedMipmappedArray); @@ -604,27 +603,99 @@ int cuewInit(void) { CUDA_LIBRARY_FIND(cuGLMapBufferObjectAsync_v2); CUDA_LIBRARY_FIND(cuGLUnmapBufferObjectAsync); + result = CUEW_SUCCESS; + return result; +} +static void cuewExitNvrtc(void) { if (nvrtc_lib != NULL) { - NVRTC_LIBRARY_FIND(nvrtcGetErrorString); - NVRTC_LIBRARY_FIND(nvrtcVersion); - NVRTC_LIBRARY_FIND(nvrtcCreateProgram); - NVRTC_LIBRARY_FIND(nvrtcDestroyProgram); - NVRTC_LIBRARY_FIND(nvrtcCompileProgram); - NVRTC_LIBRARY_FIND(nvrtcGetPTXSize); - NVRTC_LIBRARY_FIND(nvrtcGetPTX); - NVRTC_LIBRARY_FIND(nvrtcGetProgramLogSize); - NVRTC_LIBRARY_FIND(nvrtcGetProgramLog); - NVRTC_LIBRARY_FIND(nvrtcAddNameExpression); - NVRTC_LIBRARY_FIND(nvrtcGetLoweredName); + /* Ignore errors. */ + dynamic_library_close(nvrtc_lib); + nvrtc_lib = NULL; } +} + +static int cuewNvrtcInit(void) { + /* Library paths. */ +#ifdef _WIN32 + /* Expected in c:/windows/system or similar, no path needed. */ + const char *nvrtc_paths[] = {"nvrtc64_80.dll", "nvrtc64_90.dll", "nvrtc64_91.dll", NULL}; +#elif defined(__APPLE__) + /* Default installation path. */ + const char *nvrtc_paths[] = {"/usr/local/cuda/lib/libnvrtc.dylib", NULL}; +#else + const char *nvrtc_paths[] = {"libnvrtc.so", +# if defined(__x86_64__) || defined(_M_X64) + "/usr/local/cuda/lib64/libnvrtc.so", +#else + "/usr/local/cuda/lib/libnvrtc.so", +#endif + NULL}; +#endif + static int initialized = 0; + static int result = 0; + int error; + + if (initialized) { + return result; + } + + initialized = 1; + + error = atexit(cuewExitNvrtc); + if (error) { + result = CUEW_ERROR_ATEXIT_FAILED; + return result; + } + + /* Load library. */ + nvrtc_lib = dynamic_library_open_find(nvrtc_paths); + + if (nvrtc_lib == NULL) { + result = CUEW_ERROR_OPEN_FAILED; + return result; + } + + NVRTC_LIBRARY_FIND(nvrtcGetErrorString); + NVRTC_LIBRARY_FIND(nvrtcVersion); + NVRTC_LIBRARY_FIND(nvrtcCreateProgram); + NVRTC_LIBRARY_FIND(nvrtcDestroyProgram); + NVRTC_LIBRARY_FIND(nvrtcCompileProgram); + NVRTC_LIBRARY_FIND(nvrtcGetPTXSize); + NVRTC_LIBRARY_FIND(nvrtcGetPTX); + NVRTC_LIBRARY_FIND(nvrtcGetProgramLogSize); + NVRTC_LIBRARY_FIND(nvrtcGetProgramLog); + NVRTC_LIBRARY_FIND(nvrtcAddNameExpression); + NVRTC_LIBRARY_FIND(nvrtcGetLoweredName); result = CUEW_SUCCESS; return result; } + +int cuewInit(cuuint32_t flags) { + int result = CUEW_SUCCESS; + + if (flags & CUEW_INIT_CUDA) { + result = cuewCudaInit(); + if (result != CUEW_SUCCESS) { + return result; + } + } + + if (flags & CUEW_INIT_NVRTC) { + result = cuewNvrtcInit(); + if (result != CUEW_SUCCESS) { + return result; + } + } + + return result; +} + + const char *cuewErrorString(CUresult result) { - switch(result) { + switch (result) { case CUDA_SUCCESS: return "No errors"; case CUDA_ERROR_INVALID_VALUE: return "Invalid value"; case CUDA_ERROR_OUT_OF_MEMORY: return "Out of memory"; @@ -655,6 +726,7 @@ const char *cuewErrorString(CUresult result) { case CUDA_ERROR_INVALID_PTX: return "Invalid ptx"; case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: return "Invalid graphics context"; case CUDA_ERROR_NVLINK_UNCORRECTABLE: return "Nvlink uncorrectable"; + case CUDA_ERROR_JIT_COMPILER_NOT_FOUND: return "Jit compiler not found"; case CUDA_ERROR_INVALID_SOURCE: return "Invalid source"; case CUDA_ERROR_FILE_NOT_FOUND: return "File not found"; case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "Link to a shared object failed to resolve"; @@ -681,6 +753,7 @@ const char *cuewErrorString(CUresult result) { case CUDA_ERROR_INVALID_ADDRESS_SPACE: return "Invalid address space"; case CUDA_ERROR_INVALID_PC: return "Invalid pc"; case CUDA_ERROR_LAUNCH_FAILED: return "Launch failed"; + case CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE: return "Cooperative launch too large"; case CUDA_ERROR_NOT_PERMITTED: return "Not permitted"; case CUDA_ERROR_NOT_SUPPORTED: return "Not supported"; case CUDA_ERROR_UNKNOWN: return "Unknown error"; @@ -738,14 +811,16 @@ const char *cuewCompilerPath(void) { if (binpath) { path_join(binpath, executable, sizeof(nvcc), nvcc); - if (path_exists(nvcc)) + if (path_exists(nvcc)) { return nvcc; + } } for (i = 0; defaultpaths[i]; ++i) { path_join(defaultpaths[i], executable, sizeof(nvcc), nvcc); - if (path_exists(nvcc)) + if (path_exists(nvcc)) { return nvcc; + } } #ifndef _WIN32 @@ -756,9 +831,9 @@ const char *cuewCompilerPath(void) { int len = fread(buffer, 1, sizeof(buffer) - 1, handle); buffer[len] = '\0'; pclose(handle); - - if (buffer[0]) + if (buffer[0]) { return "nvcc"; + } } } #endif @@ -785,8 +860,9 @@ int cuewCompilerVersion(void) { char output[65536] = "\0"; char command[65536] = "\0"; - if (path == NULL) + if (path == NULL) { return 0; + } /* get --version output */ strncpy(command, path, sizeof(command)); diff --git a/intern/cycles/app/cycles_cubin_cc.cpp b/intern/cycles/app/cycles_cubin_cc.cpp index 73d0cd5130c..3c83bf2dae3 100644 --- a/intern/cycles/app/cycles_cubin_cc.cpp +++ b/intern/cycles/app/cycles_cubin_cc.cpp @@ -162,7 +162,7 @@ bool link_ptxas(CompilationSettings &settings) if (settings.verbose) { ptx += " --verbose"; - printf(ptx.c_str()); + printf("%s\n", ptx.c_str()); } int pxresult = system(ptx.c_str()); @@ -186,7 +186,7 @@ bool init(CompilationSettings &settings) } #endif - int cuewresult = cuewInit(); + int cuewresult = cuewInit(CUEW_INIT_NVRTC); if(cuewresult != CUEW_SUCCESS) { fprintf(stderr, "Error: cuew init fialed (0x%x)\n\n", cuewresult); return false; diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 18a83672a6d..dfedf922ca9 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -2449,7 +2449,7 @@ bool device_cuda_init(void) return result; initialized = true; - int cuew_result = cuewInit(); + int cuew_result = cuewInit(CUEW_INIT_CUDA); if(cuew_result == CUEW_SUCCESS) { VLOG(1) << "CUEW initialization succeeded"; if(CUDADevice::have_precompiled_kernels()) { diff --git a/intern/opensubdiv/opensubdiv_device_context_cuda.cc b/intern/opensubdiv/opensubdiv_device_context_cuda.cc index 81c52f5d6cf..46b66a6b35e 100644 --- a/intern/opensubdiv/opensubdiv_device_context_cuda.cc +++ b/intern/opensubdiv/opensubdiv_device_context_cuda.cc @@ -187,7 +187,7 @@ bool CudaDeviceContext::HAS_CUDA_VERSION_4_0() cudaInitialized = true; # ifdef OPENSUBDIV_HAS_CUEW - cudaLoadSuccess = cuewInit() == CUEW_SUCCESS; + cudaLoadSuccess = cuewInit(CUEW_INIT_CUDA) == CUEW_SUCCESS; if (!cudaLoadSuccess) { fprintf(stderr, "Loading CUDA failed.\n"); } |