diff options
Diffstat (limited to 'intern/opensubdiv/opensubdiv_device_context_cuda.cc')
-rw-r--r-- | intern/opensubdiv/opensubdiv_device_context_cuda.cc | 237 |
1 files changed, 237 insertions, 0 deletions
/*
 * Adopted from OpenSubdiv with the following license:
 *
 *   Copyright 2015 Pixar
 *
 *   Licensed under the Apache License, Version 2.0 (the "Apache License")
 *   with the following modification; you may not use this file except in
 *   compliance with the Apache License and the following modification to it:
 *   Section 6. Trademarks. is deleted and replaced with:
 *
 *   6. Trademarks. This License does not grant permission to use the trade
 *      names, trademarks, service marks, or product names of the Licensor
 *      and its affiliates, except as required to comply with Section 4(c) of
 *      the License and to reproduce the content of the NOTICE file.
 *
 *   You may obtain a copy of the Apache License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the Apache License with the above modification is
 *   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *   KIND, either express or implied. See the Apache License for the specific
 *   language governing permissions and limitations under the Apache License.
 */

#ifdef OPENSUBDIV_HAS_CUDA

#ifdef _MSC_VER
#  include "iso646.h"
#endif

#include "opensubdiv_device_context_cuda.h"

#if defined(_WIN32)
#  include <windows.h>
#elif defined(__APPLE__)
#  include <OpenGL/OpenGL.h>
#else
#  include <X11/Xlib.h>
#  include <GL/glx.h>
#endif

#include <cstdio>
#include <algorithm>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_gl_interop.h>

/* Verbose tracing is compiled out by default; swap the two definitions of
 * message() to get the trace on stderr. error() always prints to stderr.
 */
#define message(fmt, ...)
//#define message(fmt, ...)  fprintf(stderr, fmt, __VA_ARGS__)
#define error(fmt, ...)  \
  fprintf(stderr, fmt, __VA_ARGS__)

/* Find the CUDA device associated with the current GL context.
 *
 * Returns the single interop device reported by cudaGLGetDevices(), or 0
 * when the query fails or does not report exactly one device.
 */
static int _GetCudaDeviceForCurrentGLContext()
{
  // Find and use the CUDA device for the current GL context
  unsigned int interopDeviceCount = 0;
  int interopDevices[1];
  const cudaError_t status = cudaGLGetDevices(&interopDeviceCount,
                                              interopDevices,
                                              1,
                                              cudaGLDeviceListCurrentFrame);
  /* Any failure (not only cudaErrorNoDevice) leaves interopDevices
   * unusable, so fall back to device 0 in all of those cases.
   */
  if (status != cudaSuccess || interopDeviceCount != 1) {
    message("CUDA no interop devices found.\n");
    return 0;
  }
  const int device = interopDevices[0];

#if defined(_WIN32)
  return device;

#elif defined(__APPLE__)
  return device;

#else  // X11
  Display *display = glXGetCurrentDisplay();
  if (display == NULL) {
    /* No display is current on this thread: there is no screen to
     * compare against, and DefaultScreen() would dereference NULL.
     */
    return device;
  }
  const int screen = DefaultScreen(display);
  if (device != screen) {
    error("The CUDA interop device (%d) does not match "
          "the screen used by the current GL context (%d), "
          "which may cause slow performance on systems "
          "with multiple GPU devices.\n",
          device, screen);
  }
  message("CUDA init using device for current GL context: %d\n", device);
  return device;
#endif
}

/* From "NVIDIA GPU Computing SDK 4.2/C/common/inc/cutil_inline_runtime.h",
 * with the architecture table extended past SM 3.0.
 */

/* Beginning of GPU Architecture definitions */

/* Map a compute capability (major.minor) to the number of cores per SM.
 *
 * For a capability that is not in the table the value of the newest known
 * architecture is returned (and a note is printed to stderr) instead of a
 * negative number, so that callers multiplying by the result still get a
 * usable, positive performance estimate on hardware newer than this table.
 */
inline int _ConvertSMVer2Cores_local(int major, int minor)
{
  /* Defines for GPU Architecture types (using the SM version to determine
   * the # of cores per SM
   */
  typedef struct {
    int SM;  /* 0xMm (hexadecimal notation),
              * M = SM Major version,
              * and m = SM minor version
              */
    int Cores;
  } sSMtoCores;

  static const sSMtoCores nGpuArchCoresPerSM[] = {
    { 0x10,   8 },  /* Tesla Generation   (SM 1.0) G80 class */
    { 0x11,   8 },  /* Tesla Generation   (SM 1.1) G8x class */
    { 0x12,   8 },  /* Tesla Generation   (SM 1.2) G9x class */
    { 0x13,   8 },  /* Tesla Generation   (SM 1.3) GT200 class */
    { 0x20,  32 },  /* Fermi Generation   (SM 2.0) GF100 class */
    { 0x21,  48 },  /* Fermi Generation   (SM 2.1) GF10x class */
    { 0x30, 192 },  /* Kepler Generation  (SM 3.0) GK10x class */
    { 0x32, 192 },  /* Kepler Generation  (SM 3.2) */
    { 0x35, 192 },  /* Kepler Generation  (SM 3.5) */
    { 0x37, 192 },  /* Kepler Generation  (SM 3.7) */
    { 0x50, 128 },  /* Maxwell Generation (SM 5.0) */
    { 0x52, 128 },  /* Maxwell Generation (SM 5.2) */
    { 0x53, 128 },  /* Maxwell Generation (SM 5.3) */
    { 0x60,  64 },  /* Pascal Generation  (SM 6.0) */
    { 0x61, 128 },  /* Pascal Generation  (SM 6.1) */
    { 0x62, 128 },  /* Pascal Generation  (SM 6.2) */
    { 0x70,  64 },  /* Volta Generation   (SM 7.0) */
    { 0x72,  64 },  /* Volta Generation   (SM 7.2) */
    { 0x75,  64 },  /* Turing Generation  (SM 7.5) */
    { 0x80,  64 },  /* Ampere Generation  (SM 8.0) */
    { 0x86, 128 },  /* Ampere Generation  (SM 8.6) */
    { 0x87, 128 },  /* Ampere Generation  (SM 8.7) */
    { 0x89, 128 },  /* Ada Generation     (SM 8.9) */
    { 0x90, 128 },  /* Hopper Generation  (SM 9.0) */
    {   -1,  -1 }   /* Sentinel, must stay last. */
  };

  const int sm_version = (major << 4) + minor;
  int index = 0;
  while (nGpuArchCoresPerSM[index].SM != -1) {
    if (nGpuArchCoresPerSM[index].SM == sm_version) {
      return nGpuArchCoresPerSM[index].Cores;
    }
    index++;
  }

  /* `index` now addresses the sentinel, so `index - 1` is the newest
   * architecture this table knows about.
   */
  const int fallback_cores = nGpuArchCoresPerSM[index - 1].Cores;
  fprintf(stderr,
          "MapSMtoCores undefined SMversion %d.%d, "
          "assuming %d cores per SM.\n",
          major, minor, fallback_cores);
  return fallback_cores;
}
/* End of GPU Architecture definitions. */

/* This function returns the best GPU (with maximum GFLOPS).
 *
 * The return value is the device ordinal, or -1 when the driver reports no
 * device or none of the devices could be queried.
 */
inline int cutGetMaxGflopsDeviceId()
{
  int device_count = 0;
  if (cuDeviceGetCount(&device_count) != CUDA_SUCCESS || device_count <= 0) {
    return -1;
  }

  /* Find the best major SM Architecture GPU device. */
  int best_SM_arch = 0;
  for (int ordinal = 0; ordinal < device_count; ++ordinal) {
    CUdevice device;
    int compat_major = 0, compat_minor = 0;
    if (cuDeviceGet(&device, ordinal) != CUDA_SUCCESS ||
        cuDeviceComputeCapability(&compat_major,
                                  &compat_minor,
                                  device) != CUDA_SUCCESS)
    {
      /* Skip devices the driver can not describe. */
      continue;
    }
    if (compat_major > 0 && compat_major < 9999) {
      best_SM_arch = std::max(best_SM_arch, compat_major);
    }
  }

  /* Find the best CUDA capable GPU device. */
  int max_perf_device = -1;
  /* 64 bit: SM count * cores per SM * clock rate does not fit into
   * a 32 bit int on current hardware.
   */
  long long max_compute_perf = 0;
  for (int ordinal = 0; ordinal < device_count; ++ordinal) {
    CUdevice device;
    int compat_major = 0, compat_minor = 0;
    if (cuDeviceGet(&device, ordinal) != CUDA_SUCCESS ||
        cuDeviceComputeCapability(&compat_major,
                                  &compat_minor,
                                  device) != CUDA_SUCCESS)
    {
      continue;
    }

    int sm_per_multiproc;
    if (compat_major == 9999 && compat_minor == 9999) {
      sm_per_multiproc = 1;
    }
    else {
      sm_per_multiproc = _ConvertSMVer2Cores_local(compat_major,
                                                   compat_minor);
    }

    int multi_processor_count = 0;
    int clock_rate = 0;
    if (cuDeviceGetAttribute(&multi_processor_count,
                             CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                             device) != CUDA_SUCCESS ||
        cuDeviceGetAttribute(&clock_rate,
                             CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
                             device) != CUDA_SUCCESS)
    {
      continue;
    }

    const long long compute_perf = (long long)multi_processor_count *
                                   sm_per_multiproc *
                                   clock_rate;
    if (compute_perf <= max_compute_perf) {
      continue;
    }
    /* If we find GPU with SM major > 2, search only these:
     * devices of an older major architecture are passed over.
     */
    if (best_SM_arch > 2 && compat_major != best_SM_arch) {
      continue;
    }
    max_compute_perf = compute_perf;
    max_perf_device = ordinal;
  }
  return max_perf_device;
}

/* Check whether CUDA can be used at all.
 *
 * The probe runs once; its result is cached in function-local statics and
 * returned by every later call.
 */
bool CudaDeviceContext::HAS_CUDA_VERSION_4_0()
{
#ifdef OPENSUBDIV_HAS_CUDA
  static bool cudaInitialized = false;
  static bool cudaLoadSuccess = true;
  if (!cudaInitialized) {
    cudaInitialized = true;

#  ifdef OPENSUBDIV_HAS_CUEW
    cudaLoadSuccess = cuewInit() == CUEW_SUCCESS;
    if (!cudaLoadSuccess) {
      fprintf(stderr, "Loading CUDA failed.\n");
    }
#  endif
    /* Do not touch the driver API when loading the library failed. */
    if (cudaLoadSuccess) {
      // Need to initialize CUDA here so getting device
      // with the maximum FLOPS works fine.
      //
      // The device lookup is to deal with cases like NVidia Optimus,
      // when there might be CUDA library installed but
      // NVidia card is not being active.
      cudaLoadSuccess = (cuInit(0) == CUDA_SUCCESS) &&
                        (cutGetMaxGflopsDeviceId() >= 0);
    }
  }
  return cudaLoadSuccess;
#else
  return false;
#endif
}

/* The context starts out uninitialized; Initialize() does the actual work. */
CudaDeviceContext::CudaDeviceContext()
    : _initialized(false)
{
}

CudaDeviceContext::~CudaDeviceContext()
{
  /* Only reset the device when Initialize() succeeded on this context,
   * so a context which was never set up does not reset the device.
   */
  if (_initialized) {
    cudaDeviceReset();
  }
}

/* Select the CUDA device matching the current GL context for GL interop.
 *
 * Returns false when no CUDA device is available or the device could not
 * be selected, true otherwise.
 */
bool CudaDeviceContext::Initialize()
{
  /* See if any cuda device is available. */
  int deviceCount = 0;
  if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
    /* Clear the recorded error so later runtime calls do not report it. */
    (void)cudaGetLastError();
    return false;
  }
  message("CUDA device count: %d\n", deviceCount);
  if (deviceCount <= 0) {
    return false;
  }

  const cudaError_t status =
      cudaGLSetGLDevice(_GetCudaDeviceForCurrentGLContext());
  if (status == cudaErrorSetOnActiveProcess) {
    /* The runtime already has an active device. This used to be silently
     * accepted, keep doing so but clear the recorded error.
     */
    (void)cudaGetLastError();
  }
  else if (status != cudaSuccess) {
    error("CUDA failed to select the GL interop device: %s\n",
          cudaGetErrorString(status));
    return false;
  }
  _initialized = true;
  return true;
}

#endif  /* OPENSUBDIV_HAS_CUDA */