diff options
Diffstat (limited to 'intern/opensubdiv/internal/opensubdiv_device_context_cuda.cc')
-rw-r--r-- | intern/opensubdiv/internal/opensubdiv_device_context_cuda.cc | 226 |
1 file changed, 226 insertions, 0 deletions
// Adopted from OpenSubdiv with the following license:
//
// Copyright 2015 Pixar
//
// Licensed under the Apache License, Version 2.0 (the "Apache License")
// with the following modification; you may not use this file except in
// compliance with the Apache License and the following modification to it:
// Section 6. Trademarks. is deleted and replaced with:
//
// 6. Trademarks. This License does not grant permission to use the trade
//    names, trademarks, service marks, or product names of the Licensor
//    and its affiliates, except as required to comply with Section 4(c) of
//    the License and to reproduce the content of the NOTICE file.
//
// You may obtain a copy of the Apache License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the Apache License with the above modification is
// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the Apache License for the specific
// language governing permissions and limitations under the Apache License.

#ifdef OPENSUBDIV_HAS_CUDA

#ifdef _MSC_VER
#  include <iso646.h>
#endif

#include "opensubdiv_device_context_cuda.h"

#if defined(_WIN32)
#  include <windows.h>
#elif defined(__APPLE__)
#  include <OpenGL/OpenGL.h>
#else
#  include <GL/glx.h>
#  include <X11/Xlib.h>
#endif

#include <cuda.h>
#include <cuda_gl_interop.h>
#include <cuda_runtime_api.h>
#include <algorithm>
#include <cstdio>

// Debug tracing is compiled out by default; swap the two message() lines to
// enable it.  error() always reports to stderr.
#define message(fmt, ...)
// #define message(fmt, ...) fprintf(stderr, fmt, __VA_ARGS__)
#define error(fmt, ...) fprintf(stderr, fmt, __VA_ARGS__)

namespace {

// Returns the index of the CUDA device that drives the currently bound GL
// context, so that GL interop buffers stay on the rendering GPU.
// Falls back to device 0 when no (or more than one) interop device is
// reported by the runtime.
int getCudaDeviceForCurrentGLContext() {
  // Find and use the CUDA device for the current GL context
  unsigned int interop_device_count = 0;
  int interopDevices[1];
  cudaError_t status = cudaGLGetDevices(&interop_device_count,
                                        interopDevices,
                                        1,
                                        cudaGLDeviceListCurrentFrame);
  if (status == cudaErrorNoDevice || interop_device_count != 1) {
    message("CUDA no interop devices found.\n");
    return 0;
  }
  int device = interopDevices[0];
#if defined(_WIN32)
  return device;
#elif defined(__APPLE__)
  return device;
#else  // X11
  // NOTE(review): comparing a CUDA device ordinal against an X11 screen
  // number is only a heuristic mismatch check, kept as in upstream.
  Display* display = glXGetCurrentDisplay();
  int screen = DefaultScreen(display);
  if (device != screen) {
    error("The CUDA interop device (%d) does not match "
          "the screen used by the current GL context (%d), "
          "which may cause slow performance on systems "
          "with multiple GPU devices.",
          device, screen);
  }
  message("CUDA init using device for current GL context: %d\n", device);
  return device;
#endif
}

// Beginning of GPU Architecture definitions.
// Maps a compute capability (major.minor) to the number of CUDA cores per
// streaming multiprocessor.  Returns -1 (after logging) for architectures
// missing from the table below.
int convertSMVer2Cores_local(int major, int minor) {
  // Defines for GPU Architecture types (using the SM version to determine
  // the # of cores per SM
  typedef struct {
    int SM;  // 0xMm (hexadecimal notation),
             // M = SM Major version,
             // and m = SM minor version
    int Cores;
  } sSMtoCores;

  sSMtoCores nGpuArchCoresPerSM[] = {
      {0x10, 8},    // Tesla Generation (SM 1.0) G80 class.
      {0x11, 8},    // Tesla Generation (SM 1.1) G8x class.
      {0x12, 8},    // Tesla Generation (SM 1.2) G9x class.
      {0x13, 8},    // Tesla Generation (SM 1.3) GT200 class.
      {0x20, 32},   // Fermi Generation (SM 2.0) GF100 class.
      {0x21, 48},   // Fermi Generation (SM 2.1) GF10x class.
      {0x30, 192},  // Kepler Generation (SM 3.0) GK10x class.
      {-1, -1}};    // Sentinel terminating the lookup loop.
  int index = 0;
  while (nGpuArchCoresPerSM[index].SM != -1) {
    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
      return nGpuArchCoresPerSM[index].Cores;
    }
    index++;
  }
  printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
  return -1;
}

// This function returns the best GPU (with maximum GFLOPS), estimated as
// multiprocessor count * cores per multiprocessor * clock rate.
// Returns -1 when no usable CUDA device exists, which callers treat as
// "CUDA unavailable".
int cutGetMaxGflopsDeviceId() {
  int current_device = 0, sm_per_multiproc = 0;
  int max_compute_perf = 0, max_perf_device = -1;
  int device_count = 0, best_SM_arch = 0;
  int compat_major, compat_minor;
  cuDeviceGetCount(&device_count);
  // Find the best major SM Architecture GPU device.
  while (current_device < device_count) {
    cuDeviceComputeCapability(&compat_major, &compat_minor, current_device);
    if (compat_major > 0 && compat_major < 9999) {
      best_SM_arch = std::max(best_SM_arch, compat_major);
    }
    current_device++;
  }
  // Find the best CUDA capable GPU device.
  current_device = 0;
  while (current_device < device_count) {
    cuDeviceComputeCapability(&compat_major, &compat_minor, current_device);
    if (compat_major == 9999 && compat_minor == 9999) {
      // 9999.9999 is the "no CUDA capable device" marker; weight minimally.
      sm_per_multiproc = 1;
    } else {
      sm_per_multiproc = convertSMVer2Cores_local(compat_major, compat_minor);
    }
    int multi_processor_count;
    cuDeviceGetAttribute(&multi_processor_count,
                         CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                         current_device);
    int clock_rate;
    cuDeviceGetAttribute(&clock_rate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
                         current_device);
    int compute_perf = multi_processor_count * sm_per_multiproc * clock_rate;
    if (compute_perf > max_compute_perf) {
      /* If we find GPU with SM major > 2, search only these */
      if (best_SM_arch > 2) {
        /* If our device's SM major == best_SM_arch, choose this, else pass. */
        if (compat_major == best_SM_arch) {
          max_compute_perf = compute_perf;
          max_perf_device = current_device;
        }
      } else {
        max_compute_perf = compute_perf;
        max_perf_device = current_device;
      }
    }
    ++current_device;
  }
  return max_perf_device;
}

}  // namespace

// Probes once per process whether a usable CUDA driver and an active CUDA
// capable device are present; the result is cached in function-local statics.
bool CudaDeviceContext::HAS_CUDA_VERSION_4_0() {
#ifdef OPENSUBDIV_HAS_CUDA
  static bool cuda_initialized = false;
  static bool cuda_load_success = true;
  if (!cuda_initialized) {
    cuda_initialized = true;

#ifdef OPENSUBDIV_HAS_CUEW
    cuda_load_success = cuewInit(CUEW_INIT_CUDA) == CUEW_SUCCESS;
    if (!cuda_load_success) {
      fprintf(stderr, "Loading CUDA failed.\n");
    }
#endif
    // Need to initialize CUDA here so getting device
    // with the maximum GFLOPS works fine.
    if (cuInit(0) == CUDA_SUCCESS) {
      // This is to deal with cases like NVidia Optimus,
      // when there might be CUDA library installed but
      // NVidia card is not being active.
      if (cutGetMaxGflopsDeviceId() < 0) {
        cuda_load_success = false;
      }
    } else {
      cuda_load_success = false;
    }
  }
  return cuda_load_success;
#else
  return false;
#endif
}

CudaDeviceContext::CudaDeviceContext()
    : initialized_(false) {
}

CudaDeviceContext::~CudaDeviceContext() {
  cudaDeviceReset();
}

// Binds the CUDA device matching the current GL context and marks this
// context ready for use.  Returns false when no CUDA device is available.
bool CudaDeviceContext::Initialize() {
  // See if any cuda device is available.
  int device_count = 0;
  cudaGetDeviceCount(&device_count);
  message("CUDA device count: %d\n", device_count);
  if (device_count <= 0) {
    return false;
  }
  cudaGLSetGLDevice(getCudaDeviceForCurrentGLContext());
  initialized_ = true;
  return true;
}

bool CudaDeviceContext::IsInitialized() const {
  return initialized_;
}

#endif  // OPENSUBDIV_HAS_CUDA