Cycles: Improve CUDA and OptiX error reporting in the viewport

This patch makes the infamous "Cancel" error in the viewport a thing of the past. Instead, the viewport now shows a more useful error message. The patch also streamlines the error handling process in CUDA.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D8008
author: Patrick Mours <pmours@nvidia.com> 2020-06-12 17:42:49 +0300
committer: Patrick Mours <pmours@nvidia.com> 2020-06-12 19:24:15 +0300
commit: b586f801fc921f9f420260fb3ff4f26cb6773157 (patch)
tree: 8370736aec591a2424c09aa35863a4ba3f92dae4 /intern
parent: 5dca72dfc924ff931ae46b35a6342beec87f9fc4 (diff)
4 files changed, 65 insertions, 85 deletions
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
index 9f31ed12cf4..1aa2fdd0967 100644
--- a/intern/cycles/device/cuda/device_cuda.h
+++ b/intern/cycles/device/cuda/device_cuda.h
@@ -100,11 +100,7 @@ class CUDADevice : public Device {
 
   virtual BVHLayoutMask get_bvh_layout_mask() const;
 
-  void cuda_error_documentation();
-
-  bool cuda_error_(CUresult result, const string &stmt);
-
-  void cuda_error_message(const string &message);
+  void set_error(const string &error) override;
 
   CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_);
 
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
index 64c7f5e7d34..7aa63ff48c3 100644
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -135,8 +135,10 @@ BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
   return BVH_LAYOUT_BVH2;
 }
 
-void CUDADevice::cuda_error_documentation()
+void CUDADevice::set_error(const string &error)
 {
+  Device::set_error(error);
+
   if (first_error) {
     fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
     fprintf(stderr,
@@ -148,42 +150,13 @@ void CUDADevice::cuda_error_documentation()
 #  define cuda_assert(stmt) \
     { \
       CUresult result = stmt; \
-\
       if (result != CUDA_SUCCESS) { \
-        string message = string_printf( \
-            "CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
-        if (error_msg == "") \
-          error_msg = message; \
-        fprintf(stderr, "%s\n", message.c_str()); \
-        /*cuda_abort();*/ \
-        cuda_error_documentation(); \
+        const char *name = cuewErrorString(result); \
+        set_error(string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
       } \
     } \
     (void)0
 
-bool CUDADevice::cuda_error_(CUresult result, const string &stmt)
-{
-  if (result == CUDA_SUCCESS)
-    return false;
-
-  string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result));
-  if (error_msg == "")
-    error_msg = message;
-  fprintf(stderr, "%s\n", message.c_str());
-  cuda_error_documentation();
-  return true;
-}
-
-#  define cuda_error(stmt) cuda_error_(stmt, #  stmt)
-
-void CUDADevice::cuda_error_message(const string &message)
-{
-  if (error_msg == "")
-    error_msg = message;
-  fprintf(stderr, "%s\n", message.c_str());
-  cuda_error_documentation();
-}
-
 CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
     : Device(info, stats, profiler, background_), texture_info(this, "__texture_info", MEM_GLOBAL)
 {
@@ -212,12 +185,19 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
   functions.loaded = false;
 
   /* Intialize CUDA. */
-  if (cuda_error(cuInit(0)))
+  CUresult result = cuInit(0);
+  if (result != CUDA_SUCCESS) {
+    set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
     return;
+  }
 
   /* Setup device and context. */
-  if (cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
+  result = cuDeviceGet(&cuDevice, cuDevId);
+  if (result != CUDA_SUCCESS) {
+    set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
+                            cuewErrorString(result)));
     return;
+  }
 
   /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
    * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
@@ -235,8 +215,6 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
   }
 
   /* Create context. */
-  CUresult result;
-
   if (background) {
     result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
   }
@@ -249,8 +227,10 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
     }
   }
 
-  if (cuda_error_(result, "cuCtxCreate"))
+  if (result != CUDA_SUCCESS) {
+    set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
     return;
+  }
 
   int major, minor;
   cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
@@ -280,10 +260,8 @@ bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_feat
 
   /* We only support sm_30 and above */
   if (major < 3) {
-    cuda_error_message(
-        string_printf("CUDA device supported only with compute capability 3.0 or up, found %d.%d.",
-                      major,
-                      minor));
+    set_error(string_printf(
+        "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
     return false;
   }
 
@@ -319,13 +297,19 @@ bool CUDADevice::check_peer_access(Device *peer_device)
   // Enable peer access in both directions
   {
     const CUDAContextScope scope(this);
-    if (cuda_error(cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0))) {
+    CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
+    if (result != CUDA_SUCCESS) {
+      set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
+                              cuewErrorString(result)));
       return false;
     }
   }
   {
     const CUDAContextScope scope(peer_device_cuda);
-    if (cuda_error(cuCtxEnablePeerAccess(cuContext, 0))) {
+    CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
+    if (result != CUDA_SUCCESS) {
+      set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
+                              cuewErrorString(result)));
       return false;
     }
   }
@@ -432,14 +416,14 @@ string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_featu
 #  ifdef _WIN32
   if (!use_adaptive_compilation() && have_precompiled_kernels()) {
     if (major < 3) {
-      cuda_error_message(
-          string_printf("CUDA device requires compute capability 3.0 or up, "
-                        "found %d.%d. Your GPU is not supported.",
+      set_error(
+          string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
+                        "Your GPU is not supported.",
                         major,
                         minor));
     }
     else {
-      cuda_error_message(
+      set_error(
           string_printf("CUDA binary kernel for this graphics card compute "
                         "capability (%d.%d) not found.",
                         major,
@@ -452,7 +436,7 @@ string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_featu
   /* Compile. */
   const char *const nvcc = cuewCompilerPath();
   if (nvcc == NULL) {
-    cuda_error_message(
+    set_error(
         "CUDA nvcc compiler not found. "
         "Install CUDA toolkit in default location.");
     return string();
@@ -504,7 +488,7 @@ string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_featu
   command = "call " + command;
 #  endif
   if (system(command.c_str()) != 0) {
-    cuda_error_message(
+    set_error(
         "Failed to execute compilation command, "
         "see console for details.");
     return string();
@@ -512,7 +496,7 @@ string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_featu
 
   /* Verify if compilation succeeded */
   if (!path_exists(cubin)) {
-    cuda_error_message(
+    set_error(
         "CUDA kernel compilation failed, "
         "see console for details.");
     return string();
@@ -565,16 +549,19 @@ bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features)
   else
     result = CUDA_ERROR_FILE_NOT_FOUND;
 
-  if (cuda_error_(result, "cuModuleLoad"))
-    cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
+  if (result != CUDA_SUCCESS)
+    set_error(string_printf(
+        "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
 
   if (path_read_text(filter_cubin, cubin_data))
     result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
   else
     result = CUDA_ERROR_FILE_NOT_FOUND;
 
-  if (cuda_error_(result, "cuModuleLoad"))
-    cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
+  if (result != CUDA_SUCCESS)
+    set_error(string_printf("Failed to load CUDA kernel from '%s' (%s)",
+                            filter_cubin.c_str(),
+                            cuewErrorString(result)));
 
   if (result == CUDA_SUCCESS) {
     reserve_local_memory(requested_features);
@@ -870,7 +857,7 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_
 
   if (mem_alloc_result != CUDA_SUCCESS) {
     status = " failed, out of device and host memory";
-    cuda_assert(mem_alloc_result);
+    set_error("System is out of GPU and shared host memory");
   }
 
   if (mem.name) {
@@ -2458,14 +2445,10 @@ void CUDADevice::task_cancel()
 #  define cuda_assert(stmt) \
     { \
       CUresult result = stmt; \
-\
       if (result != CUDA_SUCCESS) { \
-        string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
-        if (device->error_msg == "") \
-          device->error_msg = message; \
-        fprintf(stderr, "%s\n", message.c_str()); \
-        /*cuda_abort();*/ \
-        device->cuda_error_documentation(); \
+        const char *name = cuewErrorString(result); \
+        device->set_error( \
+            string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
       } \
     } \
     (void)0
@@ -2647,14 +2630,15 @@ bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim
 SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name,
                                                                 const DeviceRequestedFeatures &)
 {
-  CUDAContextScope scope(device);
-  CUfunction func;
+  const CUDAContextScope scope(device);
 
-  cuda_assert(
-      cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
-  if (device->have_error()) {
-    device->cuda_error_message(
-        string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
+  CUfunction func;
+  const CUresult result = cuModuleGetFunction(
+      &func, device->cuModule, (string("kernel_cuda_") + kernel_name).data());
+  if (result != CUDA_SUCCESS) {
+    device->set_error(string_printf("Could not find kernel \"kernel_cuda_%s\" in module (%s)",
+                                    kernel_name.data(),
+                                    cuewErrorString(result)));
     return NULL;
   }
 
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index db04c13d083..fbf6a914744 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -70,7 +70,7 @@ struct KernelParams {
       if (res != CUDA_SUCCESS) { \
         const char *name; \
         cuGetErrorName(res, &name); \
-        set_error(string_printf("OptiX CUDA error %s in %s, line %d", name, #stmt, __LINE__)); \
+        set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
         return; \
       } \
     } \
@@ -81,7 +81,7 @@ struct KernelParams {
       if (res != CUDA_SUCCESS) { \
         const char *name; \
         cuGetErrorName(res, &name); \
-        set_error(string_printf("OptiX CUDA error %s in %s, line %d", name, #stmt, __LINE__)); \
+        set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
         return false; \
       } \
     } \
@@ -92,7 +92,7 @@ struct KernelParams {
       enum OptixResult res = stmt; \
       if (res != OPTIX_SUCCESS) { \
         const char *name = optixGetErrorName(res); \
-        set_error(string_printf("OptiX error %s in %s, line %d", name, #stmt, __LINE__)); \
+        set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
         return; \
       } \
     } \
@@ -102,7 +102,7 @@ struct KernelParams {
       enum OptixResult res = stmt; \
       if (res != OPTIX_SUCCESS) { \
         const char *name = optixGetErrorName(res); \
-        set_error(string_printf("OptiX error %s in %s, line %d", name, #stmt, __LINE__)); \
+        set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
         return false; \
       } \
     } \
@@ -322,12 +322,12 @@ class OptiXDevice : public CUDADevice {
 
     // Disable baking for now, since its kernel is not well-suited for inlining and is very slow
     if (requested_features.use_baking) {
-      set_error("OptiX implementation does not support baking yet");
+      set_error("OptiX backend does not support baking yet");
       return false;
     }
     // Disable shader raytracing support for now, since continuation callables are slow
     if (requested_features.use_shader_raytrace) {
-      set_error("OptiX implementation does not support shader raytracing yet");
+      set_error("OptiX backend does not support 'Ambient Occlusion' and 'Bevel' shader nodes yet");
       return false;
     }
 
@@ -386,14 +386,14 @@ class OptiXDevice : public CUDADevice {
       if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
         if (!getenv("OPTIX_ROOT_DIR")) {
           set_error(
-              "OPTIX_ROOT_DIR environment variable not set, must be set with the path to the "
-              "Optix SDK in order to compile the Optix kernel on demand.");
+              "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to "
+              "the Optix SDK to be able to compile Optix kernels on demand).");
           return false;
         }
         ptx_filename = compile_kernel(requested_features, "kernel_optix", "optix", true);
       }
       if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
-        set_error("Failed loading OptiX kernel " + ptx_filename + ".");
+        set_error("Failed to load OptiX kernel from '" + ptx_filename + "'");
         return false;
       }
 
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 7c50140ecfe..f5bfebbaf78 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -833,7 +833,7 @@ bool Session::load_kernels(bool lock_scene)
         message = "Failed loading render kernel, see console for errors";
 
       progress.set_error(message);
-      progress.set_status("Error", message);
+      progress.set_status(message);
       progress.set_update();
       return false;
     }
@@ -872,7 +872,7 @@ void Session::run()
 
   /* progress update */
   if (progress.get_cancel())
-    progress.set_status("Cancel", progress.get_cancel_message());
+    progress.set_status(progress.get_cancel_message());
   else
     progress.set_update();
 }
author	Patrick Mours <pmours@nvidia.com>	2020-06-12 17:42:49 +0300
committer	Patrick Mours <pmours@nvidia.com>	2020-06-12 19:24:15 +0300
commit	b586f801fc921f9f420260fb3ff4f26cb6773157 (patch)
tree	8370736aec591a2424c09aa35863a4ba3f92dae4 /intern
parent	5dca72dfc924ff931ae46b35a6342beec87f9fc4 (diff)