From cdb0b3b1dcd4e9962426422868b2f40535670a5c Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel
Date: Sun, 8 Oct 2017 04:32:25 +0200
Subject: Code refactor: use DeviceInfo to enable QBVH and decoupled volume
 shading.

---
 intern/cycles/device/device_cuda.cpp | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'intern/cycles/device/device_cuda.cpp')

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index dcbe6033bcc..56a56c5217c 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -2128,6 +2128,8 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 
 		info.advanced_shading = (major >= 2);
 		info.has_bindless_textures = (major >= 3);
+		info.has_volume_decoupled = false;
+		info.has_qbvh = false;
 
 		int pci_location[3] = {0, 0, 0};
 		cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
-- 
cgit v1.2.3


From e360d003ea45ee233c6f10c03ff57c956929b383 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel
Date: Sun, 8 Oct 2017 19:08:44 +0200
Subject: Cycles: schedule more work for non-display and compute preemption
 CUDA cards.

This change affects CUDA GPUs not connected to a display, or connected to a
display but supporting compute preemption, so that the display does not
freeze. I couldn't find an official list, but compute preemption seems to be
supported only with GTX 1070+ and Linux (not GTX 1060 and below, or Windows).

This further improves small tile rendering performance, provided a single
tile has enough samples x pixels to keep the GPU busy.
---
 intern/cycles/device/device_cuda.cpp | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

(limited to 'intern/cycles/device/device_cuda.cpp')

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 56a56c5217c..216c85f24e7 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1313,9 +1313,14 @@ public:
 		CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
 
 		/* Prepare work size. More step samples render faster, but for now we
-		 * remain conservative to avoid driver timeouts. */
+		 * remain conservative for GPUs connected to a display to avoid driver
+		 * timeouts and display freezing. */
 		int min_blocks, num_threads_per_block;
 		cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+		if(!info.display_device) {
+			min_blocks *= 8;
+		}
+
 		uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);;
 
 		/* Render all samples. */
@@ -2109,7 +2114,6 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 
 	for(int num = 0; num < count; num++) {
 		char name[256];
-		int attr;
 
 		if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS)
 			continue;
@@ -2141,14 +2145,21 @@
 		                                  (unsigned int)pci_location[1],
 		                                  (unsigned int)pci_location[2]);
 
-		/* if device has a kernel timeout, assume it is used for display */
-		if(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num) == CUDA_SUCCESS && attr == 1) {
+		/* If device has a kernel timeout and no compute preemption, we assume
+		 * it is connected to a display and will freeze the display while doing
+		 * computations. */
+		int timeout_attr = 0, preempt_attr = 0;
+		cuDeviceGetAttribute(&timeout_attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num);
+		cuDeviceGetAttribute(&preempt_attr, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, num);
+
+		if(timeout_attr && !preempt_attr) {
			info.description += " (Display)";
			info.display_device = true;
			display_devices.push_back(info);
 		}
-		else
+		else {
 			devices.push_back(info);
+		}
 	}
 
 	if(!display_devices.empty())
-- 
cgit v1.2.3
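
For illustration, the standalone C++ sketch below mirrors the step-sample
scheduling arithmetic changed by the second patch. The occupancy figures,
tile size and display_device value are invented example numbers rather than
output from a real device, and divide_up is re-declared locally so the
snippet compiles on its own.

#include <cstdio>

/* Round-up integer division, as used for the step_samples computation. */
static unsigned int divide_up(unsigned int x, unsigned int y)
{
	return (x + y - 1) / y;
}

int main()
{
	/* Hypothetical occupancy result, standing in for what
	 * cuOccupancyMaxPotentialBlockSize might report. */
	int min_blocks = 20;
	int num_threads_per_block = 256;
	bool display_device = false;  /* GPU not driving a display. */

	/* Non-display devices get 8x more blocks per launch, as in the patch. */
	if(!display_device) {
		min_blocks *= 8;
	}

	/* Example small tile of 32x32 pixels. */
	unsigned int tile_w = 32, tile_h = 32;
	unsigned int step_samples = divide_up(min_blocks * num_threads_per_block,
	                                      tile_w * tile_h);

	/* 20 * 8 * 256 = 40960 threads over 1024 pixels -> 40 samples per launch;
	 * without the 8x factor it would be 5. */
	printf("step_samples = %u\n", step_samples);
	return 0;
}

With these example numbers a non-display GPU is asked to render 40 samples of
a 32x32 tile per kernel launch instead of 5, which is the effect the commit
message describes for small tiles.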