From e0ea53ae77c193ef08bb0b9215c5b3ffd84e7c11 Mon Sep 17 00:00:00 2001
From: Geraldine Chua <chua.gsk@gmail.com>
Date: Sun, 10 Jun 2018 23:15:29 +0800
Subject: Updates to volume kernel tiling function.

1. OpenCL and CUDA support (mostly untested).
2. Change name of offsets to grid_info since it needs to keep track of
other info as well.
3. Several speed and memory optimizations.
---
 .../cycles/kernel/kernels/cpu/kernel_cpu_image.h   | 108 +++++++++++----------
 .../cycles/kernel/kernels/cuda/kernel_cuda_image.h |  30 ++++++
 .../kernel/kernels/opencl/kernel_opencl_image.h    |  22 +++++
 3 files changed, 111 insertions(+), 49 deletions(-)

(limited to 'intern/cycles/kernel')
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
index 7513efc6b15..c43b94db7e0 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
@@ -75,26 +75,33 @@ template<typename T> struct TextureInterpolator  {
 		return read(data[y * width + x]);
 	}
 
-	static ccl_always_inline float4 read(const T *data, const int *offsets,
+	static ccl_always_inline float4 read(const T *data, const int *grid_info,  /* Tiled read of voxel (x, y, z) through the grid_info table. */
 	                                     int x, int y, int z,
-	                                     int width, int height, int depth,
-	                                     int tiw, int tih, int tid)
+	                                     int tiw, int tih, int ltw, int lth)  /* tiw/tih index the tile grid; ltw/lth are the alternate in-tile extents. */
 	{
-		int index = compute_index(offsets, x, y, z,
-		                          width, height, depth, tiw, tih, tid);
-		return index < 0 ? make_float4(0.0f) : read(data[index]);
+		int tix = x / TILE_SIZE, itix = x % TILE_SIZE,  /* ti*: tile coordinate, iti*: position inside that tile. */
+		    tiy = y / TILE_SIZE, itiy = y % TILE_SIZE,
+		    tiz = z / TILE_SIZE, itiz = z % TILE_SIZE;
+		int dense_index = compute_index_fast(tix, tiy, tiz, tiw, tih) * 2;  /* grid_info is read in pairs of ints per tile. */
+		int sparse_index = grid_info[dense_index];  /* First int: base added to the in-tile index when reading data. */
+		int dims = grid_info[dense_index + 1];  /* Second int: bit flags tested against ST_SHIFT_TRUNCATE_* below. */
+		if(sparse_index < 0) {
+			return make_float4(0.0f);  /* Negative base: return zero without touching data. */
+		}
+		int itiw = dims & (1 << ST_SHIFT_TRUNCATE_WIDTH) ? ltw : TILE_SIZE;  /* ltw replaces TILE_SIZE when the width flag is set. */
+		int itih = dims & (1 << ST_SHIFT_TRUNCATE_HEIGHT) ? lth : TILE_SIZE;  /* Likewise lth when the height flag is set. */
+		int in_tile_index = compute_index_fast(itix, itiy, itiz, itiw, itih);  /* Index within the tile using the extents chosen above. */
+		return read(data[sparse_index + in_tile_index]);
 	}
 
-	static ccl_always_inline float4 read(const T *data, const int *offsets,
-	                                     int idx, int width, int height, int depth)
+	static ccl_always_inline float4 read(const T *data, const int *grid_info,  /* Tiled read addressed by a flat voxel index. */
+	                                     int index, int width, int height, int /*depth*/,
+	                                     int tiw, int tih, int ltw, int lth)  /* Forwarded unchanged to the (x, y, z) overload. */
 	{
-		int3 c = compute_coordinates(idx, width, height, depth);
-		int index = compute_index(offsets, c.x, c.y, c.z,
-		                          width, height, depth,
-		                          get_tile_res(width),
-		                          get_tile_res(height),
-		                          get_tile_res(depth));
-		return index < 0 ? make_float4(0.0f) : read(data[index]);
+		int x = index % width;  /* Invert index = x + width * (y + height * z). */
+		int y = (index / width) % height;
+		int z = index / (width * height);  /* depth is not needed to recover z. */
+		return read(data, grid_info, x, y, z, tiw, tih, ltw, lth);
 	}
 
 	static ccl_always_inline int wrap_periodic(int x, int width)
@@ -304,13 +311,14 @@ template<typename T> struct TextureInterpolator  {
 		}
 
 		const T *data = (const T*)info.data;
-		const int *ofs = (const int*)info.offsets;
+		const int *grid_info = (const int*)info.grid_info;
 
-		if(ofs) {
-			return read(data, ofs, ix, iy, iz, width, height, depth,
-			            get_tile_res(width), get_tile_res(height), get_tile_res(depth));
+		if(grid_info) {
+			return read(data, grid_info, ix, iy, iz,
+			            info.tiled_width, info.tiled_height,
+			            info.last_tile_width, info.last_tile_height);
 		}
-		return read(data[compute_index(ix, iy, iz, width, height, depth)]);
+		return read(data[compute_index_fast(ix, iy, iz, width, height)]);
 	}
 
 	static ccl_always_inline float4 interp_3d_linear(const TextureInfo& info,
@@ -359,33 +367,31 @@ template<typename T> struct TextureInterpolator  {
 
 		float4 r;
 		const T *data = (const T*)info.data;
-		const int *ofs = (const int*)info.offsets;
-
-		if(ofs) {
-			int tiw = get_tile_res(width), tih = get_tile_res(height), tid = get_tile_res(depth);
-			/* Initial check if either voxel is in an active tile. */
-			if(!tile_is_active(ofs, ix, iy, iz, tiw, tih, tid) &&
-			   !tile_is_active(ofs, nix, niy, niz, tiw, tih, tid)) {
-				return make_float4(0.0f);
-			}
-			r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx) * read(data, ofs, ix,  iy,  iz,  width, height, depth, tiw, tih, tid);
-			r += (1.0f - tz)*(1.0f - ty)*tx			 * read(data, ofs, nix, iy,  iz,  width, height, depth, tiw, tih, tid);
-			r += (1.0f - tz)*ty*(1.0f - tx)			 * read(data, ofs, ix,  niy, iz,  width, height, depth, tiw, tih, tid);
-			r += (1.0f - tz)*ty*tx					 * read(data, ofs, nix, niy, iz,  width, height, depth, tiw, tih, tid);
-			r += tz*(1.0f - ty)*(1.0f - tx)			 * read(data, ofs, ix,  iy,  niz, width, height, depth, tiw, tih, tid);
-			r += tz*(1.0f - ty)*tx					 * read(data, ofs, nix, iy,  niz, width, height, depth, tiw, tih, tid);
-			r += tz*ty*(1.0f - tx)					 * read(data, ofs, ix,  niy, niz, width, height, depth, tiw, tih, tid);
-			r += tz*ty*tx							 * read(data, ofs, nix, niy, niz, width, height, depth, tiw, tih, tid);
+		const int *gi = (const int*)info.grid_info;
+
+		if(gi) {
+			int tiw = info.tiled_width;
+			int tih = info.tiled_height;
+			int ltw = info.last_tile_width;
+			int lth = info.last_tile_height;
+			r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx) * read(data, gi, ix,  iy,  iz,  tiw, tih, ltw, lth);
+			r += (1.0f - tz)*(1.0f - ty)*tx          * read(data, gi, nix, iy,  iz,  tiw, tih, ltw, lth);
+			r += (1.0f - tz)*ty*(1.0f - tx)          * read(data, gi, ix,  niy, iz,  tiw, tih, ltw, lth);
+			r += (1.0f - tz)*ty*tx                   * read(data, gi, nix, niy, iz,  tiw, tih, ltw, lth);
+			r += tz*(1.0f - ty)*(1.0f - tx)          * read(data, gi, ix,  iy,  niz, tiw, tih, ltw, lth);
+			r += tz*(1.0f - ty)*tx                   * read(data, gi, nix, iy,  niz, tiw, tih, ltw, lth);
+			r += tz*ty*(1.0f - tx)                   * read(data, gi, ix,  niy, niz, tiw, tih, ltw, lth);
+			r += tz*ty*tx                            * read(data, gi, nix, niy, niz, tiw, tih, ltw, lth);
 		}
 		else {
-			r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx) * read(data[compute_index(ix,  iy,  iz,  width, height, depth)]);
-			r += (1.0f - tz)*(1.0f - ty)*tx			 * read(data[compute_index(nix, iy,  iz,  width, height, depth)]);
-			r += (1.0f - tz)*ty*(1.0f - tx)			 * read(data[compute_index(ix,  niy, iz,  width, height, depth)]);
-			r += (1.0f - tz)*ty*tx					 * read(data[compute_index(nix, niy, iz,  width, height, depth)]);
-			r += tz*(1.0f - ty)*(1.0f - tx)			 * read(data[compute_index(ix,  iy,  niz, width, height, depth)]);
-			r += tz*(1.0f - ty)*tx					 * read(data[compute_index(nix, iy,  niz, width, height, depth)]);
-			r += tz*ty*(1.0f - tx)					 * read(data[compute_index(ix,  niy, niz, width, height, depth)]);
-			r += tz*ty*tx							 * read(data[compute_index(nix, niy, niz, width, height, depth)]);
+			r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx) * read(data[compute_index_fast(ix,  iy,  iz,  width, height)]);
+			r += (1.0f - tz)*(1.0f - ty)*tx			 * read(data[compute_index_fast(nix, iy,  iz,  width, height)]);
+			r += (1.0f - tz)*ty*(1.0f - tx)			 * read(data[compute_index_fast(ix,  niy, iz,  width, height)]);
+			r += (1.0f - tz)*ty*tx					 * read(data[compute_index_fast(nix, niy, iz,  width, height)]);
+			r += tz*(1.0f - ty)*(1.0f - tx)			 * read(data[compute_index_fast(ix,  iy,  niz, width, height)]);
+			r += tz*(1.0f - ty)*tx					 * read(data[compute_index_fast(nix, iy,  niz, width, height)]);
+			r += tz*ty*(1.0f - tx)					 * read(data[compute_index_fast(ix,  niy, niz, width, height)]);
+			r += tz*ty*tx							 * read(data[compute_index_fast(nix, niy, niz, width, height)]);
 		}
 
 		return r;
@@ -407,6 +413,10 @@ template<typename T> struct TextureInterpolator  {
 		int width = info.width;
 		int height = info.height;
 		int depth = info.depth;
+		int tiw = info.tiled_width;
+		int tih = info.tiled_height;
+		int ltw = info.last_tile_width;
+		int lth = info.last_tile_height;
 		int ix, iy, iz;
 		int nix, niy, niz;
 		/* Tricubic b-spline interpolation. */
@@ -476,9 +486,9 @@ template<typename T> struct TextureInterpolator  {
 		/* Some helper macro to keep code reasonable size,
 		 * let compiler to inline all the matrix multiplications.
 		 */
-#define DATA(x, y, z) (ofs ? \
-	    read(data, ofs, xc[x] + yc[y] + zc[z], width, height, depth) : \
-	    read(data[xc[x] + yc[y] + zc[z]]))
+#define DATA(x, y, z) (gi ? \
+		read(data, gi, xc[x] + yc[y] + zc[z], width, height, depth, tiw, tih, ltw, lth) : \
+		read(data[xc[x] + yc[y] + zc[z]]))
 #define COL_TERM(col, row) \
 		(v[col] * (u[0] * DATA(0, col, row) + \
 		           u[1] * DATA(1, col, row) + \
@@ -496,7 +506,7 @@ template<typename T> struct TextureInterpolator  {
 
 		/* Actual interpolation. */
 		const T *data = (const T*)info.data;
-		const int *ofs = (const int*)info.offsets;
+		const int *gi = (const int*)info.grid_info;
 		return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
 
 #undef COL_TERM
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
index 91ad289a858..dcd1b33722a 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
+++ b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
@@ -58,6 +58,30 @@ ccl_device float cubic_h1(float a)
 	return 1.0f + cubic_w3(a) / (cubic_w2(a) + cubic_w3(a)) + 0.5f;
 }
 
+/* Find the grid_info entry (four ints per tile) for the tile holding voxel (fx, fy, fz);
+ * returns false when the tile's first entry is negative. NOTE(review): fx/fy/fz are taken by
+ * value, so the offsets added at the end never reach the caller -- presumably meant as outputs. */
+ccl_device bool compute_sparse_coordinates(const TextureInfo *info, float fx, float fy, float fz)
+{
+	/* Truncate toward zero with a plain cast; no scratch storage for modff() is needed. */
+	int x = (int)fx, y = (int)fy, z = (int)fz;
+	int tix = x / TILE_SIZE, itix = x % TILE_SIZE,
+	    tiy = y / TILE_SIZE, itiy = y % TILE_SIZE,
+	    tiz = z / TILE_SIZE, itiz = z % TILE_SIZE;
+	int dense_index = (tix + info->tiled_width * (tiy + tiz * info->tiled_height)) * 4;
+	int tile_x = info->grid_info[dense_index];
+	if(tile_x < 0) {
+		return false;
+	}
+	int tile_y = info->grid_info[dense_index + 1];
+	int tile_z = info->grid_info[dense_index + 2];
+	int dims = info->grid_info[dense_index + 3];
+	fx += tile_x + itix + (dims & (1 << ST_SHIFT_X_LHS_PAD));
+	fy += tile_y + itiy + (dims & (1 << ST_SHIFT_Y_LHS_PAD));
+	fz += tile_z + itiz + (dims & (1 << ST_SHIFT_Z_LHS_PAD));
+	return true;
+}
+
 /* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */
 template<typename T>
 ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo& info, CUtexObject tex, float x, float y)
@@ -161,6 +185,12 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x,
 	CUtexObject tex = (CUtexObject)info.data;
 	uint interpolation = (interp == INTERPOLATION_NONE)? info.interpolation: interp;
 
+	if(info.grid_info) {
+		if(!compute_sparse_coordinates(&info, x, y, z)) {
+			return make_float4(0.0f);
+		}
+	}
+
 	const int texture_type = kernel_tex_type(id);
 	if(texture_type == IMAGE_DATA_TYPE_FLOAT4 ||
 	   texture_type == IMAGE_DATA_TYPE_BYTE4 ||
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
index faa9dd66d0e..2e0db6609f7 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
+++ b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
@@ -67,6 +67,25 @@ ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, const ccl_glo
 	}
 }
 
+/* Tiled variant: resolve voxel (x, y, z) through info->grid_info (two ints per tile). NOTE(review):
+ * same name as the offset-based reader it calls; confirm the OpenCL compiler accepts the overload. */
+ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg,
+                                                const ccl_global TextureInfo *info,
+                                                int id, int x, int y, int z)
+{
+	const int tile_x = x / TILE_SIZE, tile_y = y / TILE_SIZE, tile_z = z / TILE_SIZE;
+	const int entry = (tile_x + info->tiled_width * (tile_y + tile_z * info->tiled_height)) * 2;
+	const int tile_start = info->grid_info[entry];
+	const int flags = info->grid_info[entry + 1];
+	if(tile_start < 0) {
+		return make_float4(0.0f);  /* Negative entry: return zero instead of reading data. */
+	}
+	const int row_len = (flags & (1 << ST_SHIFT_TRUNCATE_WIDTH)) ? info->last_tile_width : TILE_SIZE;
+	const int col_len = (flags & (1 << ST_SHIFT_TRUNCATE_HEIGHT)) ? info->last_tile_height : TILE_SIZE;
+	const int local = (x % TILE_SIZE) + row_len * ((y % TILE_SIZE) + col_len * (z % TILE_SIZE));
+	return svm_image_texture_read(kg, info, id, tile_start + local);
+}
+
 ccl_device_inline float4 svm_image_texture_read_2d(KernelGlobals *kg, int id, int x, int y)
 {
 	const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
@@ -101,6 +120,9 @@ ccl_device_inline float4 svm_image_texture_read_3d(KernelGlobals *kg, int id, in
 		z = svm_image_texture_wrap_clamp(z, info->depth);
 	}
 
+	if(info->grid_info) {
+		return svm_image_texture_read(kg, info, id, x, y, z);
+	}
 	int offset = x + info->width * y + info->width * info->height * z;
 	return svm_image_texture_read(kg, info, id, offset);
 }
-- 
cgit v1.2.3