Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/intern
diff options
context:
space:
mode:
author: Brecht Van Lommel <brechtvanlommel@gmail.com>, 2017-10-15 18:40:01 +0300
committer: Brecht Van Lommel <brechtvanlommel@gmail.com>, 2017-10-15 18:46:50 +0300
commit: 2e50add1643d1f37dd9bd412348135477f1c3504 (patch)
tree: 24a1cd6fdbd6c57a366491c47f8b1fe92595ffc4 /intern
parent: 49f4ac17bf704614de59a4db7a65c205c085d694 (diff)
Fix OpenCL performance regression after cubic interpolation.
Reorganize code to reduce register pressure.
Diffstat (limited to 'intern')
-rw-r--r-- intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h | 2
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h | 353
2 files changed, 130 insertions, 225 deletions
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
index b7be4fe4409..5ca07eaeb05 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
+++ b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
@@ -87,7 +87,7 @@ ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo& info, CUtexObjec
g1x * tex2D<T>(tex, x1, y1));
}
-/* Fast tricubic texture lookup using 8 bilinear lookups. */
+/* Fast tricubic texture lookup using 8 trilinear lookups. */
template<typename T>
ccl_device T kernel_tex_image_interp_bicubic_3d(const TextureInfo& info, CUtexObject tex, float x, float y, float z)
{
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
index d908af78c7a..faa9dd66d0e 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
+++ b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
@@ -27,9 +27,21 @@ ccl_device_inline ccl_global TextureInfo* kernel_tex_info(KernelGlobals *kg, uin
#define tex_fetch(type, info, index) ((ccl_global type*)(kg->buffers[info->cl_buffer] + info->data))[(index)]
-ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset)
+ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
+{
+ x %= width;
+ if(x < 0)
+ x += width;
+ return x;
+}
+
+ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
+{
+ return clamp(x, 0, width-1);
+}
+
+ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, const ccl_global TextureInfo *info, int id, int offset)
{
- const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
const int texture_type = kernel_tex_type(id);
/* Float4 */
@@ -55,19 +67,45 @@ ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int o
}
}
-ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
+ccl_device_inline float4 svm_image_texture_read_2d(KernelGlobals *kg, int id, int x, int y)
{
- x %= width;
- if(x < 0)
- x += width;
- return x;
+ const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
+
+ /* Wrap */
+ if(info->extension == EXTENSION_REPEAT) {
+ x = svm_image_texture_wrap_periodic(x, info->width);
+ y = svm_image_texture_wrap_periodic(y, info->height);
+ }
+ else {
+ x = svm_image_texture_wrap_clamp(x, info->width);
+ y = svm_image_texture_wrap_clamp(y, info->height);
+ }
+
+ int offset = x + info->width * y;
+ return svm_image_texture_read(kg, info, id, offset);
}
-ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
+ccl_device_inline float4 svm_image_texture_read_3d(KernelGlobals *kg, int id, int x, int y, int z)
{
- return clamp(x, 0, width-1);
+ const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
+
+ /* Wrap */
+ if(info->extension == EXTENSION_REPEAT) {
+ x = svm_image_texture_wrap_periodic(x, info->width);
+ y = svm_image_texture_wrap_periodic(y, info->height);
+ z = svm_image_texture_wrap_periodic(z, info->depth);
+ }
+ else {
+ x = svm_image_texture_wrap_clamp(x, info->width);
+ y = svm_image_texture_wrap_clamp(y, info->height);
+ z = svm_image_texture_wrap_clamp(z, info->depth);
+ }
+
+ int offset = x + info->width * y + info->width * info->height * z;
+ return svm_image_texture_read(kg, info, id, offset);
}
+
ccl_device_inline float svm_image_texture_frac(float x, int *ix)
{
int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
@@ -87,107 +125,52 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
{
const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
- uint width = info->width;
- uint height = info->height;
- uint interpolation = info->interpolation;
- uint extension = info->extension;
+ if(info->extension == EXTENSION_CLIP) {
+ if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ }
- /* Actual sampling. */
- if(interpolation == INTERPOLATION_CLOSEST) {
+ if(info->interpolation == INTERPOLATION_CLOSEST) {
+ /* Closest interpolation. */
int ix, iy;
- svm_image_texture_frac(x*width, &ix);
- svm_image_texture_frac(y*height, &iy);
-
- if(extension == EXTENSION_REPEAT) {
- ix = svm_image_texture_wrap_periodic(ix, width);
- iy = svm_image_texture_wrap_periodic(iy, height);
- }
- else {
- if(extension == EXTENSION_CLIP) {
- if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
- return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- }
- /* Fall through. */
- /* EXTENSION_EXTEND */
- ix = svm_image_texture_wrap_clamp(ix, width);
- iy = svm_image_texture_wrap_clamp(iy, height);
- }
+ svm_image_texture_frac(x*info->width, &ix);
+ svm_image_texture_frac(y*info->height, &iy);
- return svm_image_texture_read(kg, id, ix + iy*width);
+ return svm_image_texture_read_2d(kg, id, ix, iy);
+ }
+ else if(info->interpolation == INTERPOLATION_LINEAR) {
+ /* Bilinear interpolation. */
+ int ix, iy;
+ float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix);
+ float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy);
+
+ float4 r;
+ r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read_2d(kg, id, ix, iy);
+ r += (1.0f - ty)*tx*svm_image_texture_read_2d(kg, id, ix+1, iy);
+ r += ty*(1.0f - tx)*svm_image_texture_read_2d(kg, id, ix, iy+1);
+ r += ty*tx*svm_image_texture_read_2d(kg, id, ix+1, iy+1);
+ return r;
}
else {
- /* Bilinear or bicubic interpolation. */
- int ix, iy, nix, niy;
- float tx = svm_image_texture_frac(x*width - 0.5f, &ix);
- float ty = svm_image_texture_frac(y*height - 0.5f, &iy);
-
- if(extension == EXTENSION_REPEAT) {
- ix = svm_image_texture_wrap_periodic(ix, width);
- iy = svm_image_texture_wrap_periodic(iy, height);
- nix = svm_image_texture_wrap_periodic(ix+1, width);
- niy = svm_image_texture_wrap_periodic(iy+1, height);
- }
- else {
- if(extension == EXTENSION_CLIP) {
- if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
- return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- }
- ix = svm_image_texture_wrap_clamp(ix, width);
- iy = svm_image_texture_wrap_clamp(iy, height);
- nix = svm_image_texture_wrap_clamp(ix+1, width);
- niy = svm_image_texture_wrap_clamp(iy+1, height);
- }
-
- if(interpolation == INTERPOLATION_LINEAR) {
- /* Bilinear interpolation. */
- float4 r;
- r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, ix + iy*width);
- r += (1.0f - ty)*tx*svm_image_texture_read(kg, id, nix + iy*width);
- r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, ix + niy*width);
- r += ty*tx*svm_image_texture_read(kg, id, nix + niy*width);
- return r;
- }
-
/* Bicubic interpolation. */
- int pix, piy, nnix, nniy;
- if(extension == EXTENSION_REPEAT) {
- pix = svm_image_texture_wrap_periodic(ix-1, width);
- piy = svm_image_texture_wrap_periodic(iy-1, height);
- nnix = svm_image_texture_wrap_periodic(ix+2, width);
- nniy = svm_image_texture_wrap_periodic(iy+2, height);
- }
- else {
- pix = svm_image_texture_wrap_clamp(ix-1, width);
- piy = svm_image_texture_wrap_clamp(iy-1, height);
- nnix = svm_image_texture_wrap_clamp(ix+2, width);
- nniy = svm_image_texture_wrap_clamp(iy+2, height);
- }
+ int ix, iy;
+ float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix);
+ float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy);
- const int xc[4] = {pix, ix, nix, nnix};
- const int yc[4] = {width * piy,
- width * iy,
- width * niy,
- width * nniy};
float u[4], v[4];
- /* Some helper macro to keep code reasonable size,
- * let compiler to inline all the matrix multiplications.
- */
-#define DATA(x, y) (svm_image_texture_read(kg, id, xc[x] + yc[y]))
-#define TERM(col) \
- (v[col] * (u[0] * DATA(0, col) + \
- u[1] * DATA(1, col) + \
- u[2] * DATA(2, col) + \
- u[3] * DATA(3, col)))
-
SET_CUBIC_SPLINE_WEIGHTS(u, tx);
SET_CUBIC_SPLINE_WEIGHTS(v, ty);
- /* Actual interpolation. */
- return TERM(0) + TERM(1) + TERM(2) + TERM(3);
-#undef TERM
-#undef DATA
+ float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+ for(int y = 0; y < 4; y++) {
+ for(int x = 0; x < 4; x++) {
+ float weight = u[x]*v[y];
+ r += weight*svm_image_texture_read_2d(kg, id, ix+x-1, iy+y-1);
+ }
+ }
+ return r;
}
}
@@ -196,145 +179,67 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x,
{
const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
- uint width = info->width;
- uint height = info->height;
- uint depth = info->depth;
+ if(info->extension == EXTENSION_CLIP) {
+ if(x < 0.0f || y < 0.0f || z < 0.0f ||
+ x > 1.0f || y > 1.0f || z > 1.0f)
+ {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ }
+
uint interpolation = (interp == INTERPOLATION_NONE)? info->interpolation: interp;
- uint extension = info->extension;
- /* Actual sampling. */
if(interpolation == INTERPOLATION_CLOSEST) {
+ /* Closest interpolation. */
int ix, iy, iz;
- svm_image_texture_frac(x*width, &ix);
- svm_image_texture_frac(y*height, &iy);
- svm_image_texture_frac(z*depth, &iz);
-
- if(extension == EXTENSION_REPEAT) {
- ix = svm_image_texture_wrap_periodic(ix, width);
- iy = svm_image_texture_wrap_periodic(iy, height);
- iz = svm_image_texture_wrap_periodic(iz, depth);
- }
- else {
- if(extension == EXTENSION_CLIP) {
- if(x < 0.0f || y < 0.0f || z < 0.0f ||
- x > 1.0f || y > 1.0f || z > 1.0f)
- {
- return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- }
- /* Fall through. */
- /* EXTENSION_EXTEND */
- ix = svm_image_texture_wrap_clamp(ix, width);
- iy = svm_image_texture_wrap_clamp(iy, height);
- iz = svm_image_texture_wrap_clamp(iz, depth);
- }
- return svm_image_texture_read(kg, id, ix + iy*width + iz*width*height);
+ svm_image_texture_frac(x*info->width, &ix);
+ svm_image_texture_frac(y*info->height, &iy);
+ svm_image_texture_frac(z*info->depth, &iz);
+
+ return svm_image_texture_read_3d(kg, id, ix, iy, iz);
+ }
+ else if(interpolation == INTERPOLATION_LINEAR) {
+ /* Bilinear interpolation. */
+ int ix, iy, iz;
+ float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix);
+ float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy);
+ float tz = svm_image_texture_frac(z*info->depth - 0.5f, &iz);
+
+ float4 r;
+ r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy, iz);
+ r += (1.0f - tz)*(1.0f - ty)*tx*svm_image_texture_read_3d(kg, id, ix+1, iy, iz);
+ r += (1.0f - tz)*ty*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy+1, iz);
+ r += (1.0f - tz)*ty*tx*svm_image_texture_read_3d(kg, id, ix+1, iy+1, iz);
+
+ r += tz*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy, iz+1);
+ r += tz*(1.0f - ty)*tx*svm_image_texture_read_3d(kg, id, ix+1, iy, iz+1);
+ r += tz*ty*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy+1, iz+1);
+ r += tz*ty*tx*svm_image_texture_read_3d(kg, id, ix+1, iy+1, iz+1);
+ return r;
}
else {
- /* Bilinear or bicubic interpolation. */
- int ix, iy, iz, nix, niy, niz;
- float tx = svm_image_texture_frac(x*(float)width - 0.5f, &ix);
- float ty = svm_image_texture_frac(y*(float)height - 0.5f, &iy);
- float tz = svm_image_texture_frac(z*(float)depth - 0.5f, &iz);
-
- if(extension == EXTENSION_REPEAT) {
- ix = svm_image_texture_wrap_periodic(ix, width);
- iy = svm_image_texture_wrap_periodic(iy, height);
- iz = svm_image_texture_wrap_periodic(iz, depth);
-
- nix = svm_image_texture_wrap_periodic(ix+1, width);
- niy = svm_image_texture_wrap_periodic(iy+1, height);
- niz = svm_image_texture_wrap_periodic(iz+1, depth);
- }
- else {
- if(extension == EXTENSION_CLIP) {
- if(x < 0.0f || y < 0.0f || z < 0.0f ||
- x > 1.0f || y > 1.0f || z > 1.0f)
- {
- return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- }
- /* Fall through. */
- /* EXTENSION_EXTEND */
- nix = svm_image_texture_wrap_clamp(ix+1, width);
- niy = svm_image_texture_wrap_clamp(iy+1, height);
- niz = svm_image_texture_wrap_clamp(iz+1, depth);
-
- ix = svm_image_texture_wrap_clamp(ix, width);
- iy = svm_image_texture_wrap_clamp(iy, height);
- iz = svm_image_texture_wrap_clamp(iz, depth);
- }
-
- if(interpolation == INTERPOLATION_LINEAR) {
- /* Bilinear interpolation. */
- float4 r;
- r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, ix + iy*width + iz*width*height);
- r += (1.0f - tz)*(1.0f - ty)*tx*svm_image_texture_read(kg, id, nix + iy*width + iz*width*height);
- r += (1.0f - tz)*ty*(1.0f - tx)*svm_image_texture_read(kg, id, ix + niy*width + iz*width*height);
- r += (1.0f - tz)*ty*tx*svm_image_texture_read(kg, id, nix + niy*width + iz*width*height);
-
- r += tz*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, ix + iy*width + niz*width*height);
- r += tz*(1.0f - ty)*tx*svm_image_texture_read(kg, id, nix + iy*width + niz*width*height);
- r += tz*ty*(1.0f - tx)*svm_image_texture_read(kg, id, ix + niy*width + niz*width*height);
- r += tz*ty*tx*svm_image_texture_read(kg, id, nix + niy*width + niz*width*height);
- return r;
- }
-
/* Bicubic interpolation. */
- int pix, piy, piz, nnix, nniy, nniz;
- if(extension == EXTENSION_REPEAT) {
- pix = svm_image_texture_wrap_periodic(ix-1, width);
- piy = svm_image_texture_wrap_periodic(iy-1, height);
- piz = svm_image_texture_wrap_periodic(iz-1, depth);
- nnix = svm_image_texture_wrap_periodic(ix+2, width);
- nniy = svm_image_texture_wrap_periodic(iy+2, height);
- nniz = svm_image_texture_wrap_periodic(iz+2, depth);
- }
- else {
- pix = svm_image_texture_wrap_clamp(ix-1, width);
- piy = svm_image_texture_wrap_clamp(iy-1, height);
- piz = svm_image_texture_wrap_clamp(iz-1, depth);
- nnix = svm_image_texture_wrap_clamp(ix+2, width);
- nniy = svm_image_texture_wrap_clamp(iy+2, height);
- nniz = svm_image_texture_wrap_clamp(iz+2, depth);
- }
+ int ix, iy, iz;
+ float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix);
+ float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy);
+ float tz = svm_image_texture_frac(z*info->depth - 0.5f, &iz);
- const int xc[4] = {pix, ix, nix, nnix};
- const int yc[4] = {width * piy,
- width * iy,
- width * niy,
- width * nniy};
- const int zc[4] = {width * height * piz,
- width * height * iz,
- width * height * niz,
- width * height * nniz};
float u[4], v[4], w[4];
-
- /* Some helper macro to keep code reasonable size,
- * let compiler to inline all the matrix multiplications.
- */
-#define DATA(x, y, z) (svm_image_texture_read(kg, id, xc[x] + yc[y] + zc[z]))
-#define COL_TERM(col, row) \
- (v[col] * (u[0] * DATA(0, col, row) + \
- u[1] * DATA(1, col, row) + \
- u[2] * DATA(2, col, row) + \
- u[3] * DATA(3, col, row)))
-#define ROW_TERM(row) \
- (w[row] * (COL_TERM(0, row) + \
- COL_TERM(1, row) + \
- COL_TERM(2, row) + \
- COL_TERM(3, row)))
-
SET_CUBIC_SPLINE_WEIGHTS(u, tx);
SET_CUBIC_SPLINE_WEIGHTS(v, ty);
SET_CUBIC_SPLINE_WEIGHTS(w, tz);
- /* Actual interpolation. */
- return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+ float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-#undef COL_TERM
-#undef ROW_TERM
-#undef DATA
+ for(int z = 0; z < 4; z++) {
+ for(int y = 0; y < 4; y++) {
+ for(int x = 0; x < 4; x++) {
+ float weight = u[x]*v[y]*w[z];
+ r += weight*svm_image_texture_read_3d(kg, id, ix+x-1, iy+y-1, iz+z-1);
+ }
+ }
+ }
+ return r;
}
}