diff options
author | Campbell Barton <ideasman42@gmail.com> | 2017-05-20 07:19:05 +0300 |
---|---|---|
committer | Campbell Barton <ideasman42@gmail.com> | 2017-05-20 07:19:05 +0300 |
commit | 65aab6cdae822822122f5e180baf10e39f600bf2 (patch) | |
tree | fc8556ec5590a0e47c41e2c18131fede4b67732b /intern | |
parent | 996bf65730257d0a80c6ada03a38d0a321b1e87e (diff) | |
parent | 81e584ed17902878579131776b4e5a9f7b54cdab (diff) |
Merge branch 'master' into blender2.8
Diffstat (limited to 'intern')
22 files changed, 352 insertions, 247 deletions
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h index 41998c792b6..53d703de143 100644 --- a/intern/cycles/kernel/filter/filter_features.h +++ b/intern/cycles/kernel/filter/filter_features.h @@ -28,7 +28,11 @@ pixel_buffer += buffer_w - (high.x - low.x); \ } -ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *features, float ccl_restrict_ptr mean, int pass_stride) +ccl_device_inline void filter_get_features(int2 pixel, + const ccl_global float *ccl_restrict buffer, + float *features, + const float *ccl_restrict mean, + int pass_stride) { features[0] = pixel.x; features[1] = pixel.y; @@ -46,7 +50,11 @@ ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_rest } } -ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *scales, float ccl_restrict_ptr mean, int pass_stride) +ccl_device_inline void filter_get_feature_scales(int2 pixel, + const ccl_global float *ccl_restrict buffer, + float *scales, + const float *ccl_restrict mean, + int pass_stride) { scales[0] = fabsf(pixel.x - mean[0]); scales[1] = fabsf(pixel.y - mean[1]); @@ -70,19 +78,21 @@ ccl_device_inline void filter_calculate_scale(float *scale) scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f); } -ccl_device_inline float3 filter_get_pixel_color(ccl_global float ccl_restrict_ptr buffer, int pass_stride) +ccl_device_inline float3 filter_get_pixel_color(const ccl_global float *ccl_restrict buffer, + int pass_stride) { return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)); } -ccl_device_inline float filter_get_pixel_variance(ccl_global float ccl_restrict_ptr buffer, int pass_stride) +ccl_device_inline float filter_get_pixel_variance(const ccl_global float *ccl_restrict buffer, + int pass_stride) { return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2))); } ccl_device_inline void design_row_add(float *design_row, int rank, - ccl_global float ccl_restrict_ptr transform, + const ccl_global float *ccl_restrict transform, int stride, int row, float feature) @@ -94,13 +104,13 @@ ccl_device_inline void design_row_add(float *design_row, /* Fill the design row. */ ccl_device_inline void filter_get_design_row_transform(int2 p_pixel, - ccl_global float ccl_restrict_ptr p_buffer, + const ccl_global float *ccl_restrict p_buffer, int2 q_pixel, - ccl_global float ccl_restrict_ptr q_buffer, + const ccl_global float *ccl_restrict q_buffer, int pass_stride, int rank, float *design_row, - ccl_global float ccl_restrict_ptr transform, + const ccl_global float *ccl_restrict transform, int stride) { design_row[0] = 1.0f; diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h index a242a8ed0a1..3185330994c 100644 --- a/intern/cycles/kernel/filter/filter_features_sse.h +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -33,7 +33,12 @@ CCL_NAMESPACE_BEGIN pixel_buffer += buffer_w - (pixel.x - low.x); \ } -ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *features, __m128 ccl_restrict_ptr mean, int pass_stride) +ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, + __m128 active_pixels, + const float *ccl_restrict buffer, + __m128 *features, + const __m128 *ccl_restrict mean, + int pass_stride) { features[0] = x; features[1] = y; @@ -53,7 +58,12 @@ ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active features[i] = _mm_mask_ps(features[i], active_pixels); } -ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *scales, __m128 ccl_restrict_ptr mean, int pass_stride) +ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, + __m128 active_pixels, + const float *ccl_restrict buffer, + __m128 *scales, + const __m128 *ccl_restrict mean, + int pass_stride) { scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels); scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels); diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h index 1a314b100be..5cb4038bc33 100644 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -16,27 +16,39 @@ CCL_NAMESPACE_BEGIN -ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, float ccl_restrict_ptr weightImage, float ccl_restrict_ptr varianceImage, float *differenceImage, int4 rect, int w, int channel_offset, float a, float k_2) +ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, + const float *ccl_restrict weight_image, + const float *ccl_restrict variance_image, + float *difference_image, + int4 rect, + int w, + int channel_offset, + float a, + float k_2) { for(int y = rect.y; y < rect.w; y++) { for(int x = rect.x; x < rect.z; x++) { float diff = 0.0f; int numChannels = channel_offset? 3 : 1; for(int c = 0; c < numChannels; c++) { - float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)]; - float pvar = varianceImage[c*channel_offset + y*w+x]; - float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)]; + float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)]; + float pvar = variance_image[c*channel_offset + y*w+x]; + float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)]; diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); } if(numChannels > 1) { diff *= 1.0f/numChannels; } - differenceImage[y*w+x] = diff; + difference_image[y*w+x] = diff; } } } -ccl_device_inline void kernel_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) +ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image, + float *out_image, + int4 rect, + int w, + int f) { #ifdef __KERNEL_SSE3__ int aligned_lowx = (rect.x & ~(3)); @@ -46,30 +58,34 @@ ccl_device_inline void kernel_filter_nlm_blur(float ccl_restrict_ptr differenceI const int low = max(rect.y, y-f); const int high = min(rect.w, y+f+1); for(int x = rect.x; x < rect.z; x++) { - outImage[y*w+x] = 0.0f; + out_image[y*w+x] = 0.0f; } for(int y1 = low; y1 < high; y1++) { #ifdef __KERNEL_SSE3__ for(int x = aligned_lowx; x < aligned_highx; x+=4) { - _mm_store_ps(outImage + y*w+x, _mm_add_ps(_mm_load_ps(outImage + y*w+x), _mm_load_ps(differenceImage + y1*w+x))); + _mm_store_ps(out_image + y*w+x, _mm_add_ps(_mm_load_ps(out_image + y*w+x), _mm_load_ps(difference_image + y1*w+x))); } #else for(int x = rect.x; x < rect.z; x++) { - outImage[y*w+x] += differenceImage[y1*w+x]; + out_image[y*w+x] += difference_image[y1*w+x]; } #endif } for(int x = rect.x; x < rect.z; x++) { - outImage[y*w+x] *= 1.0f/(high - low); + out_image[y*w+x] *= 1.0f/(high - low); } } } -ccl_device_inline void kernel_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) +ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image, + float *out_image, + int4 rect, + int w, + int f) { for(int y = rect.y; y < rect.w; y++) { for(int x = rect.x; x < rect.z; x++) { - outImage[y*w+x] = 0.0f; + out_image[y*w+x] = 0.0f; } } for(int dx = -f; dx <= f; dx++) { @@ -77,7 +93,7 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(float ccl_restrict_ptr diff int neg_dx = min(0, dx); for(int y = rect.y; y < rect.w; y++) { for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) { - outImage[y*w+x] += differenceImage[y*w+dx+x]; + out_image[y*w+x] += difference_image[y*w+dx+x]; } } } @@ -85,12 +101,19 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(float ccl_restrict_ptr diff for(int x = rect.x; x < rect.z; x++) { const int low = max(rect.x, x-f); const int high = min(rect.z, x+f+1); - outImage[y*w+x] = expf(-max(outImage[y*w+x] * (1.0f/(high - low)), 0.0f)); + out_image[y*w+x] = expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f)); } } } -ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float ccl_restrict_ptr differenceImage, float ccl_restrict_ptr image, float *outImage, float *accumImage, int4 rect, int w, int f) +ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, + const float *ccl_restrict difference_image, + const float *ccl_restrict image, + float *out_image, + float *accum_image, + int4 rect, + int w, + int f) { for(int y = rect.y; y < rect.w; y++) { for(int x = rect.x; x < rect.z; x++) { @@ -98,18 +121,18 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float ccl const int high = min(rect.z, x+f+1); float sum = 0.0f; for(int x1 = low; x1 < high; x1++) { - sum += differenceImage[y*w+x1]; + sum += difference_image[y*w+x1]; } float weight = sum * (1.0f/(high - low)); - accumImage[y*w+x] += weight; - outImage[y*w+x] += weight*image[(y+dy)*w+(x+dx)]; + accum_image[y*w+x] += weight; + out_image[y*w+x] += weight*image[(y+dy)*w+(x+dx)]; } } } ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, - float ccl_restrict_ptr differenceImage, - float ccl_restrict_ptr buffer, + const float *ccl_restrict difference_image, + const float *ccl_restrict buffer, float *color_pass, float *variance_pass, float *transform, @@ -130,7 +153,7 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, const int high = min(rect.z, x+f+1); float sum = 0.0f; for(int x1 = low; x1 < high; x1++) { - sum += differenceImage[y*w+x1]; + sum += difference_image[y*w+x1]; } float weight = sum * (1.0f/(high - low)); @@ -151,11 +174,14 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, } } -ccl_device_inline void kernel_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w) +ccl_device_inline void kernel_filter_nlm_normalize(float *out_image, + const float *ccl_restrict accum_image, + int4 rect, + int w) { for(int y = rect.y; y < rect.w; y++) { for(int x = rect.x; x < rect.z; x++) { - outImage[y*w+x] /= accumImage[y*w+x]; + out_image[y*w+x] /= accum_image[y*w+x]; } } } diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h index b5ba7cf51a5..078c5f56763 100644 --- a/intern/cycles/kernel/filter/filter_nlm_gpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h @@ -18,9 +18,9 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, int dx, int dy, - ccl_global float ccl_restrict_ptr weightImage, - ccl_global float ccl_restrict_ptr varianceImage, - ccl_global float *differenceImage, + const ccl_global float *ccl_restrict weight_image, + const ccl_global float *ccl_restrict variance_image, + ccl_global float *difference_image, int4 rect, int w, int channel_offset, float a, float k_2) @@ -28,78 +28,78 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, float diff = 0.0f; int numChannels = channel_offset? 3 : 1; for(int c = 0; c < numChannels; c++) { - float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)]; - float pvar = varianceImage[c*channel_offset + y*w+x]; - float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)]; + float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)]; + float pvar = variance_image[c*channel_offset + y*w+x]; + float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)]; diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); } if(numChannels > 1) { diff *= 1.0f/numChannels; } - differenceImage[y*w+x] = diff; + difference_image[y*w+x] = diff; } ccl_device_inline void kernel_filter_nlm_blur(int x, int y, - ccl_global float ccl_restrict_ptr differenceImage, - ccl_global float *outImage, + const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, int4 rect, int w, int f) { float sum = 0.0f; const int low = max(rect.y, y-f); const int high = min(rect.w, y+f+1); for(int y1 = low; y1 < high; y1++) { - sum += differenceImage[y1*w+x]; + sum += difference_image[y1*w+x]; } sum *= 1.0f/(high-low); - outImage[y*w+x] = sum; + out_image[y*w+x] = sum; } ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, - ccl_global float ccl_restrict_ptr differenceImage, - ccl_global float *outImage, + const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, int4 rect, int w, int f) { float sum = 0.0f; const int low = max(rect.x, x-f); const int high = min(rect.z, x+f+1); for(int x1 = low; x1 < high; x1++) { - sum += differenceImage[y*w+x1]; + sum += difference_image[y*w+x1]; } sum *= 1.0f/(high-low); - outImage[y*w+x] = expf(-max(sum, 0.0f)); + out_image[y*w+x] = expf(-max(sum, 0.0f)); } ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, int dx, int dy, - ccl_global float ccl_restrict_ptr differenceImage, - ccl_global float ccl_restrict_ptr image, - ccl_global float *outImage, - ccl_global float *accumImage, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict image, + ccl_global float *out_image, + ccl_global float *accum_image, int4 rect, int w, int f) { float sum = 0.0f; const int low = max(rect.x, x-f); const int high = min(rect.z, x+f+1); for(int x1 = low; x1 < high; x1++) { - sum += differenceImage[y*w+x1]; + sum += difference_image[y*w+x1]; } sum *= 1.0f/(high-low); - if(outImage) { - accumImage[y*w+x] += sum; - outImage[y*w+x] += sum*image[(y+dy)*w+(x+dx)]; + if(out_image) { + accum_image[y*w+x] += sum; + out_image[y*w+x] += sum*image[(y+dy)*w+(x+dx)]; } else { - accumImage[y*w+x] = sum; + accum_image[y*w+x] = sum; } } ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, int dx, int dy, - ccl_global float ccl_restrict_ptr differenceImage, - ccl_global float ccl_restrict_ptr buffer, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict buffer, ccl_global float *color_pass, ccl_global float *variance_pass, - ccl_global float ccl_restrict_ptr transform, + const ccl_global float *ccl_restrict transform, ccl_global int *rank, ccl_global float *XtWX, ccl_global float3 *XtWY, @@ -115,7 +115,7 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, const int high = min(rect.z, x+f+1); float sum = 0.0f; for(int x1 = low; x1 < high; x1++) { - sum += differenceImage[y*w+x1]; + sum += difference_image[y*w+x1]; } float weight = sum * (1.0f/(high - low)); @@ -137,11 +137,11 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, } ccl_device_inline void kernel_filter_nlm_normalize(int x, int y, - ccl_global float *outImage, - ccl_global float ccl_restrict_ptr accumImage, + ccl_global float *out_image, + const ccl_global float *ccl_restrict accum_image, int4 rect, int w) { - outImage[y*w+x] /= accumImage[y*w+x]; + out_image[y*w+x] /= accum_image[y*w+x]; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h index 252bcc5e675..82cc36625ec 100644 --- a/intern/cycles/kernel/filter/filter_prefilter.h +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -44,7 +44,7 @@ ccl_device void kernel_filter_divide_shadow(int sample, int offset = tiles->offsets[tile]; int stride = tiles->strides[tile]; - ccl_global float ccl_restrict_ptr center_buffer = (ccl_global float*) tiles->buffers[tile]; + const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) tiles->buffers[tile]; center_buffer += (y*stride + x + offset)*buffer_pass_stride; center_buffer += buffer_denoising_offset + 14; diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h index 6a7c86e4012..dc90f318570 100644 --- a/intern/cycles/kernel/filter/filter_reconstruction.h +++ b/intern/cycles/kernel/filter/filter_reconstruction.h @@ -21,10 +21,10 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y, int dx, int dy, int w, int h, int pass_stride, - ccl_global float ccl_restrict_ptr buffer, + const ccl_global float *ccl_restrict buffer, ccl_global float *color_pass, ccl_global float *variance_pass, - ccl_global float ccl_restrict_ptr transform, + const ccl_global float *ccl_restrict transform, ccl_global int *rank, float weight, ccl_global float *XtWX, @@ -85,9 +85,17 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h, const int stride = storage_stride; #endif + /* The weighted average of pixel colors (essentially, the NLM-filtered image). + * In case the solution of the linear model fails due to numerical issues, + * fall back to this value. */ + float3 mean_color = XtWY[0]/XtWX[0]; + math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride); float3 final_color = XtWY[0]; + if(!isfinite3_safe(final_color)) { + final_color = mean_color; + } ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z; final_color *= sample; diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h index 139dc402d21..a5f87c05ec0 100644 --- a/intern/cycles/kernel/filter/filter_transform.h +++ b/intern/cycles/kernel/filter/filter_transform.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, +ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, int x, int y, int4 rect, int pass_stride, float *transform, int *rank, @@ -29,20 +29,15 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, /* Temporary storage, used in different steps of the algorithm. */ float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES]; float tempvector[2*DENOISE_FEATURES]; - float ccl_restrict_ptr pixel_buffer; + const float *ccl_restrict pixel_buffer; int2 pixel; - - - /* === Calculate denoising window. === */ int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - - - + int num_pixels = (high.y - low.y) * (high.x - low.x); /* === Shift feature passes to have mean 0. === */ float feature_means[DENOISE_FEATURES]; @@ -52,8 +47,7 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, math_vector_add(feature_means, features, DENOISE_FEATURES); } END_FOR_PIXEL_WINDOW - float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x)); - math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES); + math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES); /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ float *feature_scale = tempvector; @@ -66,7 +60,6 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, filter_calculate_scale(feature_scale); - /* === Generate the feature transformation. === * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ @@ -80,6 +73,8 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); *rank = 0; + /* Prevent overfitting when a small window is used. */ + int max_rank = min(DENOISE_FEATURES, num_pixels/3); if(pca_threshold < 0.0f) { float threshold_energy = 0.0f; for(int i = 0; i < DENOISE_FEATURES; i++) { @@ -88,7 +83,7 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, threshold_energy *= 1.0f - (-pca_threshold); float reduced_energy = 0.0f; - for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) { + for(int i = 0; i < max_rank; i++, (*rank)++) { if(i >= 2 && reduced_energy >= threshold_energy) break; float s = feature_matrix[i*DENOISE_FEATURES+i]; @@ -96,7 +91,7 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, } } else { - for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) { + for(int i = 0; i < max_rank; i++, (*rank)++) { float s = feature_matrix[i*DENOISE_FEATURES+i]; if(i >= 2 && sqrtf(s) < pca_threshold) break; diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h index 68304e14143..83a1222bbdb 100644 --- a/intern/cycles/kernel/filter/filter_transform_gpu.h +++ b/intern/cycles/kernel/filter/filter_transform_gpu.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer, +ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer, int x, int y, int4 rect, int pass_stride, ccl_global float *transform, @@ -38,7 +38,8 @@ ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_ max(rect.y, y - radius)); int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - ccl_global float ccl_restrict_ptr pixel_buffer; + int num_pixels = (high.y - low.y) * (high.x - low.x); + const ccl_global float *ccl_restrict pixel_buffer; int2 pixel; @@ -52,8 +53,7 @@ ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_ math_vector_add(feature_means, features, DENOISE_FEATURES); } END_FOR_PIXEL_WINDOW - float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x)); - math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES); + math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES); /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ float feature_scale[DENOISE_FEATURES]; @@ -81,6 +81,8 @@ ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride); *rank = 0; + /* Prevent overfitting when a small window is used. */ + int max_rank = min(DENOISE_FEATURES, num_pixels/3); if(pca_threshold < 0.0f) { float threshold_energy = 0.0f; for(int i = 0; i < DENOISE_FEATURES; i++) { @@ -89,7 +91,7 @@ ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_ threshold_energy *= 1.0f - (-pca_threshold); float reduced_energy = 0.0f; - for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) { + for(int i = 0; i < max_rank; i++, (*rank)++) { if(i >= 2 && reduced_energy >= threshold_energy) break; float s = feature_matrix[i*DENOISE_FEATURES+i]; @@ -97,7 +99,7 @@ ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_ } } else { - for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) { + for(int i = 0; i < max_rank; i++, (*rank)++) { float s = feature_matrix[i*DENOISE_FEATURES+i]; if(i >= 2 && sqrtf(s) < pca_threshold) break; diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h index ed3a92f6241..30dc2969b11 100644 --- a/intern/cycles/kernel/filter/filter_transform_sse.h +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, +ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, int x, int y, int4 rect, int pass_stride, float *transform, int *rank, @@ -25,13 +25,14 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, int buffer_w = align_up(rect.z - rect.x, 4); __m128 features[DENOISE_FEATURES]; - float ccl_restrict_ptr pixel_buffer; + const float *ccl_restrict pixel_buffer; int2 pixel; int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); + int num_pixels = (high.y - low.y) * (high.x - low.x); __m128 feature_means[DENOISE_FEATURES]; math_vector_zero_sse(feature_means, DENOISE_FEATURES); @@ -40,7 +41,7 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, math_vector_add_sse(feature_means, DENOISE_FEATURES, features); } END_FOR_PIXEL_WINDOW_SSE - __m128 pixel_scale = _mm_set1_ps(1.0f / ((high.y - low.y) * (high.x - low.x))); + __m128 pixel_scale = _mm_set1_ps(1.0f / num_pixels); for(int i = 0; i < DENOISE_FEATURES; i++) { feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale); } @@ -68,6 +69,8 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); *rank = 0; + /* Prevent overfitting when a small window is used. */ + int max_rank = min(DENOISE_FEATURES, num_pixels/3); if(pca_threshold < 0.0f) { float threshold_energy = 0.0f; for(int i = 0; i < DENOISE_FEATURES; i++) { @@ -76,7 +79,7 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, threshold_energy *= 1.0f - (-pca_threshold); float reduced_energy = 0.0f; - for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) { + for(int i = 0; i < max_rank; i++, (*rank)++) { if(i >= 2 && reduced_energy >= threshold_energy) break; float s = feature_matrix[i*DENOISE_FEATURES+i]; @@ -84,7 +87,7 @@ ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, } } else { - for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) { + for(int i = 0; i < max_rank; i++, (*rank)++) { float s = feature_matrix[i*DENOISE_FEATURES+i]; if(i >= 2 && sqrtf(s) < pca_threshold) break; diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index 7595e74e2d5..21da180bb8e 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -42,8 +42,6 @@ #include "util/util_types.h" #include "util/util_texture.h" -#define ccl_restrict_ptr const * __restrict - #define ccl_addr_space #define ccl_local_id(d) 0 diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index 80d7401fbcf..988126f90e1 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -55,7 +55,6 @@ #define ccl_restrict __restrict__ #define ccl_align(n) __align__(n) -#define ccl_restrict_ptr const * __restrict__ #define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH) diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index 15cf4b81b21..c2263ac0d49 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -50,8 +50,6 @@ # define ccl_addr_space #endif -#define ccl_restrict_ptr const * __restrict__ - #define ccl_local_id(d) get_local_id(d) #define ccl_global_id(d) get_global_id(d) diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h index 9708b4b5b58..ffd34c293fc 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -72,40 +72,40 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, int dy, - float *weightImage, + float *weight_image, float *variance, - float *differenceImage, + float *difference_image, int* rect, int w, int channel_offset, float a, float k_2); -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage, - float *outImage, +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image, + float *out_image, int* rect, int w, int f); -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage, - float *outImage, +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image, + float *out_image, int* rect, int w, int f); void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, int dy, - float *differenceImage, + float *difference_image, float *image, - float *outImage, - float *accumImage, + float *out_image, + float *accum_image, int* rect, int w, int f); void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int dy, - float *differenceImage, + float *difference_image, float *buffer, float *color_pass, float *variance_pass, @@ -120,8 +120,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int f, int pass_stride); -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage, - float *accumImage, +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, + float *accum_image, int* rect, int w); diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h index 15325abdccd..261176846b1 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -150,9 +150,9 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, int dy, - float *weightImage, + float *weight_image, float *variance, - float *differenceImage, + float *difference_image, int *rect, int w, int channel_offset, @@ -162,12 +162,12 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference); #else - kernel_filter_nlm_calc_difference(dx, dy, weightImage, variance, differenceImage, load_int4(rect), w, channel_offset, a, k_2); + kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), w, channel_offset, a, k_2); #endif } -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage, - float *outImage, +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image, + float *out_image, int *rect, int w, int f) @@ -175,12 +175,12 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage, #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur); #else - kernel_filter_nlm_blur(differenceImage, outImage, load_int4(rect), w, f); + kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), w, f); #endif } -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage, - float *outImage, +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image, + float *out_image, int *rect, int w, int f) @@ -188,16 +188,16 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage, #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight); #else - kernel_filter_nlm_calc_weight(differenceImage, outImage, load_int4(rect), w, f); + kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), w, f); #endif } void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, int dy, - float *differenceImage, + float *difference_image, float *image, - float *outImage, - float *accumImage, + float *out_image, + float *accum_image, int *rect, int w, int f) @@ -205,13 +205,13 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output); #else - kernel_filter_nlm_update_output(dx, dy, differenceImage, image, outImage, accumImage, load_int4(rect), w, f); + kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), w, f); #endif } void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int dy, - float *differenceImage, + float *difference_image, float *buffer, float *color_pass, float *variance_pass, @@ -229,19 +229,19 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian); #else - kernel_filter_nlm_construct_gramian(dx, dy, differenceImage, buffer, color_pass, variance_pass, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride); + kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, color_pass, variance_pass, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride); #endif } -void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage, - float *accumImage, +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, + float *accum_image, int *rect, int w) { #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize); #else - kernel_filter_nlm_normalize(outImage, accumImage, load_int4(rect), w); + kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), w); #endif } diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu index f812a6601c6..2edbff08087 100644 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -139,69 +139,74 @@ kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_nlm_calc_difference(int dx, int dy, - float ccl_restrict_ptr weightImage, - float ccl_restrict_ptr varianceImage, - float *differenceImage, + const float *ccl_restrict weight_image, + const float *ccl_restrict variance_image, + float *difference_image, int4 rect, int w, int channel_offset, - float a, float k_2) { + float a, float k_2) +{ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; if(x < rect.z && y < rect.w) { - kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2); + kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2); } } extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) { +kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f) +{ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; if(x < rect.z && y < rect.w) { - kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f); + kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f); } } extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) { +kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f) +{ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; if(x < rect.z && y < rect.w) { - kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f); + kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f); } } extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_nlm_update_output(int dx, int dy, - float ccl_restrict_ptr differenceImage, - float ccl_restrict_ptr image, - float *outImage, float *accumImage, + const float *ccl_restrict difference_image, + const float *ccl_restrict image, + float *out_image, float *accum_image, int4 rect, int w, - int f) { + int f) +{ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; if(x < rect.z && y < rect.w) { - kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f); + kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f); } } extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w) { +kernel_cuda_filter_nlm_normalize(float *out_image, const float *ccl_restrict accum_image, int4 rect, int w) +{ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; if(x < rect.z && y < rect.w) { - kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w); + kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w); } } extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_nlm_construct_gramian(int dx, int dy, - float ccl_restrict_ptr differenceImage, - float ccl_restrict_ptr buffer, + const float *ccl_restrict difference_image, + const float *ccl_restrict buffer, float *color_pass, float *variance_pass, float const* __restrict__ transform, @@ -211,13 +216,14 @@ kernel_cuda_filter_nlm_construct_gramian(int dx, int dy, int4 rect, int4 filter_rect, int w, int h, int f, - int pass_stride) { + int pass_stride) +{ int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x); int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y); if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) { kernel_filter_nlm_construct_gramian(x, y, dx, dy, - differenceImage, + difference_image, buffer, color_pass, variance_pass, transform, rank, @@ -235,7 +241,8 @@ kernel_cuda_filter_finalize(int w, int h, float *buffer, int *rank, float *XtWX, float3 *XtWY, int4 filter_area, int4 buffer_params, - int sample) { + int sample) +{ int x = blockDim.x*blockIdx.x + threadIdx.x; int y = blockDim.y*blockIdx.y + threadIdx.y; if(x < filter_area.z && y < filter_area.w) { diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl index fbc3daa62b9..0462ca6f9bc 100644 --- a/intern/cycles/kernel/kernels/opencl/filter.cl +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -106,7 +106,7 @@ __kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean, } } -__kernel void kernel_ocl_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer, +__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer, ccl_global float *transform, ccl_global int *rank, int4 filter_area, @@ -132,79 +132,84 @@ __kernel void kernel_ocl_filter_construct_transform(ccl_global float ccl_restric __kernel void kernel_ocl_filter_nlm_calc_difference(int dx, int dy, - ccl_global float ccl_restrict_ptr weightImage, - ccl_global float ccl_restrict_ptr varianceImage, - ccl_global float *differenceImage, + const ccl_global float *ccl_restrict weight_image, + const ccl_global float *ccl_restrict variance_image, + ccl_global float *difference_image, int4 rect, int w, int channel_offset, float a, - float k_2) { + float k_2) +{ int x = get_global_id(0) + rect.x; int y = get_global_id(1) + rect.y; if(x < rect.z && y < rect.w) { - kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2); + kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2); } } -__kernel void kernel_ocl_filter_nlm_blur(ccl_global float ccl_restrict_ptr differenceImage, - ccl_global float *outImage, +__kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, int4 rect, int w, - int f) { + int f) +{ int x = get_global_id(0) + rect.x; int y = get_global_id(1) + rect.y; if(x < rect.z && y < rect.w) { - kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f); + kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f); } } -__kernel void kernel_ocl_filter_nlm_calc_weight(ccl_global float ccl_restrict_ptr differenceImage, - ccl_global float *outImage, +__kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, int4 rect, int w, - int f) { + int f) +{ int x = get_global_id(0) + rect.x; int y = get_global_id(1) + rect.y; if(x < rect.z && y < rect.w) { - kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f); + kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f); } } __kernel void kernel_ocl_filter_nlm_update_output(int dx, int dy, - ccl_global float ccl_restrict_ptr differenceImage, - ccl_global float ccl_restrict_ptr image, - ccl_global float *outImage, - ccl_global float *accumImage, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict image, + ccl_global float *out_image, + ccl_global float *accum_image, int4 rect, int w, - int f) { + int f) +{ int x = get_global_id(0) + rect.x; int y = get_global_id(1) + rect.y; if(x < rect.z && y < rect.w) { - kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f); + kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f); } } -__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *outImage, - ccl_global float ccl_restrict_ptr accumImage, +__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image, + const ccl_global float *ccl_restrict accum_image, int4 rect, - int w) { + int w) +{ int x = get_global_id(0) + rect.x; int y = get_global_id(1) + rect.y; if(x < rect.z && y < rect.w) { - kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w); + kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w); } } __kernel void kernel_ocl_filter_nlm_construct_gramian(int dx, int dy, - ccl_global float ccl_restrict_ptr differenceImage, - ccl_global float ccl_restrict_ptr buffer, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict buffer, ccl_global float *color_pass, ccl_global float *variance_pass, - ccl_global float ccl_restrict_ptr transform, + const ccl_global float *ccl_restrict transform, ccl_global int *rank, ccl_global float *XtWX, ccl_global float3 *XtWY, @@ -213,13 +218,14 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(int dx, int w, int h, int f, - int pass_stride) { + int pass_stride) +{ int x = get_global_id(0) + max(0, rect.x-filter_rect.x); int y = get_global_id(1) + max(0, rect.y-filter_rect.y); if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) { kernel_filter_nlm_construct_gramian(x, y, dx, dy, - differenceImage, + difference_image, buffer, color_pass, variance_pass, transform, rank, @@ -239,7 +245,8 @@ __kernel void kernel_ocl_filter_finalize(int w, ccl_global float3 *XtWY, int4 filter_area, int4 buffer_params, - int sample) { + int sample) +{ int x = get_global_id(0); int y = get_global_id(1); if(x < filter_area.z && y < filter_area.w) { diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h index 1f6dce0253c..4c1fdd2d69c 100644 --- a/intern/cycles/kernel/split/kernel_buffer_update.h +++ b/intern/cycles/kernel/split/kernel_buffer_update.h @@ -111,7 +111,6 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg, buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { - kernel_write_light_passes(kg, buffer, L, sample); #ifdef __KERNEL_DEBUG__ kernel_write_debug_passes(kg, buffer, state, debug_data, sample); #endif diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index 3f080407b1f..08909943c49 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -726,16 +726,20 @@ DeviceRequestedFeatures Session::get_requested_device_features() return requested_features; } -void Session::load_kernels() +void Session::load_kernels(bool lock_scene) { - thread_scoped_lock scene_lock(scene->mutex); + thread_scoped_lock scene_lock; + if(lock_scene) { + scene_lock = thread_scoped_lock(scene->mutex); + } - if(!kernels_loaded) { + DeviceRequestedFeatures requested_features = get_requested_device_features(); + + if(!kernels_loaded || loaded_kernel_features.modified(requested_features)) { progress.set_status("Loading render kernels (may take a few minutes the first time)"); scoped_timer timer; - DeviceRequestedFeatures requested_features = get_requested_device_features(); VLOG(2) << "Requested features:\n" << requested_features; if(!device->load_kernels(requested_features)) { string message = device->error_message(); @@ -752,6 +756,7 @@ void Session::load_kernels() VLOG(1) << "Total time spent loading kernels: " << time_dt() - timer.get_start(); kernels_loaded = true; + loaded_kernel_features = requested_features; } } @@ -902,6 +907,8 @@ void Session::update_scene() /* update scene */ if(scene->need_update()) { + load_kernels(false); + progress.set_status("Updating Scene"); MEM_GUARDED_CALL(&progress, scene->device_update, device, progress); } diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h index a7ca90abbce..5fb1a365ee9 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -158,7 +158,7 @@ public: void set_pause(bool pause); void update_scene(); - void load_kernels(); + void load_kernels(bool lock_scene=true); void device_free(); @@ -215,6 +215,7 @@ protected: thread_mutex display_mutex; bool kernels_loaded; + DeviceRequestedFeatures loaded_kernel_features; double reset_time; diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h index a754be413fe..5327d9f7cc6 100644 --- a/intern/cycles/util/util_math_float3.h +++ b/intern/cycles/util/util_math_float3.h @@ -367,6 +367,11 @@ ccl_device_inline bool isequal_float3(const float3 a, const float3 b) #endif } +ccl_device_inline bool isfinite3_safe(float3 v) +{ + return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z); +} + ccl_device_inline float3 ensure_finite3(float3 v) { if(!isfinite_safe(v.x)) v.x = 0.0; diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h index 2172e94a14f..c7511f8306e 100644 --- a/intern/cycles/util/util_math_matrix.h +++ b/intern/cycles/util/util_math_matrix.h @@ -23,73 +23,83 @@ CCL_NAMESPACE_BEGIN /* Variants that use a constant stride on GPUS. */ #ifdef __KERNEL_GPU__ -#define MATS(A, n, r, c, s) A[((r)*(n)+(c))*(s)] +# define MATS(A, n, r, c, s) A[((r)*(n)+(c))*(s)] /* Element access when only the lower-triangular elements are stored. */ -#define MATHS(A, r, c, s) A[((r)*((r)+1)/2+(c))*(s)] -#define VECS(V, i, s) V[(i)*(s)] +# define MATHS(A, r, c, s) A[((r)*((r)+1)/2+(c))*(s)] +# define VECS(V, i, s) V[(i)*(s)] #else -#define MATS(A, n, r, c, s) MAT(A, n, r, c) -#define MATHS(A, r, c, s) A[(r)*((r)+1)/2+(c)] -#define VECS(V, i, s) V[i] +# define MATS(A, n, r, c, s) MAT(A, n, r, c) +# define MATHS(A, r, c, s) A[(r)*((r)+1)/2+(c)] +# define VECS(V, i, s) V[i] #endif /* Zeroing helpers. */ ccl_device_inline void math_vector_zero(float *v, int n) { - for(int i = 0; i < n; i++) + for(int i = 0; i < n; i++) { v[i] = 0.0f; + } } ccl_device_inline void math_matrix_zero(float *A, int n) { - for(int row = 0; row < n; row++) - for(int col = 0; col <= row; col++) + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { MAT(A, n, row, col) = 0.0f; + } + } } /* Elementary vector operations. */ -ccl_device_inline void math_vector_add(float *a, float ccl_restrict_ptr b, int n) +ccl_device_inline void math_vector_add(float *a, const float *ccl_restrict b, int n) { - for(int i = 0; i < n; i++) + for(int i = 0; i < n; i++) { a[i] += b[i]; + } } -ccl_device_inline void math_vector_mul(float *a, float ccl_restrict_ptr b, int n) +ccl_device_inline void math_vector_mul(float *a, const float *ccl_restrict b, int n) { - for(int i = 0; i < n; i++) + for(int i = 0; i < n; i++) { a[i] *= b[i]; + } } -ccl_device_inline void math_vector_mul_strided(ccl_global float *a, float ccl_restrict_ptr b, int astride, int n) +ccl_device_inline void math_vector_mul_strided(ccl_global float *a, const float *ccl_restrict b, int astride, int n) { - for(int i = 0; i < n; i++) + for(int i = 0; i < n; i++) { a[i*astride] *= b[i]; + } } ccl_device_inline void math_vector_scale(float *a, float b, int n) { - for(int i = 0; i < n; i++) + for(int i = 0; i < n; i++) { a[i] *= b; + } } -ccl_device_inline void math_vector_max(float *a, float ccl_restrict_ptr b, int n) +ccl_device_inline void math_vector_max(float *a, const float *ccl_restrict b, int n) { - for(int i = 0; i < n; i++) + for(int i = 0; i < n; i++) { a[i] = max(a[i], b[i]); + } } ccl_device_inline void math_vec3_add(float3 *v, int n, float *x, float3 w) { - for(int i = 0; i < n; i++) + for(int i = 0; i < n; i++) { v[i] += w*x[i]; + } } ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float *x, float3 w, int stride) { - for(int i = 0; i < n; i++) + for(int i = 0; i < n; i++) { v[i*stride] += w*x[i]; + } } /* Elementary matrix operations. @@ -97,33 +107,38 @@ ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float ccl_device_inline void math_trimatrix_add_diagonal(ccl_global float *A, int n, float val, int stride) { - for(int row = 0; row < n; row++) + for(int row = 0; row < n; row++) { MATHS(A, row, row, stride) += val; + } } /* Add Gramian matrix of v to A. * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */ ccl_device_inline void math_matrix_add_gramian(float *A, int n, - float ccl_restrict_ptr v, + const float *ccl_restrict v, float weight) { - for(int row = 0; row < n; row++) - for(int col = 0; col <= row; col++) + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { MAT(A, n, row, col) += v[row]*v[col]*weight; + } + } } /* Add Gramian matrix of v to A. * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */ ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A, int n, - float ccl_restrict_ptr v, + const float *ccl_restrict v, float weight, int stride) { - for(int row = 0; row < n; row++) - for(int col = 0; col <= row; col++) + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { MATHS(A, row, col, stride) += v[row]*v[col]*weight; + } + } } /* Transpose matrix A inplace. */ @@ -138,9 +153,6 @@ ccl_device_inline void math_matrix_transpose(ccl_global float *A, int n, int str } } - - - /* Solvers for matrix problems */ /* In-place Cholesky-Banachiewicz decomposition of the square, positive-definite matrix A @@ -199,10 +211,6 @@ ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A, ccl_global } } - - - - /* Perform the Jacobi Eigenvalue Methon on matrix A. * A is assumed to be a symmetrical matrix, therefore only the lower-triangular part is ever accessed. * The algorithm overwrites the contents of A. @@ -215,15 +223,19 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float { const float singular_epsilon = 1e-9f; - for (int row = 0; row < n; row++) - for (int col = 0; col < n; col++) + for (int row = 0; row < n; row++) { + for (int col = 0; col < n; col++) { MATS(V, n, row, col, v_stride) = (col == row) ? 1.0f : 0.0f; + } + } for (int sweep = 0; sweep < 8; sweep++) { float off_diagonal = 0.0f; - for (int row = 1; row < n; row++) - for (int col = 0; col < row; col++) + for (int row = 1; row < n; row++) { + for (int col = 0; col < row; col++) { off_diagonal += fabsf(MAT(A, n, row, col)); + } + } if (off_diagonal < 1e-7f) { /* The matrix has nearly reached diagonal form. * Since the eigenvalues are only used to determine truncation, their exact values aren't required - a relative error of a few ULPs won't matter at all. */ @@ -327,51 +339,61 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float } #ifdef __KERNEL_SSE3__ - ccl_device_inline void math_vector_zero_sse(__m128 *A, int n) { - for(int i = 0; i < n; i++) + for(int i = 0; i < n; i++) { A[i] = _mm_setzero_ps(); + } } + ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n) { - for(int row = 0; row < n; row++) - for(int col = 0; col <= row; col++) + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { MAT(A, n, row, col) = _mm_setzero_ps(); + } + } } /* Add Gramian matrix of v to A. * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */ -ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, __m128 ccl_restrict_ptr v, __m128 weight) +ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, const __m128 *ccl_restrict v, __m128 weight) { - for(int row = 0; row < n; row++) - for(int col = 0; col <= row; col++) + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight)); + } + } } -ccl_device_inline void math_vector_add_sse(__m128 *V, int n, __m128 ccl_restrict_ptr a) +ccl_device_inline void math_vector_add_sse(__m128 *V, int n, const __m128 *ccl_restrict a) { - for(int i = 0; i < n; i++) + for(int i = 0; i < n; i++) { V[i] = _mm_add_ps(V[i], a[i]); + } } -ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, __m128 ccl_restrict_ptr a) +ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, const __m128 *ccl_restrict a) { - for(int i = 0; i < n; i++) + for(int i = 0; i < n; i++) { V[i] = _mm_mul_ps(V[i], a[i]); + } } -ccl_device_inline void math_vector_max_sse(__m128 *a, __m128 ccl_restrict_ptr b, int n) +ccl_device_inline void math_vector_max_sse(__m128 *a, const __m128 *ccl_restrict b, int n) { - for(int i = 0; i < n; i++) + for(int i = 0; i < n; i++) { a[i] = _mm_max_ps(a[i], b[i]); + } } -ccl_device_inline void math_matrix_hsum(float *A, int n, __m128 ccl_restrict_ptr B) +ccl_device_inline void math_matrix_hsum(float *A, int n, const __m128 *ccl_restrict B) { - for(int row = 0; row < n; row++) - for(int col = 0; col <= row; col++) + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col)); + } + } } #endif diff --git a/intern/string/STR_HashedString.h b/intern/string/STR_HashedString.h index 8bfbde65895..ce790f398a0 100644 --- a/intern/string/STR_HashedString.h +++ b/intern/string/STR_HashedString.h @@ -38,6 +38,14 @@ #include "STR_String.h" +/* copied from 'BLI_compiler_attrs.h' */ +/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */ +#if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */ +#define ATTR_FALLTHROUGH __attribute__((fallthrough)) +#else +#define ATTR_FALLTHROUGH ((void)0) +#endif + // Hash Mix utility function, by Bob Jenkins - Mix 3 32-bit values reversibly // @@ -102,16 +110,16 @@ static dword STR_gHash(const void *in, int len, dword init_val) // Handle the last 11 bytes c += len; switch (length) { - case 11: c += ((dword)p_in[10] << 24); - case 10: c += ((dword)p_in[9] << 16); - case 9: c += ((dword)p_in[8] << 8); /* the first byte of c is reserved for the length */ - case 8: b += ((dword)p_in[7] << 24); - case 7: b += ((dword)p_in[6] << 16); - case 6: b += ((dword)p_in[5] << 8); - case 5: b += p_in[4]; - case 4: a += ((dword)p_in[3] << 24); - case 3: a += ((dword)p_in[2] << 16); - case 2: a += ((dword)p_in[1] << 8); + case 11: c += ((dword)p_in[10] << 24); ATTR_FALLTHROUGH; + case 10: c += ((dword)p_in[9] << 16); ATTR_FALLTHROUGH; + case 9: c += ((dword)p_in[8] << 8); ATTR_FALLTHROUGH; /* the first byte of c is reserved for the length */ + case 8: b += ((dword)p_in[7] << 24); ATTR_FALLTHROUGH; + case 7: b += ((dword)p_in[6] << 16); ATTR_FALLTHROUGH; + case 6: b += ((dword)p_in[5] << 8); ATTR_FALLTHROUGH; + case 5: b += p_in[4]; ATTR_FALLTHROUGH; + case 4: a += ((dword)p_in[3] << 24); ATTR_FALLTHROUGH; + case 3: a += ((dword)p_in[2] << 16); ATTR_FALLTHROUGH; + case 2: a += ((dword)p_in[1] << 8); ATTR_FALLTHROUGH; case 1: a += p_in[0]; } STR_gHashMix(a, b, c); |