/* * Copyright 2011-2017 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ CCL_NAMESPACE_BEGIN /* Determines pixel coordinates and offset for the current thread. * Returns whether the thread should do any work. * * All coordinates are relative to the denoising buffer! * * Window is the rect that should be processed. * co is filled with (x, y, dx, dy). */ ccl_device_inline bool get_nlm_coords_window( int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs, int4 window) { /* Determine the pixel offset that this thread should apply. */ int s = 2 * r + 1; int si = ccl_global_id(1); int sx = si % s; int sy = si / s; if (sy >= s) { return false; } /* Pixels still need to lie inside the denoising buffer after applying the offset, * so determine the area for which this is the case. */ int dx = sx - r; int dy = sy - r; *rect = make_int4(max(0, -dx), max(0, -dy), w - max(0, dx), h - max(0, dy)); /* Find the intersection of the area that we want to process (window) and the area * that can be processed (rect) to get the final area for this offset. */ int4 clip_area = rect_clip(window, *rect); /* If the radius is larger than one of the sides of the window, * there will be shifts for which there is no usable pixel at all. */ if (!rect_is_valid(clip_area)) { return false; } /* Map the linear thread index to pixels inside the clip area. */ int x, y; if (!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) { return false; } *co = make_int4(x, y, dx, dy); *ofs = (sy * s + sx) * stride; return true; } ccl_device_inline bool get_nlm_coords( int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs) { return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h)); } ccl_device_inline void kernel_filter_nlm_calc_difference( int x, int y, int dx, int dy, const ccl_global float *ccl_restrict weight_image, const ccl_global float *ccl_restrict variance_image, const ccl_global float *ccl_restrict scale_image, ccl_global float *difference_image, int4 rect, int stride, int channel_offset, int frame_offset, float a, float k_2) { int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx) + frame_offset; int numChannels = channel_offset ? 3 : 1; float diff = 0.0f; float scale_fac = 1.0f; if (scale_image) { scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f); } for (int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) { float cdiff = weight_image[idx_p] - scale_fac * weight_image[idx_q]; float pvar = variance_image[idx_p]; float qvar = sqr(scale_fac) * variance_image[idx_q]; diff += (cdiff * cdiff - a * (pvar + min(pvar, qvar))) / (1e-8f + k_2 * (pvar + qvar)); } if (numChannels > 1) { diff *= 1.0f / numChannels; } difference_image[y * stride + x] = diff; } ccl_device_inline void kernel_filter_nlm_blur(int x, int y, const ccl_global float *ccl_restrict difference_image, ccl_global float *out_image, int4 rect, int stride, int f) { float sum = 0.0f; const int low = max(rect.y, y - f); const int high = min(rect.w, y + f + 1); for (int y1 = low; y1 < high; y1++) { sum += difference_image[y1 * stride + x]; } sum *= 1.0f / (high - low); out_image[y * stride + x] = sum; } ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, const ccl_global float *ccl_restrict difference_image, ccl_global float *out_image, int4 rect, int stride, int f) { float sum = 0.0f; const int low = max(rect.x, x - f); const int high = min(rect.z, x + f + 1); for (int x1 = low; x1 < high; x1++) { sum += difference_image[y * stride + x1]; } sum *= 1.0f / (high - low); out_image[y * stride + x] = fast_expf(-max(sum, 0.0f)); } ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, int dx, int dy, const ccl_global float *ccl_restrict difference_image, const ccl_global float *ccl_restrict image, ccl_global float *out_image, ccl_global float *accum_image, int4 rect, int channel_offset, int stride, int f) { float sum = 0.0f; const int low = max(rect.x, x - f); const int high = min(rect.z, x + f + 1); for (int x1 = low; x1 < high; x1++) { sum += difference_image[y * stride + x1]; } sum *= 1.0f / (high - low); int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx); if (out_image) { atomic_add_and_fetch_float(accum_image + idx_p, sum); float val = image[idx_q]; if (channel_offset) { val += image[idx_q + channel_offset]; val += image[idx_q + 2 * channel_offset]; val *= 1.0f / 3.0f; } atomic_add_and_fetch_float(out_image + idx_p, sum * val); } else { accum_image[idx_p] = sum; } } ccl_device_inline void kernel_filter_nlm_construct_gramian( int x, int y, int dx, int dy, int t, const ccl_global float *ccl_restrict difference_image, const ccl_global float *ccl_restrict buffer, const ccl_global float *ccl_restrict transform, ccl_global int *rank, ccl_global float *XtWX, ccl_global float3 *XtWY, int4 rect, int4 filter_window, int stride, int f, int pass_stride, int frame_offset, bool use_time, int localIdx) { const int low = max(rect.x, x - f); const int high = min(rect.z, x + f + 1); float sum = 0.0f; for (int x1 = low; x1 < high; x1++) { sum += difference_image[y * stride + x1]; } float weight = sum * (1.0f / (high - low)); /* Reconstruction data is only stored for pixels inside the filter window, * so compute the pixels's index in there. */ int storage_ofs = coord_to_local_index(filter_window, x, y); transform += storage_ofs; rank += storage_ofs; XtWX += storage_ofs; XtWY += storage_ofs; kernel_filter_construct_gramian(x, y, rect_size(filter_window), dx, dy, t, stride, pass_stride, frame_offset, use_time, buffer, transform, rank, weight, XtWX, XtWY, localIdx); } ccl_device_inline void kernel_filter_nlm_normalize(int x, int y, ccl_global float *out_image, const ccl_global float *ccl_restrict accum_image, int stride) { out_image[y * stride + x] /= accum_image[y * stride + x]; } CCL_NAMESPACE_END