Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/kernel/filter/filter_nlm_gpu.h')
-rw-r--r-- intern/cycles/kernel/filter/filter_nlm_gpu.h | 365
1 file changed, 197 insertions, 168 deletions
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
index 12636393243..650c743f34f 100644
--- a/intern/cycles/kernel/filter/filter_nlm_gpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -24,203 +24,232 @@ CCL_NAMESPACE_BEGIN
* Window is the rect that should be processed.
* co is filled with (x, y, dx, dy).
*/
-ccl_device_inline bool get_nlm_coords_window(int w, int h, int r, int stride,
- int4 *rect, int4 *co, int *ofs,
- int4 window)
+ccl_device_inline bool get_nlm_coords_window(
+ int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs, int4 window)
{
- /* Determine the pixel offset that this thread should apply. */
- int s = 2*r+1;
- int si = ccl_global_id(1);
- int sx = si % s;
- int sy = si / s;
- if(sy >= s) {
- return false;
- }
-
- /* Pixels still need to lie inside the denoising buffer after applying the offset,
- * so determine the area for which this is the case. */
- int dx = sx - r;
- int dy = sy - r;
-
- *rect = make_int4(max(0, -dx), max(0, -dy),
- w - max(0, dx), h - max(0, dy));
-
- /* Find the intersection of the area that we want to process (window) and the area
- * that can be processed (rect) to get the final area for this offset. */
- int4 clip_area = rect_clip(window, *rect);
-
- /* If the radius is larger than one of the sides of the window,
- * there will be shifts for which there is no usable pixel at all. */
- if(!rect_is_valid(clip_area)) {
- return false;
- }
-
- /* Map the linear thread index to pixels inside the clip area. */
- int x, y;
- if(!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) {
- return false;
- }
-
- *co = make_int4(x, y, dx, dy);
-
- *ofs = (sy*s + sx) * stride;
-
- return true;
+ /* Determine the pixel offset that this thread should apply. */
+ int s = 2 * r + 1;
+ int si = ccl_global_id(1);
+ int sx = si % s;
+ int sy = si / s;
+ if (sy >= s) {
+ return false;
+ }
+
+ /* Pixels still need to lie inside the denoising buffer after applying the offset,
+ * so determine the area for which this is the case. */
+ int dx = sx - r;
+ int dy = sy - r;
+
+ *rect = make_int4(max(0, -dx), max(0, -dy), w - max(0, dx), h - max(0, dy));
+
+ /* Find the intersection of the area that we want to process (window) and the area
+ * that can be processed (rect) to get the final area for this offset. */
+ int4 clip_area = rect_clip(window, *rect);
+
+ /* If the radius is larger than one of the sides of the window,
+ * there will be shifts for which there is no usable pixel at all. */
+ if (!rect_is_valid(clip_area)) {
+ return false;
+ }
+
+ /* Map the linear thread index to pixels inside the clip area. */
+ int x, y;
+ if (!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) {
+ return false;
+ }
+
+ *co = make_int4(x, y, dx, dy);
+
+ *ofs = (sy * s + sx) * stride;
+
+ return true;
}
-ccl_device_inline bool get_nlm_coords(int w, int h, int r, int stride,
- int4 *rect, int4 *co, int *ofs)
+ccl_device_inline bool get_nlm_coords(
+ int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs)
{
- return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h));
+ return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h));
}
-ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
- int dx, int dy,
- const ccl_global float *ccl_restrict weight_image,
- const ccl_global float *ccl_restrict variance_image,
- const ccl_global float *ccl_restrict scale_image,
- ccl_global float *difference_image,
- int4 rect, int stride,
- int channel_offset,
- int frame_offset,
- float a, float k_2)
+ccl_device_inline void kernel_filter_nlm_calc_difference(
+ int x,
+ int y,
+ int dx,
+ int dy,
+ const ccl_global float *ccl_restrict weight_image,
+ const ccl_global float *ccl_restrict variance_image,
+ const ccl_global float *ccl_restrict scale_image,
+ ccl_global float *difference_image,
+ int4 rect,
+ int stride,
+ int channel_offset,
+ int frame_offset,
+ float a,
+ float k_2)
{
- int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx) + frame_offset;
- int numChannels = channel_offset? 3 : 1;
-
- float diff = 0.0f;
- float scale_fac = 1.0f;
- if(scale_image) {
- scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f);
- }
-
- for(int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) {
- float cdiff = weight_image[idx_p] - scale_fac*weight_image[idx_q];
- float pvar = variance_image[idx_p];
- float qvar = sqr(scale_fac)*variance_image[idx_q];
- diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
- }
- if(numChannels > 1) {
- diff *= 1.0f/numChannels;
- }
- difference_image[y*stride + x] = diff;
+ int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx) + frame_offset;
+ int numChannels = channel_offset ? 3 : 1;
+
+ float diff = 0.0f;
+ float scale_fac = 1.0f;
+ if (scale_image) {
+ scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f);
+ }
+
+ for (int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) {
+ float cdiff = weight_image[idx_p] - scale_fac * weight_image[idx_q];
+ float pvar = variance_image[idx_p];
+ float qvar = sqr(scale_fac) * variance_image[idx_q];
+ diff += (cdiff * cdiff - a * (pvar + min(pvar, qvar))) / (1e-8f + k_2 * (pvar + qvar));
+ }
+ if (numChannels > 1) {
+ diff *= 1.0f / numChannels;
+ }
+ difference_image[y * stride + x] = diff;
}
-ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
- const ccl_global float *ccl_restrict difference_image,
+ccl_device_inline void kernel_filter_nlm_blur(int x,
+ int y,
+ const ccl_global float *ccl_restrict
+ difference_image,
ccl_global float *out_image,
- int4 rect, int stride, int f)
+ int4 rect,
+ int stride,
+ int f)
{
- float sum = 0.0f;
- const int low = max(rect.y, y-f);
- const int high = min(rect.w, y+f+1);
- for(int y1 = low; y1 < high; y1++) {
- sum += difference_image[y1*stride + x];
- }
- sum *= 1.0f/(high-low);
- out_image[y*stride + x] = sum;
+ float sum = 0.0f;
+ const int low = max(rect.y, y - f);
+ const int high = min(rect.w, y + f + 1);
+ for (int y1 = low; y1 < high; y1++) {
+ sum += difference_image[y1 * stride + x];
+ }
+ sum *= 1.0f / (high - low);
+ out_image[y * stride + x] = sum;
}
-ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
- const ccl_global float *ccl_restrict difference_image,
+ccl_device_inline void kernel_filter_nlm_calc_weight(int x,
+ int y,
+ const ccl_global float *ccl_restrict
+ difference_image,
ccl_global float *out_image,
- int4 rect, int stride, int f)
+ int4 rect,
+ int stride,
+ int f)
{
- float sum = 0.0f;
- const int low = max(rect.x, x-f);
- const int high = min(rect.z, x+f+1);
- for(int x1 = low; x1 < high; x1++) {
- sum += difference_image[y*stride + x1];
- }
- sum *= 1.0f/(high-low);
- out_image[y*stride + x] = fast_expf(-max(sum, 0.0f));
+ float sum = 0.0f;
+ const int low = max(rect.x, x - f);
+ const int high = min(rect.z, x + f + 1);
+ for (int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y * stride + x1];
+ }
+ sum *= 1.0f / (high - low);
+ out_image[y * stride + x] = fast_expf(-max(sum, 0.0f));
}
-ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
- int dx, int dy,
- const ccl_global float *ccl_restrict difference_image,
+ccl_device_inline void kernel_filter_nlm_update_output(int x,
+ int y,
+ int dx,
+ int dy,
+ const ccl_global float *ccl_restrict
+ difference_image,
const ccl_global float *ccl_restrict image,
ccl_global float *out_image,
ccl_global float *accum_image,
- int4 rect, int channel_offset,
- int stride, int f)
+ int4 rect,
+ int channel_offset,
+ int stride,
+ int f)
{
- float sum = 0.0f;
- const int low = max(rect.x, x-f);
- const int high = min(rect.z, x+f+1);
- for(int x1 = low; x1 < high; x1++) {
- sum += difference_image[y*stride + x1];
- }
- sum *= 1.0f/(high-low);
-
- int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
- if(out_image) {
- atomic_add_and_fetch_float(accum_image + idx_p, sum);
-
- float val = image[idx_q];
- if(channel_offset) {
- val += image[idx_q + channel_offset];
- val += image[idx_q + 2*channel_offset];
- val *= 1.0f/3.0f;
- }
- atomic_add_and_fetch_float(out_image + idx_p, sum*val);
- }
- else {
- accum_image[idx_p] = sum;
- }
+ float sum = 0.0f;
+ const int low = max(rect.x, x - f);
+ const int high = min(rect.z, x + f + 1);
+ for (int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y * stride + x1];
+ }
+ sum *= 1.0f / (high - low);
+
+ int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx);
+ if (out_image) {
+ atomic_add_and_fetch_float(accum_image + idx_p, sum);
+
+ float val = image[idx_q];
+ if (channel_offset) {
+ val += image[idx_q + channel_offset];
+ val += image[idx_q + 2 * channel_offset];
+ val *= 1.0f / 3.0f;
+ }
+ atomic_add_and_fetch_float(out_image + idx_p, sum * val);
+ }
+ else {
+ accum_image[idx_p] = sum;
+ }
}
-ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y,
- int dx, int dy, int t,
- const ccl_global float *ccl_restrict difference_image,
- const ccl_global float *ccl_restrict buffer,
- const ccl_global float *ccl_restrict transform,
- ccl_global int *rank,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int4 rect,
- int4 filter_window,
- int stride, int f,
- int pass_stride,
- int frame_offset,
- bool use_time,
- int localIdx)
+ccl_device_inline void kernel_filter_nlm_construct_gramian(
+ int x,
+ int y,
+ int dx,
+ int dy,
+ int t,
+ const ccl_global float *ccl_restrict difference_image,
+ const ccl_global float *ccl_restrict buffer,
+ const ccl_global float *ccl_restrict transform,
+ ccl_global int *rank,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 rect,
+ int4 filter_window,
+ int stride,
+ int f,
+ int pass_stride,
+ int frame_offset,
+ bool use_time,
+ int localIdx)
{
- const int low = max(rect.x, x-f);
- const int high = min(rect.z, x+f+1);
- float sum = 0.0f;
- for(int x1 = low; x1 < high; x1++) {
- sum += difference_image[y*stride + x1];
- }
- float weight = sum * (1.0f/(high - low));
-
- /* Reconstruction data is only stored for pixels inside the filter window,
- * so compute the pixels's index in there. */
- int storage_ofs = coord_to_local_index(filter_window, x, y);
- transform += storage_ofs;
- rank += storage_ofs;
- XtWX += storage_ofs;
- XtWY += storage_ofs;
-
- kernel_filter_construct_gramian(x, y,
- rect_size(filter_window),
- dx, dy, t,
- stride,
- pass_stride,
- frame_offset,
- use_time,
- buffer,
- transform, rank,
- weight, XtWX, XtWY,
- localIdx);
+ const int low = max(rect.x, x - f);
+ const int high = min(rect.z, x + f + 1);
+ float sum = 0.0f;
+ for (int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y * stride + x1];
+ }
+ float weight = sum * (1.0f / (high - low));
+
+ /* Reconstruction data is only stored for pixels inside the filter window,
+ * so compute the pixels's index in there. */
+ int storage_ofs = coord_to_local_index(filter_window, x, y);
+ transform += storage_ofs;
+ rank += storage_ofs;
+ XtWX += storage_ofs;
+ XtWY += storage_ofs;
+
+ kernel_filter_construct_gramian(x,
+ y,
+ rect_size(filter_window),
+ dx,
+ dy,
+ t,
+ stride,
+ pass_stride,
+ frame_offset,
+ use_time,
+ buffer,
+ transform,
+ rank,
+ weight,
+ XtWX,
+ XtWY,
+ localIdx);
}
-ccl_device_inline void kernel_filter_nlm_normalize(int x, int y,
+ccl_device_inline void kernel_filter_nlm_normalize(int x,
+ int y,
ccl_global float *out_image,
- const ccl_global float *ccl_restrict accum_image,
+ const ccl_global float *ccl_restrict
+ accum_image,
int stride)
{
- out_image[y*stride + x] /= accum_image[y*stride + x];
+ out_image[y * stride + x] /= accum_image[y * stride + x];
}
CCL_NAMESPACE_END