diff options
Diffstat (limited to 'intern/cycles/kernel/filter/filter_nlm_gpu.h')
-rw-r--r-- | intern/cycles/kernel/filter/filter_nlm_gpu.h | 365 |
1 files changed, 197 insertions, 168 deletions
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h index 12636393243..650c743f34f 100644 --- a/intern/cycles/kernel/filter/filter_nlm_gpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h @@ -24,203 +24,232 @@ CCL_NAMESPACE_BEGIN * Window is the rect that should be processed. * co is filled with (x, y, dx, dy). */ -ccl_device_inline bool get_nlm_coords_window(int w, int h, int r, int stride, - int4 *rect, int4 *co, int *ofs, - int4 window) +ccl_device_inline bool get_nlm_coords_window( + int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs, int4 window) { - /* Determine the pixel offset that this thread should apply. */ - int s = 2*r+1; - int si = ccl_global_id(1); - int sx = si % s; - int sy = si / s; - if(sy >= s) { - return false; - } - - /* Pixels still need to lie inside the denoising buffer after applying the offset, - * so determine the area for which this is the case. */ - int dx = sx - r; - int dy = sy - r; - - *rect = make_int4(max(0, -dx), max(0, -dy), - w - max(0, dx), h - max(0, dy)); - - /* Find the intersection of the area that we want to process (window) and the area - * that can be processed (rect) to get the final area for this offset. */ - int4 clip_area = rect_clip(window, *rect); - - /* If the radius is larger than one of the sides of the window, - * there will be shifts for which there is no usable pixel at all. */ - if(!rect_is_valid(clip_area)) { - return false; - } - - /* Map the linear thread index to pixels inside the clip area. */ - int x, y; - if(!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) { - return false; - } - - *co = make_int4(x, y, dx, dy); - - *ofs = (sy*s + sx) * stride; - - return true; + /* Determine the pixel offset that this thread should apply. */ + int s = 2 * r + 1; + int si = ccl_global_id(1); + int sx = si % s; + int sy = si / s; + if (sy >= s) { + return false; + } + + /* Pixels still need to lie inside the denoising buffer after applying the offset, + * so determine the area for which this is the case. */ + int dx = sx - r; + int dy = sy - r; + + *rect = make_int4(max(0, -dx), max(0, -dy), w - max(0, dx), h - max(0, dy)); + + /* Find the intersection of the area that we want to process (window) and the area + * that can be processed (rect) to get the final area for this offset. */ + int4 clip_area = rect_clip(window, *rect); + + /* If the radius is larger than one of the sides of the window, + * there will be shifts for which there is no usable pixel at all. */ + if (!rect_is_valid(clip_area)) { + return false; + } + + /* Map the linear thread index to pixels inside the clip area. */ + int x, y; + if (!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) { + return false; + } + + *co = make_int4(x, y, dx, dy); + + *ofs = (sy * s + sx) * stride; + + return true; } -ccl_device_inline bool get_nlm_coords(int w, int h, int r, int stride, - int4 *rect, int4 *co, int *ofs) +ccl_device_inline bool get_nlm_coords( + int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs) { - return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h)); + return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h)); } -ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, - int dx, int dy, - const ccl_global float *ccl_restrict weight_image, - const ccl_global float *ccl_restrict variance_image, - const ccl_global float *ccl_restrict scale_image, - ccl_global float *difference_image, - int4 rect, int stride, - int channel_offset, - int frame_offset, - float a, float k_2) +ccl_device_inline void kernel_filter_nlm_calc_difference( + int x, + int y, + int dx, + int dy, + const ccl_global float *ccl_restrict weight_image, + const ccl_global float *ccl_restrict variance_image, + const ccl_global float *ccl_restrict scale_image, + ccl_global float *difference_image, + int4 rect, + int stride, + int channel_offset, + int frame_offset, + float a, + float k_2) { - int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx) + frame_offset; - int numChannels = channel_offset? 3 : 1; - - float diff = 0.0f; - float scale_fac = 1.0f; - if(scale_image) { - scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f); - } - - for(int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) { - float cdiff = weight_image[idx_p] - scale_fac*weight_image[idx_q]; - float pvar = variance_image[idx_p]; - float qvar = sqr(scale_fac)*variance_image[idx_q]; - diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); - } - if(numChannels > 1) { - diff *= 1.0f/numChannels; - } - difference_image[y*stride + x] = diff; + int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx) + frame_offset; + int numChannels = channel_offset ? 3 : 1; + + float diff = 0.0f; + float scale_fac = 1.0f; + if (scale_image) { + scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f); + } + + for (int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) { + float cdiff = weight_image[idx_p] - scale_fac * weight_image[idx_q]; + float pvar = variance_image[idx_p]; + float qvar = sqr(scale_fac) * variance_image[idx_q]; + diff += (cdiff * cdiff - a * (pvar + min(pvar, qvar))) / (1e-8f + k_2 * (pvar + qvar)); + } + if (numChannels > 1) { + diff *= 1.0f / numChannels; + } + difference_image[y * stride + x] = diff; } -ccl_device_inline void kernel_filter_nlm_blur(int x, int y, - const ccl_global float *ccl_restrict difference_image, +ccl_device_inline void kernel_filter_nlm_blur(int x, + int y, + const ccl_global float *ccl_restrict + difference_image, ccl_global float *out_image, - int4 rect, int stride, int f) + int4 rect, + int stride, + int f) { - float sum = 0.0f; - const int low = max(rect.y, y-f); - const int high = min(rect.w, y+f+1); - for(int y1 = low; y1 < high; y1++) { - sum += difference_image[y1*stride + x]; - } - sum *= 1.0f/(high-low); - out_image[y*stride + x] = sum; + float sum = 0.0f; + const int low = max(rect.y, y - f); + const int high = min(rect.w, y + f + 1); + for (int y1 = low; y1 < high; y1++) { + sum += difference_image[y1 * stride + x]; + } + sum *= 1.0f / (high - low); + out_image[y * stride + x] = sum; } -ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, - const ccl_global float *ccl_restrict difference_image, +ccl_device_inline void kernel_filter_nlm_calc_weight(int x, + int y, + const ccl_global float *ccl_restrict + difference_image, ccl_global float *out_image, - int4 rect, int stride, int f) + int4 rect, + int stride, + int f) { - float sum = 0.0f; - const int low = max(rect.x, x-f); - const int high = min(rect.z, x+f+1); - for(int x1 = low; x1 < high; x1++) { - sum += difference_image[y*stride + x1]; - } - sum *= 1.0f/(high-low); - out_image[y*stride + x] = fast_expf(-max(sum, 0.0f)); + float sum = 0.0f; + const int low = max(rect.x, x - f); + const int high = min(rect.z, x + f + 1); + for (int x1 = low; x1 < high; x1++) { + sum += difference_image[y * stride + x1]; + } + sum *= 1.0f / (high - low); + out_image[y * stride + x] = fast_expf(-max(sum, 0.0f)); } -ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, - int dx, int dy, - const ccl_global float *ccl_restrict difference_image, +ccl_device_inline void kernel_filter_nlm_update_output(int x, + int y, + int dx, + int dy, + const ccl_global float *ccl_restrict + difference_image, const ccl_global float *ccl_restrict image, ccl_global float *out_image, ccl_global float *accum_image, - int4 rect, int channel_offset, - int stride, int f) + int4 rect, + int channel_offset, + int stride, + int f) { - float sum = 0.0f; - const int low = max(rect.x, x-f); - const int high = min(rect.z, x+f+1); - for(int x1 = low; x1 < high; x1++) { - sum += difference_image[y*stride + x1]; - } - sum *= 1.0f/(high-low); - - int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx); - if(out_image) { - atomic_add_and_fetch_float(accum_image + idx_p, sum); - - float val = image[idx_q]; - if(channel_offset) { - val += image[idx_q + channel_offset]; - val += image[idx_q + 2*channel_offset]; - val *= 1.0f/3.0f; - } - atomic_add_and_fetch_float(out_image + idx_p, sum*val); - } - else { - accum_image[idx_p] = sum; - } + float sum = 0.0f; + const int low = max(rect.x, x - f); + const int high = min(rect.z, x + f + 1); + for (int x1 = low; x1 < high; x1++) { + sum += difference_image[y * stride + x1]; + } + sum *= 1.0f / (high - low); + + int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx); + if (out_image) { + atomic_add_and_fetch_float(accum_image + idx_p, sum); + + float val = image[idx_q]; + if (channel_offset) { + val += image[idx_q + channel_offset]; + val += image[idx_q + 2 * channel_offset]; + val *= 1.0f / 3.0f; + } + atomic_add_and_fetch_float(out_image + idx_p, sum * val); + } + else { + accum_image[idx_p] = sum; + } } -ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y, - int dx, int dy, int t, - const ccl_global float *ccl_restrict difference_image, - const ccl_global float *ccl_restrict buffer, - const ccl_global float *ccl_restrict transform, - ccl_global int *rank, - ccl_global float *XtWX, - ccl_global float3 *XtWY, - int4 rect, - int4 filter_window, - int stride, int f, - int pass_stride, - int frame_offset, - bool use_time, - int localIdx) +ccl_device_inline void kernel_filter_nlm_construct_gramian( + int x, + int y, + int dx, + int dy, + int t, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict buffer, + const ccl_global float *ccl_restrict transform, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 rect, + int4 filter_window, + int stride, + int f, + int pass_stride, + int frame_offset, + bool use_time, + int localIdx) { - const int low = max(rect.x, x-f); - const int high = min(rect.z, x+f+1); - float sum = 0.0f; - for(int x1 = low; x1 < high; x1++) { - sum += difference_image[y*stride + x1]; - } - float weight = sum * (1.0f/(high - low)); - - /* Reconstruction data is only stored for pixels inside the filter window, - * so compute the pixels's index in there. */ - int storage_ofs = coord_to_local_index(filter_window, x, y); - transform += storage_ofs; - rank += storage_ofs; - XtWX += storage_ofs; - XtWY += storage_ofs; - - kernel_filter_construct_gramian(x, y, - rect_size(filter_window), - dx, dy, t, - stride, - pass_stride, - frame_offset, - use_time, - buffer, - transform, rank, - weight, XtWX, XtWY, - localIdx); + const int low = max(rect.x, x - f); + const int high = min(rect.z, x + f + 1); + float sum = 0.0f; + for (int x1 = low; x1 < high; x1++) { + sum += difference_image[y * stride + x1]; + } + float weight = sum * (1.0f / (high - low)); + + /* Reconstruction data is only stored for pixels inside the filter window, + * so compute the pixels's index in there. */ + int storage_ofs = coord_to_local_index(filter_window, x, y); + transform += storage_ofs; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + + kernel_filter_construct_gramian(x, + y, + rect_size(filter_window), + dx, + dy, + t, + stride, + pass_stride, + frame_offset, + use_time, + buffer, + transform, + rank, + weight, + XtWX, + XtWY, + localIdx); } -ccl_device_inline void kernel_filter_nlm_normalize(int x, int y, +ccl_device_inline void kernel_filter_nlm_normalize(int x, + int y, ccl_global float *out_image, - const ccl_global float *ccl_restrict accum_image, + const ccl_global float *ccl_restrict + accum_image, int stride) { - out_image[y*stride + x] /= accum_image[y*stride + x]; + out_image[y * stride + x] /= accum_image[y * stride + x]; } CCL_NAMESPACE_END |