/* * function: kernel_3d_denoise * 3D Noise Reduction * gain: The parameter determines the filtering strength for the reference block * threshold: Noise variances of observed image * restoredPrev: The previous restored image, image2d_t as read only * output: restored image, image2d_t as write only * input: observed image, image2d_t as read only * inputPrev1: reference image, image2d_t as read only * inputPrev2: reference image, image2d_t as read only */ #ifndef REFERENCE_FRAME_COUNT #define REFERENCE_FRAME_COUNT 2 #endif #ifndef ENABLE_IIR_FILERING #define ENABLE_IIR_FILERING 1 #endif #define ENABLE_GRADIENT 1 #ifndef WORKGROUP_WIDTH #define WORKGROUP_WIDTH 2 #endif #ifndef WORKGROUP_HEIGHT #define WORKGROUP_HEIGHT 32 #endif #define REF_BLOCK_X_OFFSET 1 #define REF_BLOCK_Y_OFFSET 4 #define REF_BLOCK_WIDTH (WORKGROUP_WIDTH + 2 * REF_BLOCK_X_OFFSET) #define REF_BLOCK_HEIGHT (WORKGROUP_HEIGHT + 2 * REF_BLOCK_Y_OFFSET) inline int2 subgroup_pos(const int sg_id, const int sg_lid) { int2 pos; pos.x = mad24(2, sg_id % 2, sg_lid % 2); pos.y = mad24(4, sg_id / 2, sg_lid / 2); return pos; } inline void average_slice(float8 ref, float8 observe, float8* restore, float2* sum_weight, float gain, float threshold, uint sg_id, uint sg_lid) { float8 grad = 0.0f; float8 gradient = 0.0f; float8 dist = 0.0f; float8 distance = 0.0f; float weight = 0.0f; #if ENABLE_GRADIENT // calculate & cumulate gradient if (sg_lid % 2 == 0) { grad = intel_sub_group_shuffle(ref, 4); } else { grad = intel_sub_group_shuffle(ref, 5); } gradient = (float8)(grad.s1, grad.s1, grad.s1, grad.s1, grad.s5, grad.s5, grad.s5, grad.s5); // normalize gradient "1/(4*255.0f) = 0.00098039f" grad = fabs(gradient - ref) * 0.00098039f; //grad = mad(-2, gradient, (ref + grad)) * 0.0004902f; grad.s0 = (grad.s0 + grad.s1 + grad.s2 + grad.s3); grad.s4 = (grad.s4 + grad.s5 + grad.s6 + grad.s7); #endif // calculate & normalize distance "1/255.0f = 0.00392157f" dist = (observe - ref) * 0.00392157f; dist = dist * dist; float8 dist_shuffle[8]; dist_shuffle[0] = (intel_sub_group_shuffle(dist, 0)); dist_shuffle[1] = (intel_sub_group_shuffle(dist, 1)); dist_shuffle[2] = (intel_sub_group_shuffle(dist, 2)); dist_shuffle[3] = (intel_sub_group_shuffle(dist, 3)); dist_shuffle[4] = (intel_sub_group_shuffle(dist, 4)); dist_shuffle[5] = (intel_sub_group_shuffle(dist, 5)); dist_shuffle[6] = (intel_sub_group_shuffle(dist, 6)); dist_shuffle[7] = (intel_sub_group_shuffle(dist, 7)); if (sg_lid % 2 == 0) { distance = dist_shuffle[0]; distance += dist_shuffle[2]; distance += dist_shuffle[4]; distance += dist_shuffle[6]; } else { distance = dist_shuffle[1]; distance += dist_shuffle[3]; distance += dist_shuffle[5]; distance += dist_shuffle[7]; } // cumulate distance dist.s0 = (distance.s0 + distance.s1 + distance.s2 + distance.s3); dist.s4 = (distance.s4 + distance.s5 + distance.s6 + distance.s7); gain = (grad.s0 < threshold) ? gain : 2.0f * gain; weight = native_exp(-gain * dist.s0); (*restore).lo = mad(weight, ref.lo, (*restore).lo); (*sum_weight).lo = (*sum_weight).lo + weight; gain = (grad.s4 < threshold) ? gain : 2.0f * gain; weight = native_exp(-gain * dist.s4); (*restore).hi = mad(weight, ref.hi, (*restore).hi); (*sum_weight).hi = (*sum_weight).hi + weight; } inline void weighted_average (__read_only image2d_t input, __local uchar8* ref_cache, bool load_observe, float8* observe, float8* restore, float2* sum_weight, float gain, float threshold, uint sg_id, uint sg_lid) { sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; int local_id_x = get_local_id(0); int local_id_y = get_local_id(1); const int group_id_x = get_group_id(0); const int group_id_y = get_group_id(1); int start_x = mad24(group_id_x, WORKGROUP_WIDTH, -REF_BLOCK_X_OFFSET); int start_y = mad24(group_id_y, WORKGROUP_HEIGHT, -REF_BLOCK_Y_OFFSET); int i = local_id_x + local_id_y * WORKGROUP_WIDTH; for ( int j = i; j < (REF_BLOCK_HEIGHT * REF_BLOCK_WIDTH); j += (WORKGROUP_HEIGHT * WORKGROUP_WIDTH) ) { int corrd_x = start_x + (j % REF_BLOCK_WIDTH); int corrd_y = start_y + (j / REF_BLOCK_WIDTH); ref_cache[j] = as_uchar8( convert_ushort4(read_imageui(input, sampler, (int2)(corrd_x, corrd_y)))); } barrier(CLK_LOCAL_MEM_FENCE); #if WORKGROUP_WIDTH == 4 int2 pos = subgroup_pos(sg_id, sg_lid); local_id_x = pos.x; local_id_y = pos.y; #endif if (load_observe) { (*observe) = convert_float8( ref_cache[mad24(local_id_y + REF_BLOCK_Y_OFFSET, REF_BLOCK_WIDTH, local_id_x + REF_BLOCK_X_OFFSET)]); (*restore) = (*observe); (*sum_weight) = 1.0f; } float8 ref[2] = {0.0f, 0.0f}; __local uchar4* p_ref = (__local uchar4*)(ref_cache); // top-left ref[0] = convert_float8(*(__local uchar8*)(p_ref + mad24(local_id_y, 2 * REF_BLOCK_WIDTH, mad24(2, local_id_x, 1)))); average_slice(ref[0], *observe, restore, sum_weight, gain, threshold, sg_id, sg_lid); // top-right ref[1] = convert_float8(*(__local uchar8*)(p_ref + mad24(local_id_y, 2 * REF_BLOCK_WIDTH, mad24(2, local_id_x, 3)))); average_slice(ref[1], *observe, restore, sum_weight, gain, threshold, sg_id, sg_lid); // top-mid average_slice((float8)(ref[0].hi, ref[1].lo), *observe, restore, sum_weight, gain, threshold, sg_id, sg_lid); // mid-left ref[0] = convert_float8(*(__local uchar8*)(p_ref + mad24((local_id_y + 4), 2 * REF_BLOCK_WIDTH, mad24(2, local_id_x, 1)))); average_slice(ref[0], *observe, restore, sum_weight, gain, threshold, sg_id, sg_lid); // mid-right ref[1] = convert_float8(*(__local uchar8*)(p_ref + mad24((local_id_y + 4), 2 * REF_BLOCK_WIDTH, mad24(2, local_id_x, 3)))); average_slice(ref[1], *observe, restore, sum_weight, gain, threshold, sg_id, sg_lid); // mid-mid if (!load_observe) { average_slice((float8)(ref[0].hi, ref[1].lo), *observe, restore, sum_weight, gain, threshold, sg_id, sg_lid); } // bottom-left ref[0] = convert_float8(*(__local uchar8*)(p_ref + mad24((local_id_y + 8), 2 * REF_BLOCK_WIDTH, mad24(2, local_id_x, 1)))); average_slice(ref[0], *observe, restore, sum_weight, gain, threshold, sg_id, sg_lid); // bottom-right ref[1] = convert_float8(*(__local uchar8*)(p_ref + mad24((local_id_y + 8), 2 * REF_BLOCK_WIDTH, mad24(2, local_id_x, 3)))); average_slice(ref[1], *observe, restore, sum_weight, gain, threshold, sg_id, sg_lid); // bottom-mid average_slice((float8)(ref[0].hi, ref[1].lo), *observe, restore, sum_weight, gain, threshold, sg_id, sg_lid); } __kernel void kernel_3d_denoise ( float gain, float threshold, __read_only image2d_t restoredPrev, __write_only image2d_t output, __read_only image2d_t input, __read_only image2d_t inputPrev1, __read_only image2d_t inputPrev2) { float8 restore = 0.0f; float8 observe = 0.0f; float2 sum_weight = 0.0f; const int sg_id = get_sub_group_id(); const int sg_lid = (get_local_id(1) * WORKGROUP_WIDTH + get_local_id(0)) % 8; __local uchar8 ref_cache[REF_BLOCK_HEIGHT * REF_BLOCK_WIDTH]; weighted_average (input, ref_cache, true, &observe, &restore, &sum_weight, gain, threshold, sg_id, sg_lid); #if ENABLE_IIR_FILERING weighted_average (restoredPrev, ref_cache, false, &observe, &restore, &sum_weight, gain, threshold, sg_id, sg_lid); #else #if REFERENCE_FRAME_COUNT > 1 weighted_average (inputPrev1, ref_cache, false, &observe, &restore, &sum_weight, gain, threshold, sg_id, sg_lid); #endif #if REFERENCE_FRAME_COUNT > 2 weighted_average (inputPrev2, ref_cache, false, &observe, &restore, &sum_weight, gain, threshold, sg_id, sg_lid); #endif #endif restore.lo = restore.lo / sum_weight.lo; restore.hi = restore.hi / sum_weight.hi; int local_id_x = get_local_id(0); int local_id_y = get_local_id(1); const int group_id_x = get_group_id(0); const int group_id_y = get_group_id(1); #if WORKGROUP_WIDTH == 4 int2 pos = subgroup_pos(sg_id, sg_lid); local_id_x = pos.x; local_id_y = pos.y; #endif int coor_x = mad24(group_id_x, WORKGROUP_WIDTH, local_id_x); int coor_y = mad24(group_id_y, WORKGROUP_HEIGHT, local_id_y); write_imageui(output, (int2)(coor_x, coor_y), convert_uint4(as_ushort4(convert_uchar8(restore)))); }