| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include "kleidicv/ctypes.h" | ||
| 6 | #include "kleidicv/filters/median_blur.h" | ||
| 7 | #include "kleidicv/kleidicv.h" | ||
| 8 | #include "kleidicv/neon.h" | ||
| 9 | #include "median_blur_border_handling.h" | ||
| 10 | |||
| 11 | namespace kleidicv::neon { | ||
| 12 | |||
| 13 | template <bool is_single_channel> | ||
| 14 | class MedianBlurLargeHist { | ||
| 15 | public: | ||
| 16 | 140 | MedianBlurLargeHist(size_t channels, size_t kMargin) | |
| 17 | 280 | : patched_coarse{static_cast<uint16_t*>( | |
| 18 | 140 | malloc((16 + 256) * (patch_length + 2 * kMargin) * channels * | |
| 19 | sizeof(uint16_t)))}, | ||
| 20 | 280 | patched_fine{ | |
| 21 | 140 | &patched_coarse[16 * (patch_length + 2 * kMargin) * channels]}, | |
| 22 | 140 | H{}, | |
| 23 | 140 | luc{} {} | |
| 24 | |||
| 25 | 140 | ~MedianBlurLargeHist() { free(patched_coarse); } | |
| 26 | |||
| 27 | 140 | void process_pixels_without_horizontal_borders( | |
| 28 | Rectangle image_dimensions, Point starting_coordinates, | ||
| 29 | Point ending_coordinates, Rows<const uint8_t> src_rows, | ||
| 30 | Rows<uint8_t> dst_rows, size_t ksize, FixedBorderType border_type) { | ||
| 31 | 140 | const size_t kMargin = ksize / 2; | |
| 32 | |||
| 33 |
4/4✓ Branch 0 taken 76 times.
✓ Branch 1 taken 76 times.
✓ Branch 2 taken 64 times.
✓ Branch 3 taken 64 times.
|
280 | for (size_t w = starting_coordinates.x(); w < ending_coordinates.x(); |
| 34 | 140 | w += patch_length) { | |
| 35 | 280 | const size_t total_patch_span = | |
| 36 | 140 | std::min(ending_coordinates.x() - w, patch_length) + kMargin * 2; | |
| 37 | |||
| 38 | 280 | Rows<uint8_t> shifted_dst{ | |
| 39 | 140 | &dst_rows[0] + static_cast<ptrdiff_t>(w) * dst_rows.channels(), | |
| 40 | 140 | dst_rows.stride(), dst_rows.channels()}; | |
| 41 | |||
| 42 | 140 | clear_patched_histogram(total_patch_span * src_rows.channels()); | |
| 43 | |||
| 44 |
4/4✓ Branch 0 taken 76 times.
✓ Branch 1 taken 76 times.
✓ Branch 2 taken 192 times.
✓ Branch 3 taken 64 times.
|
408 | for (ptrdiff_t c = 0; c < static_cast<ptrdiff_t>(src_rows.channels()); |
| 45 | 268 | ++c) { | |
| 46 | 268 | clear_lookup_table(); | |
| 47 | |||
| 48 | // We initialize with ksize rows to allow merging of | ||
| 49 | // histogram increment and decrement operations in the main loop. | ||
| 50 | // This extra initial load enables a single update phase and avoids | ||
| 51 | // splitting the logic into separate steps. | ||
| 52 |
4/4✓ Branch 0 taken 1868 times.
✓ Branch 1 taken 76 times.
✓ Branch 2 taken 4992 times.
✓ Branch 3 taken 192 times.
|
7128 | for (size_t r = 0; r < ksize; ++r) { |
| 53 | 13720 | const ptrdiff_t valid_h = | |
| 54 | 13720 | get_physical_index(starting_coordinates.y() + r - kMargin, | |
| 55 | 6860 | image_dimensions.height(), border_type); | |
| 56 | 6860 | initialize_patched_histogram_without_horizontal_borders( | |
| 57 | 6860 | src_rows, c, valid_h, total_patch_span, w, kMargin); | |
| 58 | 6860 | } | |
| 59 | 536 | compute_patch_median_from_histogram(starting_coordinates.y(), c, | |
| 60 | 268 | total_patch_span, kMargin, ksize, | |
| 61 | 268 | shifted_dst); | |
| 62 | 268 | } | |
| 63 | |||
| 64 |
4/4✓ Branch 0 taken 2916 times.
✓ Branch 1 taken 76 times.
✓ Branch 2 taken 2656 times.
✓ Branch 3 taken 64 times.
|
5712 | for (size_t h = starting_coordinates.y() + 1; h < ending_coordinates.y(); |
| 65 | 5572 | ++h) { | |
| 66 | 11144 | const ptrdiff_t valid_old_h = get_physical_index( | |
| 67 | 5572 | h - kMargin - 1, image_dimensions.height(), border_type); | |
| 68 | |||
| 69 | 11144 | const ptrdiff_t valid_new_h = get_physical_index( | |
| 70 | 5572 | h + kMargin, image_dimensions.height(), border_type); | |
| 71 | |||
| 72 |
4/4✓ Branch 0 taken 2916 times.
✓ Branch 1 taken 2916 times.
✓ Branch 2 taken 7968 times.
✓ Branch 3 taken 2656 times.
|
16456 | for (ptrdiff_t c = 0; c < static_cast<ptrdiff_t>(src_rows.channels()); |
| 73 | 10884 | ++c) { | |
| 74 | 10884 | clear_lookup_table(); | |
| 75 | |||
| 76 | 10884 | update_patch_histogram_without_horizontal_borders( | |
| 77 | 10884 | src_rows, c, valid_old_h, valid_new_h, total_patch_span, w, | |
| 78 | 10884 | kMargin); | |
| 79 | |||
| 80 | 21768 | compute_patch_median_from_histogram(h, c, total_patch_span, kMargin, | |
| 81 | 10884 | ksize, shifted_dst); | |
| 82 | 10884 | } | |
| 83 | 5572 | } | |
| 84 | 140 | } | |
| 85 | 140 | } | |
| 86 | |||
| 87 | 280 | void process_pixels_with_horizontal_borders( | |
| 88 | Rectangle image_dimensions, Point starting_coordinates, | ||
| 89 | Point ending_coordinates, Rows<const uint8_t> src_rows, | ||
| 90 | Rows<uint8_t> dst_rows, size_t ksize, FixedBorderType border_type) { | ||
| 91 | 280 | const size_t kMargin = ksize / 2; | |
| 92 | |||
| 93 |
4/4✓ Branch 0 taken 152 times.
✓ Branch 1 taken 152 times.
✓ Branch 2 taken 128 times.
✓ Branch 3 taken 128 times.
|
560 | for (size_t w = starting_coordinates.x(); w < ending_coordinates.x(); |
| 94 | 280 | w += patch_length) { | |
| 95 | 560 | const size_t total_patch_span = | |
| 96 | 280 | std::min(ending_coordinates.x() - w, patch_length) + kMargin * 2; | |
| 97 | 560 | Rows<uint8_t> shifted_dst{ | |
| 98 | 280 | &dst_rows[0] + static_cast<ptrdiff_t>(w) * dst_rows.channels(), | |
| 99 | 280 | dst_rows.stride(), dst_rows.channels()}; | |
| 100 | |||
| 101 | 280 | clear_patched_histogram(total_patch_span * src_rows.channels()); | |
| 102 | |||
| 103 |
4/4✓ Branch 0 taken 152 times.
✓ Branch 1 taken 152 times.
✓ Branch 2 taken 384 times.
✓ Branch 3 taken 128 times.
|
816 | for (ptrdiff_t c = 0; c < static_cast<ptrdiff_t>(src_rows.channels()); |
| 104 | 536 | ++c) { | |
| 105 | 536 | clear_lookup_table(); | |
| 106 | // We initialize with ksize rows to allow merging of | ||
| 107 | // histogram increment and decrement operations in the main loop. | ||
| 108 | // This extra initial load enables a single update phase and avoids | ||
| 109 | // splitting the logic into separate steps. | ||
| 110 |
4/4✓ Branch 0 taken 3736 times.
✓ Branch 1 taken 152 times.
✓ Branch 2 taken 9984 times.
✓ Branch 3 taken 384 times.
|
14256 | for (size_t r = 0; r < ksize; ++r) { |
| 111 | 27440 | const ptrdiff_t valid_h = | |
| 112 | 27440 | get_physical_index(starting_coordinates.y() + r - kMargin, | |
| 113 | 13720 | image_dimensions.height(), border_type); | |
| 114 | 13720 | initialize_patched_histogram_with_horizontal_borders( | |
| 115 | 13720 | src_rows, c, valid_h, total_patch_span, w, kMargin, | |
| 116 | 13720 | image_dimensions.width(), border_type); | |
| 117 | 13720 | } | |
| 118 | 1072 | compute_patch_median_from_histogram(starting_coordinates.y(), c, | |
| 119 | 536 | total_patch_span, kMargin, ksize, | |
| 120 | 536 | shifted_dst); | |
| 121 | 536 | } | |
| 122 | |||
| 123 |
4/4✓ Branch 0 taken 5832 times.
✓ Branch 1 taken 152 times.
✓ Branch 2 taken 5312 times.
✓ Branch 3 taken 128 times.
|
11424 | for (size_t h = starting_coordinates.y() + 1; h < ending_coordinates.y(); |
| 124 | 11144 | ++h) { | |
| 125 | 22288 | const ptrdiff_t valid_old_h = get_physical_index( | |
| 126 | 11144 | h - kMargin - 1, image_dimensions.height(), border_type); | |
| 127 | 22288 | const ptrdiff_t valid_new_h = get_physical_index( | |
| 128 | 11144 | h + kMargin, image_dimensions.height(), border_type); | |
| 129 |
4/4✓ Branch 0 taken 5832 times.
✓ Branch 1 taken 5832 times.
✓ Branch 2 taken 15936 times.
✓ Branch 3 taken 5312 times.
|
32912 | for (ptrdiff_t c = 0; c < static_cast<ptrdiff_t>(src_rows.channels()); |
| 130 | 21768 | ++c) { | |
| 131 | 21768 | clear_lookup_table(); | |
| 132 | |||
| 133 | 21768 | update_patch_histogram_with_horizontal_borders( | |
| 134 | 21768 | src_rows, c, valid_old_h, valid_new_h, total_patch_span, w, | |
| 135 | 21768 | kMargin, image_dimensions.width(), border_type); | |
| 136 | |||
| 137 | 43536 | compute_patch_median_from_histogram(h, c, total_patch_span, kMargin, | |
| 138 | 21768 | ksize, shifted_dst); | |
| 139 | 21768 | } | |
| 140 | 11144 | } | |
| 141 | 280 | } | |
| 142 | 280 | } | |
| 143 | |||
| 144 | private: | ||
| 145 | // Histogram Buffer Layout Explanation | ||
| 146 | // ----------------------------------- | ||
| 147 | // patched_coarse: | ||
| 148 | // - Conceptually a 3D array: | ||
| 149 | // coarse[channel_idx][patch_offset][coarse_bin] | ||
| 150 | // - coarse_bin = incoming_pixel >> 4 ∈ [0, 15] | ||
| 151 | // - Flattened as: | ||
| 152 | // coarse_offset = 16 * (patch_length * channel_idx + patch_offset) | ||
| 153 | // + (incoming_pixel >> 4); | ||
| 154 | // | ||
| 155 | // patched_fine: | ||
| 156 | // - Conceptually a 4D array: | ||
| 157 | // fine[channel_idx][coarse_bin][patch_offset][fine_bin] | ||
| 158 | // - coarse_bin = incoming_pixel >> 4 ∈ [0, 15] | ||
| 159 | // - fine_bin = incoming_pixel & 0xF ∈ [0, 15] | ||
| 160 | // - Flattened as: | ||
| 161 | // fine_offset = 16 * (patch_length * (16 * channel_idx + coarse_bin) | ||
| 162 | // + patch_offset) + fine_bin; | ||
| 163 | // | ||
| 164 | // This layout enables fast linear access while preserving the hierarchical | ||
| 165 | // structure of histograms per channel and patch position. | ||
| 166 | constexpr static size_t patch_length = 512; | ||
| 167 | uint16_t* patched_coarse; | ||
| 168 | uint16_t* patched_fine; | ||
| 169 | |||
| 170 | // Clear only the portion of the patched histogram buffers that will be used | ||
| 171 | // for the current patch. | ||
| 172 | // Since these buffers are large, there's no need to zero out the entire | ||
| 173 | // allocation— only the section relevant to the current total patch size is | ||
| 174 | // cleared for efficiency. | ||
| 175 | 420 | void clear_patched_histogram(size_t total_patch_size) { | |
| 176 | 840 | std::memset(patched_coarse, 0, | |
| 177 | 420 | 16 * total_patch_size * sizeof(patched_coarse[0])); | |
| 178 | 840 | std::memset(patched_fine, 0, | |
| 179 | 420 | 256 * total_patch_size * sizeof(patched_fine[0])); | |
| 180 | 420 | } | |
| 181 | |||
| 182 | 892320 | void scalar_initialize_patched_histogram(int incoming_pixel, | |
| 183 | size_t channel_idx, | ||
| 184 | size_t patch_length, | ||
| 185 | size_t patch_offset) { | ||
| 186 | 1784640 | const size_t coarse_offset = | |
| 187 | 1784640 | 16 * (patch_length * channel_idx + patch_offset) + | |
| 188 | 892320 | (incoming_pixel >> 4); | |
| 189 | 1784640 | const size_t fine_offset = | |
| 190 | 1784640 | 16 * (patch_length * (16 * channel_idx + (incoming_pixel >> 4)) + | |
| 191 | 1784640 | patch_offset) + | |
| 192 | 892320 | (incoming_pixel & 0xF); | |
| 193 | 892320 | patched_coarse[coarse_offset]++; | |
| 194 | 892320 | patched_fine[fine_offset]++; | |
| 195 | 892320 | } | |
| 196 | |||
| 197 | 5400 | void vector_initialize_patched_histogram(uint8x16_t& incoming_pixels, | |
| 198 | size_t channel_idx, | ||
| 199 | size_t patch_length, | ||
| 200 | size_t patch_offset) { | ||
| 201 | KLEIDICV_FORCE_LOOP_UNROLL | ||
| 202 |
2/2✓ Branch 0 taken 5400 times.
✓ Branch 1 taken 86400 times.
|
91800 | for (int i = 0; i < 16; i++) { |
| 203 | 172800 | const size_t coarse_offset_incoming = | |
| 204 | 172800 | 16 * (patch_length * channel_idx + patch_offset + i) + | |
| 205 | 86400 | (incoming_pixels[i] >> 4); | |
| 206 | |||
| 207 | 172800 | const size_t fine_offset_incoming = | |
| 208 | 259200 | 16 * (patch_length * (16 * channel_idx + (incoming_pixels[i] >> 4)) + | |
| 209 | 259200 | patch_offset + i) + | |
| 210 | 86400 | (incoming_pixels[i] & 0xF); | |
| 211 | |||
| 212 | 86400 | patched_coarse[coarse_offset_incoming]++; | |
| 213 | 86400 | patched_fine[fine_offset_incoming]++; | |
| 214 | 86400 | } | |
| 215 | 5400 | } | |
| 216 | |||
| 217 | 6860 | void initialize_patched_histogram_without_horizontal_borders( | |
| 218 | Rows<const uint8_t> src_rows, ptrdiff_t c, ptrdiff_t valid_h, | ||
| 219 | size_t total_patch_span, size_t starting_width, size_t kMargin) { | ||
| 220 | 6860 | size_t vector_part = 0; | |
| 221 | if constexpr (is_single_channel) { | ||
| 222 | 1868 | vector_part = (total_patch_span >> 4) << 4; | |
| 223 |
2/2✓ Branch 0 taken 5400 times.
✓ Branch 1 taken 1868 times.
|
7268 | for (size_t patch_offset = 0; patch_offset < vector_part; |
| 224 | 5400 | patch_offset += 16) { | |
| 225 | 10800 | const ptrdiff_t valid_w = | |
| 226 | 5400 | static_cast<ptrdiff_t>(starting_width + patch_offset - kMargin); | |
| 227 | 5400 | auto incoming_pixels = vld1q_u8(&src_rows.at(valid_h, valid_w)[c]); | |
| 228 | 10800 | vector_initialize_patched_histogram(incoming_pixels, c, | |
| 229 | 5400 | total_patch_span, patch_offset); | |
| 230 | 5400 | } | |
| 231 | } | ||
| 232 | |||
| 233 |
4/4✓ Branch 0 taken 1868 times.
✓ Branch 1 taken 21600 times.
✓ Branch 2 taken 4992 times.
✓ Branch 3 taken 299520 times.
|
327980 | for (size_t patch_offset = vector_part; patch_offset < total_patch_span; |
| 234 | 321120 | ++patch_offset) { | |
| 235 | 642240 | const ptrdiff_t valid_w = | |
| 236 | 321120 | static_cast<ptrdiff_t>(starting_width + patch_offset - kMargin); | |
| 237 | 321120 | auto incoming_pixel = src_rows.at(valid_h, valid_w)[c]; | |
| 238 | |||
| 239 | 642240 | scalar_initialize_patched_histogram(incoming_pixel, c, total_patch_span, | |
| 240 | 321120 | patch_offset); | |
| 241 | 321120 | } | |
| 242 | 6860 | } | |
| 243 | |||
| 244 | 13720 | void initialize_patched_histogram_with_horizontal_borders( | |
| 245 | Rows<const uint8_t> src_rows, ptrdiff_t c, ptrdiff_t valid_h, | ||
| 246 | size_t total_patch_span, size_t starting_width, size_t kMargin, | ||
| 247 | size_t width, FixedBorderType border_type) { | ||
| 248 |
4/4✓ Branch 0 taken 3736 times.
✓ Branch 1 taken 150144 times.
✓ Branch 2 taken 9984 times.
✓ Branch 3 taken 421056 times.
|
584920 | for (size_t patch_offset = 0; patch_offset < total_patch_span; |
| 249 | 571200 | ++patch_offset) { | |
| 250 | 1142400 | ptrdiff_t valid_w = get_physical_index( | |
| 251 | 571200 | starting_width + patch_offset - kMargin, width, border_type); | |
| 252 | |||
| 253 | 571200 | auto incoming_pixel = src_rows.at(valid_h, valid_w)[c]; | |
| 254 | |||
| 255 | 1142400 | scalar_initialize_patched_histogram(incoming_pixel, c, total_patch_span, | |
| 256 | 571200 | patch_offset); | |
| 257 | 571200 | } | |
| 258 | 13720 | } | |
| 259 | |||
| 260 | // During vertical traversal (the main 'height' loop), each sliding window | ||
| 261 | // iteration introduces a new incoming row and removes an outgoing one. The | ||
| 262 | // histogram must be updated accordingly by subtracting the contributions of | ||
| 263 | // the outgoing row and adding those of the incoming row. | ||
| 264 | // Both increment and decrement operations are handled inside the same | ||
| 265 | // function. | ||
| 266 | 1352416 | void scalar_update_patch_histogram(int outgoing_pixel, int incoming_pixel, | |
| 267 | size_t channel_idx, size_t patch_length, | ||
| 268 | size_t patch_offset) { | ||
| 269 | 2704832 | const size_t coarse_offset_base = | |
| 270 | 1352416 | 16 * (patch_length * channel_idx + patch_offset); | |
| 271 | 2704832 | const size_t fine_offset_base = | |
| 272 | 1352416 | 16 * patch_length * 16 * channel_idx + 16 * patch_offset; | |
| 273 | |||
| 274 | 1352416 | const size_t pixel_new_shift_right_4 = (incoming_pixel >> 4); | |
| 275 | 1352416 | const size_t mask_new_pixel = (incoming_pixel & 0xF); | |
| 276 | 1352416 | const size_t pixel_old_shift_right_4 = (outgoing_pixel >> 4); | |
| 277 | 1352416 | const size_t mask_old_pixel = (outgoing_pixel & 0xF); | |
| 278 | |||
| 279 | 2704832 | const size_t fine_new_offset = fine_offset_base + mask_new_pixel + | |
| 280 | 1352416 | 16 * patch_length * pixel_new_shift_right_4; | |
| 281 | 2704832 | const size_t coarse_new_offset = | |
| 282 | 1352416 | coarse_offset_base + pixel_new_shift_right_4; | |
| 283 | |||
| 284 | 2704832 | const size_t fine_old_offset = fine_offset_base + mask_old_pixel + | |
| 285 | 1352416 | 16 * patch_length * pixel_old_shift_right_4; | |
| 286 | 2704832 | const size_t coarse_old_offset = | |
| 287 | 1352416 | coarse_offset_base + pixel_old_shift_right_4; | |
| 288 | |||
| 289 | 1352416 | patched_coarse[coarse_new_offset]++; | |
| 290 | 1352416 | patched_coarse[coarse_old_offset]--; | |
| 291 | 1352416 | patched_fine[fine_new_offset]++; | |
| 292 | 1352416 | patched_fine[fine_old_offset]--; | |
| 293 | 1352416 | } | |
| 294 | |||
| 295 | 8488 | void vector_update_patch_histogram(uint8x16_t& outgoing_pixels, | |
| 296 | uint8x16_t& incoming_pixels, | ||
| 297 | size_t channel_idx, size_t patch_length, | ||
| 298 | size_t patch_offset) { | ||
| 299 | KLEIDICV_FORCE_LOOP_UNROLL | ||
| 300 |
2/2✓ Branch 0 taken 8488 times.
✓ Branch 1 taken 135808 times.
|
144296 | for (int i = 0; i < 16; i++) { |
| 301 | 271616 | const size_t coarse_offset_incoming = | |
| 302 | 271616 | 16 * (patch_length * channel_idx + patch_offset + i) + | |
| 303 | 135808 | (incoming_pixels[i] >> 4); | |
| 304 | |||
| 305 | 271616 | const size_t coarse_offset_outgoing = | |
| 306 | 271616 | 16 * (patch_length * channel_idx + patch_offset + i) + | |
| 307 | 135808 | (outgoing_pixels[i] >> 4); | |
| 308 | |||
| 309 | 271616 | const size_t fine_offset_incoming = | |
| 310 | 407424 | 16 * (patch_length * (16 * channel_idx + (incoming_pixels[i] >> 4)) + | |
| 311 | 407424 | patch_offset + i) + | |
| 312 | 135808 | (incoming_pixels[i] & 0xF); | |
| 313 | |||
| 314 | 271616 | const size_t fine_offset_outgoing = | |
| 315 | 407424 | 16 * (patch_length * (16 * channel_idx + (outgoing_pixels[i] >> 4)) + | |
| 316 | 407424 | patch_offset + i) + | |
| 317 | 135808 | (outgoing_pixels[i] & 0xF); | |
| 318 | |||
| 319 | 135808 | patched_coarse[coarse_offset_incoming]++; | |
| 320 | 135808 | patched_coarse[coarse_offset_outgoing]--; | |
| 321 | 135808 | patched_fine[fine_offset_incoming]++; | |
| 322 | 135808 | patched_fine[fine_offset_outgoing]--; | |
| 323 | 135808 | } | |
| 324 | 8488 | } | |
| 325 | |||
| 326 | 10884 | void update_patch_histogram_without_horizontal_borders( | |
| 327 | Rows<const uint8_t> src_rows, ptrdiff_t c, ptrdiff_t valid_old_h, | ||
| 328 | ptrdiff_t valid_new_h, size_t total_patch_span, size_t starting_width, | ||
| 329 | size_t kMargin) { | ||
| 330 | 10884 | size_t vector_part = 0; | |
| 331 | if constexpr (is_single_channel) { | ||
| 332 | 2916 | vector_part = (total_patch_span >> 4) << 4; | |
| 333 |
2/2✓ Branch 0 taken 8488 times.
✓ Branch 1 taken 2916 times.
|
11404 | for (size_t patch_offset = 0; patch_offset < vector_part; |
| 334 | 8488 | patch_offset += 16) { | |
| 335 | 16976 | const ptrdiff_t valid_w = | |
| 336 | 8488 | static_cast<ptrdiff_t>(starting_width + patch_offset - kMargin); | |
| 337 | 8488 | auto outgoing_pixels = vld1q_u8(&src_rows.at(valid_old_h, valid_w)[c]); | |
| 338 | 8488 | auto incoming_pixels = vld1q_u8(&src_rows.at(valid_new_h, valid_w)[c]); | |
| 339 | 16976 | vector_update_patch_histogram(outgoing_pixels, incoming_pixels, c, | |
| 340 | 8488 | total_patch_span, patch_offset); | |
| 341 | 8488 | } | |
| 342 | } | ||
| 343 | |||
| 344 |
4/4✓ Branch 0 taken 2916 times.
✓ Branch 1 taken 33952 times.
✓ Branch 2 taken 7968 times.
✓ Branch 3 taken 478080 times.
|
522916 | for (size_t patch_offset = vector_part; patch_offset < total_patch_span; |
| 345 | 512032 | ++patch_offset) { | |
| 346 | 1024064 | const ptrdiff_t valid_w = | |
| 347 | 512032 | static_cast<ptrdiff_t>(starting_width + patch_offset - kMargin); | |
| 348 | 512032 | auto outgoing_pixel = src_rows.at(valid_old_h, valid_w)[c]; | |
| 349 | 512032 | auto incoming_pixel = src_rows.at(valid_new_h, valid_w)[c]; | |
| 350 | 1024064 | scalar_update_patch_histogram(outgoing_pixel, incoming_pixel, c, | |
| 351 | 512032 | total_patch_span, patch_offset); | |
| 352 | 512032 | } | |
| 353 | 10884 | } | |
| 354 | |||
| 355 | 21768 | void update_patch_histogram_with_horizontal_borders( | |
| 356 | Rows<const uint8_t> src_rows, ptrdiff_t c, ptrdiff_t valid_old_h, | ||
| 357 | ptrdiff_t valid_new_h, size_t total_patch_span, size_t starting_width, | ||
| 358 | size_t kMargin, size_t width, FixedBorderType border_type) { | ||
| 359 |
4/4✓ Branch 0 taken 5832 times.
✓ Branch 1 taken 219456 times.
✓ Branch 2 taken 15936 times.
✓ Branch 3 taken 620928 times.
|
862152 | for (size_t patch_offset = 0; patch_offset < total_patch_span; |
| 360 | 840384 | ++patch_offset) { | |
| 361 | 1680768 | const ptrdiff_t valid_w = get_physical_index( | |
| 362 | 840384 | starting_width + patch_offset - kMargin, width, border_type); | |
| 363 | 840384 | auto outgoing_pixel = src_rows.at(valid_old_h, valid_w)[c]; | |
| 364 | 840384 | auto incoming_pixel = src_rows.at(valid_new_h, valid_w)[c]; | |
| 365 | 1680768 | scalar_update_patch_histogram(outgoing_pixel, incoming_pixel, c, | |
| 366 | 840384 | total_patch_span, patch_offset); | |
| 367 | 840384 | } | |
| 368 | 21768 | } | |
| 369 | |||
| 370 | // `H` and `luc` are used for computing the median value for a single | ||
| 371 | // output pixel: | ||
| 372 | // - `H` is a histogram structure holding both the coarse and fine | ||
| 373 | // bins needed | ||
| 374 | // for the current element's median calculation. | ||
| 375 | // - `luc` is a lookup table that stores the last processed offset | ||
| 376 | // (index) | ||
| 377 | // for each coarse bin. This allows incremental fine histogram | ||
| 378 | // updates instead of full recalculation when histogram overlap is | ||
| 379 | // high. Since neighboring patches in natural images often have | ||
| 380 | // similar pixel values, reusing previous histogram state can | ||
| 381 | // significantly reduce processing time. | ||
| 382 | typedef struct { | ||
| 383 | uint16_t coarse[16]; | ||
| 384 | uint16_t fine[16][16]; | ||
| 385 | } Histogram; | ||
| 386 | |||
| 387 | Histogram H; | ||
| 388 | uint16_t luc[16]; | ||
| 389 | |||
| 390 | 33456 | void clear_lookup_table(void) { | |
| 391 | 33456 | std::memset(&H, 0, sizeof(Histogram)); | |
| 392 | 33456 | std::memset(luc, 0, 16 * sizeof(uint16_t)); | |
| 393 | 33456 | } | |
| 394 | |||
| 395 | 33456 | void initialize_coarse(uint16_t* px, uint16x8_t& v_coarsel, | |
| 396 | uint16x8_t& v_coarseh, size_t kMargin) { | ||
| 397 |
4/4✓ Branch 0 taken 8976 times.
✓ Branch 1 taken 224832 times.
✓ Branch 2 taken 24480 times.
✓ Branch 3 taken 635328 times.
|
893616 | for (size_t i = 0; i < 2 * kMargin; ++i, px += 16) { |
| 398 | 860160 | v_coarsel = vaddq_u16(v_coarsel, vld1q_u16(px)); | |
| 399 | 860160 | v_coarseh = vaddq_u16(v_coarseh, vld1q_u16(px + 8)); | |
| 400 | 860160 | } | |
| 401 | 33456 | } | |
| 402 | |||
| 403 | 663680 | void increment_coarse(uint16_t* px, uint16x8_t& v_coarsel, | |
| 404 | uint16x8_t& v_coarseh) { | ||
| 405 | 663680 | v_coarsel = vaddq_u16(v_coarsel, vld1q_u16(px)); | |
| 406 | 663680 | v_coarseh = vaddq_u16(v_coarseh, vld1q_u16(px + 8)); | |
| 407 | 663680 | vst1q_u16(H.coarse, v_coarsel); | |
| 408 | 663680 | vst1q_u16(H.coarse + 8, v_coarseh); | |
| 409 | 663680 | } | |
| 410 | |||
| 411 | 663680 | void decrement_coarse(uint16_t* px, uint16x8_t& v_coarsel, | |
| 412 | uint16x8_t& v_coarseh) { | ||
| 413 | 663680 | v_coarsel = vsubq_u16(v_coarsel, vld1q_u16(px)); | |
| 414 | 663680 | v_coarseh = vsubq_u16(v_coarseh, vld1q_u16(px + 8)); | |
| 415 | 663680 | } | |
| 416 | |||
| 417 | 60503 | void initialize_fine(uint16_t* px, uint16x8_t& v_finel, uint16x8_t& v_fineh, | |
| 418 | uint16_t& luc, size_t patch_offset, size_t kMargin, | ||
| 419 | size_t total_patch_span) { | ||
| 420 |
4/4✓ Branch 0 taken 414508 times.
✓ Branch 1 taken 16520 times.
✓ Branch 2 taken 1143243 times.
✓ Branch 3 taken 43983 times.
|
1618254 | for (luc = static_cast<uint16_t>(patch_offset - kMargin); |
| 421 | 3236508 | luc < static_cast<uint16_t>( | |
| 422 | 1618254 | std::min(patch_offset + kMargin + 1, total_patch_span)); | |
| 423 | 1557751 | ++luc, px += 16) { | |
| 424 | 1557751 | v_finel = vaddq_u16(v_finel, vld1q_u16(px)); | |
| 425 | 1557751 | v_fineh = vaddq_u16(v_fineh, vld1q_u16(px + 8)); | |
| 426 | 1557751 | } | |
| 427 | 60503 | } | |
| 428 | |||
| 429 | 603177 | void update_fine(uint16_t* px, uint16x8_t& v_finel, uint16x8_t& v_fineh, | |
| 430 | uint16_t& luc, size_t patch_offset, size_t kMargin, | ||
| 431 | size_t total_patch_span) { | ||
| 432 |
4/4✓ Branch 0 taken 190121 times.
✓ Branch 1 taken 157560 times.
✓ Branch 2 taken 535241 times.
✓ Branch 3 taken 445617 times.
|
1328539 | for (; luc < static_cast<uint16_t>(patch_offset + kMargin + 1); ++luc) { |
| 433 | 725362 | constexpr ptrdiff_t stride = 16; | |
| 434 | 1450724 | const ptrdiff_t patch_span_limit = | |
| 435 | 725362 | static_cast<ptrdiff_t>(total_patch_span - 1); | |
| 436 | 725362 | const ptrdiff_t safe_luc = static_cast<ptrdiff_t>(luc); | |
| 437 | |||
| 438 | 1450724 | const ptrdiff_t base_offset = | |
| 439 | 725362 | stride * std::min(safe_luc, patch_span_limit); | |
| 440 | 1450724 | const ptrdiff_t old_offset = | |
| 441 | 1450724 | stride * std::max(safe_luc - static_cast<ptrdiff_t>(2 * kMargin + 1), | |
| 442 | 725362 | ptrdiff_t{0}); | |
| 443 | 725362 | const uint16x8_t new_vecl = vld1q_u16(px + base_offset); | |
| 444 | 725362 | const uint16x8_t new_vech = vld1q_u16(px + base_offset + 8); | |
| 445 | 725362 | const uint16x8_t old_vecl = vld1q_u16(px + old_offset); | |
| 446 | 725362 | const uint16x8_t old_vech = vld1q_u16(px + old_offset + 8); | |
| 447 | 725362 | v_finel = vsubq_u16(vaddq_u16(v_finel, new_vecl), old_vecl); | |
| 448 | 725362 | v_fineh = vsubq_u16(vaddq_u16(v_fineh, new_vech), old_vech); | |
| 449 | 725362 | } | |
| 450 | 603177 | } | |
| 451 | |||
| 452 | 663680 | size_t find_coarse_index(size_t& cdf, size_t median_index) { | |
| 453 | 663680 | size_t coarse_index = 0; | |
| 454 | 5700229 | while (true) { | |
| 455 | 5700229 | cdf += H.coarse[coarse_index]; | |
| 456 |
4/4✓ Branch 0 taken 174080 times.
✓ Branch 1 taken 1320228 times.
✓ Branch 2 taken 489600 times.
✓ Branch 3 taken 3716321 times.
|
5700229 | if (cdf > median_index) { |
| 457 | 663680 | cdf -= H.coarse[coarse_index]; | |
| 458 | 663680 | break; | |
| 459 | } | ||
| 460 | 5036549 | coarse_index++; | |
| 461 | } | ||
| 462 | 1327360 | return coarse_index; | |
| 463 | 663680 | } | |
| 464 | |||
| 465 | 663680 | uint8_t find_median(size_t& cdf, size_t median_index, size_t coarse_index) { | |
| 466 | 663680 | uint16_t* segment = H.fine[coarse_index]; | |
| 467 | 663680 | size_t fine_index = 0; | |
| 468 | 5347359 | while (true) { | |
| 469 | 5347359 | cdf += segment[fine_index]; | |
| 470 |
4/4✓ Branch 0 taken 174080 times.
✓ Branch 1 taken 1233097 times.
✓ Branch 2 taken 489600 times.
✓ Branch 3 taken 3450582 times.
|
5347359 | if (cdf > median_index) { |
| 471 | 663680 | fine_index = (16 * coarse_index + fine_index); | |
| 472 | 663680 | break; | |
| 473 | } | ||
| 474 | 4683679 | fine_index++; | |
| 475 | } | ||
| 476 | 1327360 | return uint8_t(fine_index); | |
| 477 | 663680 | } | |
| 478 | |||
| 479 | 33456 | void compute_patch_median_from_histogram(size_t h, ptrdiff_t c, | |
| 480 | size_t total_patch_span, | ||
| 481 | size_t kMargin, size_t ksize, | ||
| 482 | Rows<uint8_t> dst) { | ||
| 483 | 33456 | uint16x8_t v_coarsel = vld1q_u16(H.coarse); | |
| 484 | 33456 | uint16x8_t v_coarseh = vld1q_u16(H.coarse + 8); | |
| 485 | |||
| 486 | // Before starting the main patch loop to compute medians for each element, | ||
| 487 | // we initialize the coarse histogram buffer with the first (ksize - 1). | ||
| 488 | // This allows each subsequent iteration in the patch loop to perform only | ||
| 489 | // one addition and one subtraction. | ||
| 490 | 33456 | uint16_t* px = patched_coarse + 16 * total_patch_span * c; | |
| 491 | 33456 | initialize_coarse(px, v_coarsel, v_coarseh, kMargin); | |
| 492 | |||
| 493 |
4/4✓ Branch 0 taken 8976 times.
✓ Branch 1 taken 174080 times.
✓ Branch 2 taken 24480 times.
✓ Branch 3 taken 489600 times.
|
697136 | for (size_t patch_offset = kMargin; |
| 494 | 697136 | patch_offset < total_patch_span - kMargin; patch_offset++) { | |
| 495 | 663680 | size_t median_index = (ksize * ksize) / 2, cdf = 0; | |
| 496 | |||
| 497 | 1327360 | px = patched_coarse + | |
| 498 | 1327360 | 16 * (total_patch_span * c + | |
| 499 | 663680 | std::min(patch_offset + kMargin, total_patch_span - 1)); | |
| 500 | 663680 | increment_coarse(px, v_coarsel, v_coarseh); | |
| 501 | |||
| 502 | // Find median at coarse level | ||
| 503 | 663680 | size_t coarse_index = find_coarse_index(cdf, median_index); | |
| 504 | |||
| 505 | 663680 | uint16x8_t v_finel; | |
| 506 | 663680 | uint16x8_t v_fineh; | |
| 507 | // Check whether the fine histogram (H.fine[coarse_index]) for the | ||
| 508 | // current patch position needs to be freshly initialized or can be | ||
| 509 | // incrementally updated. This decision hinges on the `luc` (Last Used | ||
| 510 | // Coordinate) table, which records the last horizontal patch offset | ||
| 511 | // processed for each coarse bin. | ||
| 512 | // | ||
| 513 | // The condition is true in two scenarios: | ||
| 514 | // | ||
| 515 | // 1. **First-Time Initialization**: | ||
| 516 | // - This is the first time we are accessing this coarse bin | ||
| 517 | // (`coarse_index`) at the current patch position. | ||
| 518 | // - We compute the full fine histogram from scratch by summing | ||
| 519 | // over `ksize` rows centered at the patch position. | ||
| 520 | // - We accumulate the results into the `v_finel` and `v_fineh` | ||
| 521 | // vector registers. | ||
| 522 | // - These vectors are then stored into `H.fine[coarse_index]`. | ||
| 523 | // - The `luc` table is updated to reflect the last index | ||
| 524 | // processed. | ||
| 525 | // | ||
| 526 | // 2. **Window Movement Causes Loss of Overlap**: | ||
| 527 | // - The sliding window has moved enough that it no longer | ||
| 528 | // sufficiently overlaps the region | ||
| 529 | // used to compute the previously cached fine histogram (i.e., | ||
| 530 | // `luc[coarse_index]` is too far behind). | ||
| 531 | // - We must reinitialize the fine histogram to ensure accuracy. | ||
| 532 | // | ||
| 533 | // Otherwise: | ||
| 534 | // | ||
| 535 | // - We reuse the previously computed fine histogram stored in | ||
| 536 | // `H.fine[coarse_index]`. | ||
| 537 | // - We only update it incrementally using the `update_fine()` | ||
| 538 | // function, which: | ||
| 539 | // - Adds the new values entering the window. | ||
| 540 | // - Subtracts the values leaving the window. | ||
| 541 | // - This avoids the need for a full re-scan, leveraging temporal | ||
| 542 | // locality between neighboring pixels. | ||
| 543 | // | ||
| 544 | // This lookup-based optimization significantly improves performance, | ||
| 545 | // as neighboring filter windows often overlap heavily—especially for | ||
| 546 | // small strides and moderate kernel sizes. | ||
| 547 | // | ||
| 548 | // After this step, the fine histogram is ready. The next phase scans | ||
| 549 | // `H.fine[coarse_index]` to identify the fine bin where the | ||
| 550 | // cumulative sum crosses the median threshold. This gives us the | ||
| 551 | // final median value for the output pixel. | ||
| 552 |
4/4✓ Branch 0 taken 16520 times.
✓ Branch 1 taken 157560 times.
✓ Branch 2 taken 43983 times.
✓ Branch 3 taken 445617 times.
|
663680 | if (luc[coarse_index] <= patch_offset - kMargin) { |
| 553 | 60503 | v_finel = vdupq_n_u16(0); | |
| 554 | 60503 | v_fineh = vdupq_n_u16(0); | |
| 555 | 121006 | px = patched_fine + static_cast<ptrdiff_t>(16) * | |
| 556 | 121006 | (static_cast<ptrdiff_t>(total_patch_span) * | |
| 557 | 121006 | (16 * c + coarse_index) + | |
| 558 | 121006 | patch_offset - kMargin); | |
| 559 | |||
| 560 | 121006 | initialize_fine(px, v_finel, v_fineh, luc[coarse_index], patch_offset, | |
| 561 | 60503 | kMargin, total_patch_span); | |
| 562 | |||
| 563 | 60503 | } else { | |
| 564 | 603177 | v_finel = vld1q_u16(H.fine[coarse_index]); | |
| 565 | 603177 | v_fineh = vld1q_u16(H.fine[coarse_index] + 8); | |
| 566 | 603177 | px = patched_fine + 16 * total_patch_span * (16 * c + coarse_index); | |
| 567 | 1206354 | update_fine(px, v_finel, v_fineh, luc[coarse_index], patch_offset, | |
| 568 | 603177 | kMargin, total_patch_span); | |
| 569 | } | ||
| 570 | |||
| 571 | 1327360 | px = patched_coarse + | |
| 572 | 663680 | static_cast<ptrdiff_t>(16) * | |
| 573 | 1327360 | (total_patch_span * c + | |
| 574 | 663680 | std::max(patch_offset - kMargin, static_cast<size_t>(0))); | |
| 575 | |||
| 576 | 663680 | vst1q_u16(H.fine[coarse_index], v_finel); | |
| 577 | 663680 | vst1q_u16(H.fine[coarse_index] + 8, v_fineh); | |
| 578 | |||
| 579 | 663680 | decrement_coarse(px, v_coarsel, v_coarseh); | |
| 580 | |||
| 581 | // Find median at fine level | ||
| 582 | 2654720 | dst.at(static_cast<ptrdiff_t>(h), | |
| 583 | 1991040 | static_cast<ptrdiff_t>(patch_offset - kMargin))[c] = | |
| 584 | 663680 | find_median(cdf, median_index, coarse_index); | |
| 585 | 663680 | } | |
| 586 | 33456 | } | |
| 587 | }; | ||
| 588 | |||
| 589 | template <bool is_single_channel> | ||
| 590 | 140 | void median_process(Rectangle image_dimensions, Rows<const uint8_t> src_rows, | |
| 591 | Rows<uint8_t> dst_rows, size_t y_begin, size_t y_end, | ||
| 592 | size_t kernel_width, size_t kernel_height, | ||
| 593 | FixedBorderType border_type) { | ||
| 594 | 140 | size_t kMargin = (kernel_width - 1) / 2; | |
| 595 | 280 | MedianBlurLargeHist<is_single_channel> median_filter{src_rows.channels(), | |
| 596 | 140 | kMargin}; | |
| 597 | |||
| 598 | // Process left border | ||
| 599 | 140 | size_t starting_width = 0; | |
| 600 | 140 | size_t processing_left_width = kMargin; | |
| 601 | 140 | Point starting_left_coordinates{starting_width, y_begin}; | |
| 602 | 140 | Point ending_left_coordinates{starting_width + processing_left_width, y_end}; | |
| 603 | 140 | median_filter.process_pixels_with_horizontal_borders( | |
| 604 | 140 | image_dimensions, starting_left_coordinates, ending_left_coordinates, | |
| 605 | 140 | src_rows, dst_rows, kernel_height, border_type); | |
| 606 | |||
| 607 | // Process center region | ||
| 608 | 140 | starting_width = processing_left_width; | |
| 609 | 140 | size_t processing_center_width = image_dimensions.width() - 2 * kMargin; | |
| 610 | 140 | Point starting_center_coordinates{starting_width, y_begin}; | |
| 611 | 280 | Point ending_center_coordinates{starting_width + processing_center_width, | |
| 612 | 140 | y_end}; | |
| 613 | 140 | median_filter.process_pixels_without_horizontal_borders( | |
| 614 | 140 | image_dimensions, starting_center_coordinates, ending_center_coordinates, | |
| 615 | 140 | src_rows, dst_rows, kernel_height, border_type); | |
| 616 | |||
| 617 | // Process right border | ||
| 618 | 140 | starting_width = processing_left_width + processing_center_width; | |
| 619 | 420 | size_t processing_right_width = image_dimensions.width() - | |
| 620 | 280 | processing_left_width - | |
| 621 | 140 | processing_center_width; | |
| 622 | 140 | Point starting_right_coordinates{starting_width, y_begin}; | |
| 623 | 280 | Point ending_right_coordinates{starting_width + processing_right_width, | |
| 624 | 140 | y_end}; | |
| 625 | 140 | median_filter.process_pixels_with_horizontal_borders( | |
| 626 | 140 | image_dimensions, starting_right_coordinates, ending_right_coordinates, | |
| 627 | 140 | src_rows, dst_rows, kernel_height, border_type); | |
| 628 | 140 | } | |
| 629 | |||
| 630 | 140 | KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t median_blur_large_hist_stripe_u8( | |
| 631 | const uint8_t* src, size_t src_stride, uint8_t* dst, size_t dst_stride, | ||
| 632 | size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, | ||
| 633 | size_t kernel_width, size_t kernel_height, FixedBorderType border_type) { | ||
| 634 | 140 | Rectangle image_dimensions{width, height}; | |
| 635 | 140 | Rows<const uint8_t> src_rows{src, src_stride, channels}; | |
| 636 | 140 | Rows<uint8_t> dst_rows{dst, dst_stride, channels}; | |
| 637 | |||
| 638 |
2/2✓ Branch 0 taken 76 times.
✓ Branch 1 taken 64 times.
|
140 | if (channels == 1) { |
| 639 | 152 | median_process<true>(image_dimensions, src_rows, dst_rows, y_begin, y_end, | |
| 640 | 76 | kernel_width, kernel_height, border_type); | |
| 641 | 76 | } else { | |
| 642 | 128 | median_process<false>(image_dimensions, src_rows, dst_rows, y_begin, y_end, | |
| 643 | 64 | kernel_width, kernel_height, border_type); | |
| 644 | } | ||
| 645 | |||
| 646 | 140 | return KLEIDICV_OK; | |
| 647 | 140 | } | |
| 648 | |||
| 649 | } // namespace kleidicv::neon | ||
| 650 |