Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #include "kleidicv/ctypes.h" | ||
6 | #include "kleidicv/filters/median_blur.h" | ||
7 | #include "kleidicv/kleidicv.h" | ||
8 | #include "kleidicv/neon.h" | ||
9 | #include "median_blur_border_handling.h" | ||
10 | |||
11 | namespace kleidicv::neon { | ||
12 | |||
13 | template <bool is_single_channel> | ||
14 | class MedianBlurLargeHist { | ||
15 | public: | ||
16 | 105 | MedianBlurLargeHist(size_t channels, size_t kMargin) | |
17 | 210 | : patched_coarse{static_cast<uint16_t*>( | |
18 | 105 | malloc((16 + 256) * (patch_length + 2 * kMargin) * channels * | |
19 | sizeof(uint16_t)))}, | ||
20 | 210 | patched_fine{ | |
21 | 105 | &patched_coarse[16 * (patch_length + 2 * kMargin) * channels]}, | |
22 | 105 | H{}, | |
23 | 105 | luc{} {} | |
24 | |||
25 | 105 | ~MedianBlurLargeHist() { free(patched_coarse); } | |
26 | |||
27 | 105 | void process_pixels_without_horizontal_borders( | |
28 | Rectangle image_dimensions, Point starting_coordinates, | ||
29 | Point ending_coordinates, Rows<const uint8_t> src_rows, | ||
30 | Rows<uint8_t> dst_rows, size_t ksize, FixedBorderType border_type) { | ||
31 | 105 | const size_t kMargin = ksize / 2; | |
32 | |||
33 |
4/4✓ Branch 0 taken 57 times.
✓ Branch 1 taken 57 times.
✓ Branch 2 taken 48 times.
✓ Branch 3 taken 48 times.
|
210 | for (size_t w = starting_coordinates.x(); w < ending_coordinates.x(); |
34 | 105 | w += patch_length) { | |
35 | 210 | const size_t total_patch_span = | |
36 | 105 | std::min(ending_coordinates.x() - w, patch_length) + kMargin * 2; | |
37 | |||
38 | 210 | Rows<uint8_t> shifted_dst{ | |
39 | 105 | &dst_rows[0] + static_cast<ptrdiff_t>(w) * dst_rows.channels(), | |
40 | 105 | dst_rows.stride(), dst_rows.channels()}; | |
41 | |||
42 | 105 | clear_patched_histogram(total_patch_span * src_rows.channels()); | |
43 | |||
44 |
4/4✓ Branch 0 taken 57 times.
✓ Branch 1 taken 57 times.
✓ Branch 2 taken 144 times.
✓ Branch 3 taken 48 times.
|
306 | for (ptrdiff_t c = 0; c < static_cast<ptrdiff_t>(src_rows.channels()); |
45 | 201 | ++c) { | |
46 | 201 | clear_lookup_table(); | |
47 | |||
48 | // We initialize with ksize rows to allow merging of | ||
49 | // histogram increment and decrement operations in the main loop. | ||
50 | // This extra initial load enables a single update phase and avoids | ||
51 | // splitting the logic into separate steps. | ||
52 |
4/4✓ Branch 0 taken 1401 times.
✓ Branch 1 taken 57 times.
✓ Branch 2 taken 3744 times.
✓ Branch 3 taken 144 times.
|
5346 | for (size_t r = 0; r < ksize; ++r) { |
53 | 10290 | const ptrdiff_t valid_h = | |
54 | 10290 | get_physical_index(starting_coordinates.y() + r - kMargin, | |
55 | 5145 | image_dimensions.height(), border_type); | |
56 | 5145 | initialize_patched_histogram_without_horizontal_borders( | |
57 | 5145 | src_rows, c, valid_h, total_patch_span, w, kMargin); | |
58 | 5145 | } | |
59 | 402 | compute_patch_median_from_histogram(starting_coordinates.y(), c, | |
60 | 201 | total_patch_span, kMargin, ksize, | |
61 | 201 | shifted_dst); | |
62 | 201 | } | |
63 | |||
64 |
4/4✓ Branch 0 taken 2187 times.
✓ Branch 1 taken 57 times.
✓ Branch 2 taken 1992 times.
✓ Branch 3 taken 48 times.
|
4284 | for (size_t h = starting_coordinates.y() + 1; h < ending_coordinates.y(); |
65 | 4179 | ++h) { | |
66 | 8358 | const ptrdiff_t valid_old_h = get_physical_index( | |
67 | 4179 | h - kMargin - 1, image_dimensions.height(), border_type); | |
68 | |||
69 | 8358 | const ptrdiff_t valid_new_h = get_physical_index( | |
70 | 4179 | h + kMargin, image_dimensions.height(), border_type); | |
71 | |||
72 |
4/4✓ Branch 0 taken 2187 times.
✓ Branch 1 taken 2187 times.
✓ Branch 2 taken 5976 times.
✓ Branch 3 taken 1992 times.
|
12342 | for (ptrdiff_t c = 0; c < static_cast<ptrdiff_t>(src_rows.channels()); |
73 | 8163 | ++c) { | |
74 | 8163 | clear_lookup_table(); | |
75 | |||
76 | 8163 | update_patch_histogram_without_horizontal_borders( | |
77 | 8163 | src_rows, c, valid_old_h, valid_new_h, total_patch_span, w, | |
78 | 8163 | kMargin); | |
79 | |||
80 | 16326 | compute_patch_median_from_histogram(h, c, total_patch_span, kMargin, | |
81 | 8163 | ksize, shifted_dst); | |
82 | 8163 | } | |
83 | 4179 | } | |
84 | 105 | } | |
85 | 105 | } | |
86 | |||
87 | 210 | void process_pixels_with_horizontal_borders( | |
88 | Rectangle image_dimensions, Point starting_coordinates, | ||
89 | Point ending_coordinates, Rows<const uint8_t> src_rows, | ||
90 | Rows<uint8_t> dst_rows, size_t ksize, FixedBorderType border_type) { | ||
91 | 210 | const size_t kMargin = ksize / 2; | |
92 | |||
93 |
4/4✓ Branch 0 taken 114 times.
✓ Branch 1 taken 114 times.
✓ Branch 2 taken 96 times.
✓ Branch 3 taken 96 times.
|
420 | for (size_t w = starting_coordinates.x(); w < ending_coordinates.x(); |
94 | 210 | w += patch_length) { | |
95 | 420 | const size_t total_patch_span = | |
96 | 210 | std::min(ending_coordinates.x() - w, patch_length) + kMargin * 2; | |
97 | 420 | Rows<uint8_t> shifted_dst{ | |
98 | 210 | &dst_rows[0] + static_cast<ptrdiff_t>(w) * dst_rows.channels(), | |
99 | 210 | dst_rows.stride(), dst_rows.channels()}; | |
100 | |||
101 | 210 | clear_patched_histogram(total_patch_span * src_rows.channels()); | |
102 | |||
103 |
4/4✓ Branch 0 taken 114 times.
✓ Branch 1 taken 114 times.
✓ Branch 2 taken 288 times.
✓ Branch 3 taken 96 times.
|
612 | for (ptrdiff_t c = 0; c < static_cast<ptrdiff_t>(src_rows.channels()); |
104 | 402 | ++c) { | |
105 | 402 | clear_lookup_table(); | |
106 | // We initialize with ksize rows to allow merging of | ||
107 | // histogram increment and decrement operations in the main loop. | ||
108 | // This extra initial load enables a single update phase and avoids | ||
109 | // splitting the logic into separate steps. | ||
110 |
4/4✓ Branch 0 taken 2802 times.
✓ Branch 1 taken 114 times.
✓ Branch 2 taken 7488 times.
✓ Branch 3 taken 288 times.
|
10692 | for (size_t r = 0; r < ksize; ++r) { |
111 | 20580 | const ptrdiff_t valid_h = | |
112 | 20580 | get_physical_index(starting_coordinates.y() + r - kMargin, | |
113 | 10290 | image_dimensions.height(), border_type); | |
114 | 10290 | initialize_patched_histogram_with_horizontal_borders( | |
115 | 10290 | src_rows, c, valid_h, total_patch_span, w, kMargin, | |
116 | 10290 | image_dimensions.width(), border_type); | |
117 | 10290 | } | |
118 | 804 | compute_patch_median_from_histogram(starting_coordinates.y(), c, | |
119 | 402 | total_patch_span, kMargin, ksize, | |
120 | 402 | shifted_dst); | |
121 | 402 | } | |
122 | |||
123 |
4/4✓ Branch 0 taken 4374 times.
✓ Branch 1 taken 114 times.
✓ Branch 2 taken 3984 times.
✓ Branch 3 taken 96 times.
|
8568 | for (size_t h = starting_coordinates.y() + 1; h < ending_coordinates.y(); |
124 | 8358 | ++h) { | |
125 | 16716 | const ptrdiff_t valid_old_h = get_physical_index( | |
126 | 8358 | h - kMargin - 1, image_dimensions.height(), border_type); | |
127 | 16716 | const ptrdiff_t valid_new_h = get_physical_index( | |
128 | 8358 | h + kMargin, image_dimensions.height(), border_type); | |
129 |
4/4✓ Branch 0 taken 4374 times.
✓ Branch 1 taken 4374 times.
✓ Branch 2 taken 11952 times.
✓ Branch 3 taken 3984 times.
|
24684 | for (ptrdiff_t c = 0; c < static_cast<ptrdiff_t>(src_rows.channels()); |
130 | 16326 | ++c) { | |
131 | 16326 | clear_lookup_table(); | |
132 | |||
133 | 16326 | update_patch_histogram_with_horizontal_borders( | |
134 | 16326 | src_rows, c, valid_old_h, valid_new_h, total_patch_span, w, | |
135 | 16326 | kMargin, image_dimensions.width(), border_type); | |
136 | |||
137 | 32652 | compute_patch_median_from_histogram(h, c, total_patch_span, kMargin, | |
138 | 16326 | ksize, shifted_dst); | |
139 | 16326 | } | |
140 | 8358 | } | |
141 | 210 | } | |
142 | 210 | } | |
143 | |||
144 | private: | ||
145 | // Histogram Buffer Layout Explanation | ||
146 | // ----------------------------------- | ||
147 | // patched_coarse: | ||
148 | // - Conceptually a 3D array: | ||
149 | // coarse[channel_idx][patch_offset][coarse_bin] | ||
150 | // - coarse_bin = incoming_pixel >> 4 ∈ [0, 15] | ||
151 | // - Flattened as: | ||
152 | // coarse_offset = 16 * (patch_length * channel_idx + patch_offset) | ||
153 | // + (incoming_pixel >> 4); | ||
154 | // | ||
155 | // patched_fine: | ||
156 | // - Conceptually a 4D array: | ||
157 | // fine[channel_idx][coarse_bin][patch_offset][fine_bin] | ||
158 | // - coarse_bin = incoming_pixel >> 4 ∈ [0, 15] | ||
159 | // - fine_bin = incoming_pixel & 0xF ∈ [0, 15] | ||
160 | // - Flattened as: | ||
161 | // fine_offset = 16 * (patch_length * (16 * channel_idx + coarse_bin) | ||
162 | // + patch_offset) + fine_bin; | ||
163 | // | ||
164 | // This layout enables fast linear access while preserving the hierarchical | ||
165 | // structure of histograms per channel and patch position. | ||
166 | constexpr static size_t patch_length = 512; | ||
167 | uint16_t* patched_coarse; | ||
168 | uint16_t* patched_fine; | ||
169 | |||
170 | // Clear only the portion of the patched histogram buffers that will be used | ||
171 | // for the current patch. | ||
172 | // Since these buffers are large, there's no need to zero out the entire | ||
173 | // allocation— only the section relevant to the current total patch size is | ||
174 | // cleared for efficiency. | ||
175 | 315 | void clear_patched_histogram(size_t total_patch_size) { | |
176 | 630 | std::memset(patched_coarse, 0, | |
177 | 315 | 16 * total_patch_size * sizeof(patched_coarse[0])); | |
178 | 630 | std::memset(patched_fine, 0, | |
179 | 315 | 256 * total_patch_size * sizeof(patched_fine[0])); | |
180 | 315 | } | |
181 | |||
182 | 669240 | void scalar_initialize_patched_histogram(int incoming_pixel, | |
183 | size_t channel_idx, | ||
184 | size_t patch_length, | ||
185 | size_t patch_offset) { | ||
186 | 1338480 | const size_t coarse_offset = | |
187 | 1338480 | 16 * (patch_length * channel_idx + patch_offset) + | |
188 | 669240 | (incoming_pixel >> 4); | |
189 | 1338480 | const size_t fine_offset = | |
190 | 1338480 | 16 * (patch_length * (16 * channel_idx + (incoming_pixel >> 4)) + | |
191 | 1338480 | patch_offset) + | |
192 | 669240 | (incoming_pixel & 0xF); | |
193 | 669240 | patched_coarse[coarse_offset]++; | |
194 | 669240 | patched_fine[fine_offset]++; | |
195 | 669240 | } | |
196 | |||
197 | 4050 | void vector_initialize_patched_histogram(uint8x16_t& incoming_pixels, | |
198 | size_t channel_idx, | ||
199 | size_t patch_length, | ||
200 | size_t patch_offset) { | ||
201 | KLEIDICV_FORCE_LOOP_UNROLL | ||
202 |
2/2✓ Branch 0 taken 4050 times.
✓ Branch 1 taken 64800 times.
|
68850 | for (int i = 0; i < 16; i++) { |
203 | 129600 | const size_t coarse_offset_incoming = | |
204 | 129600 | 16 * (patch_length * channel_idx + patch_offset + i) + | |
205 | 64800 | (incoming_pixels[i] >> 4); | |
206 | |||
207 | 129600 | const size_t fine_offset_incoming = | |
208 | 194400 | 16 * (patch_length * (16 * channel_idx + (incoming_pixels[i] >> 4)) + | |
209 | 194400 | patch_offset + i) + | |
210 | 64800 | (incoming_pixels[i] & 0xF); | |
211 | |||
212 | 64800 | patched_coarse[coarse_offset_incoming]++; | |
213 | 64800 | patched_fine[fine_offset_incoming]++; | |
214 | 64800 | } | |
215 | 4050 | } | |
216 | |||
217 | 5145 | void initialize_patched_histogram_without_horizontal_borders( | |
218 | Rows<const uint8_t> src_rows, ptrdiff_t c, ptrdiff_t valid_h, | ||
219 | size_t total_patch_span, size_t starting_width, size_t kMargin) { | ||
220 | 5145 | size_t vector_part = 0; | |
221 | if constexpr (is_single_channel) { | ||
222 | 1401 | vector_part = (total_patch_span >> 4) << 4; | |
223 |
2/2✓ Branch 0 taken 4050 times.
✓ Branch 1 taken 1401 times.
|
5451 | for (size_t patch_offset = 0; patch_offset < vector_part; |
224 | 4050 | patch_offset += 16) { | |
225 | 8100 | const ptrdiff_t valid_w = | |
226 | 4050 | static_cast<ptrdiff_t>(starting_width + patch_offset - kMargin); | |
227 | 4050 | auto incoming_pixels = vld1q_u8(&src_rows.at(valid_h, valid_w)[c]); | |
228 | 8100 | vector_initialize_patched_histogram(incoming_pixels, c, | |
229 | 4050 | total_patch_span, patch_offset); | |
230 | 4050 | } | |
231 | } | ||
232 | |||
233 |
4/4✓ Branch 0 taken 1401 times.
✓ Branch 1 taken 16200 times.
✓ Branch 2 taken 3744 times.
✓ Branch 3 taken 224640 times.
|
245985 | for (size_t patch_offset = vector_part; patch_offset < total_patch_span; |
234 | 240840 | ++patch_offset) { | |
235 | 481680 | const ptrdiff_t valid_w = | |
236 | 240840 | static_cast<ptrdiff_t>(starting_width + patch_offset - kMargin); | |
237 | 240840 | auto incoming_pixel = src_rows.at(valid_h, valid_w)[c]; | |
238 | |||
239 | 481680 | scalar_initialize_patched_histogram(incoming_pixel, c, total_patch_span, | |
240 | 240840 | patch_offset); | |
241 | 240840 | } | |
242 | 5145 | } | |
243 | |||
244 | 10290 | void initialize_patched_histogram_with_horizontal_borders( | |
245 | Rows<const uint8_t> src_rows, ptrdiff_t c, ptrdiff_t valid_h, | ||
246 | size_t total_patch_span, size_t starting_width, size_t kMargin, | ||
247 | size_t width, FixedBorderType border_type) { | ||
248 |
4/4✓ Branch 0 taken 2802 times.
✓ Branch 1 taken 112608 times.
✓ Branch 2 taken 7488 times.
✓ Branch 3 taken 315792 times.
|
438690 | for (size_t patch_offset = 0; patch_offset < total_patch_span; |
249 | 428400 | ++patch_offset) { | |
250 | 856800 | ptrdiff_t valid_w = get_physical_index( | |
251 | 428400 | starting_width + patch_offset - kMargin, width, border_type); | |
252 | |||
253 | 428400 | auto incoming_pixel = src_rows.at(valid_h, valid_w)[c]; | |
254 | |||
255 | 856800 | scalar_initialize_patched_histogram(incoming_pixel, c, total_patch_span, | |
256 | 428400 | patch_offset); | |
257 | 428400 | } | |
258 | 10290 | } | |
259 | |||
260 | // During vertical traversal (the main 'height' loop), each sliding window | ||
261 | // iteration introduces a new incoming row and removes an outgoing one. The | ||
262 | // histogram must be updated accordingly by subtracting the contributions of | ||
263 | // the outgoing row and adding those of the incoming row. | ||
264 | // Both increment and decrement operations are handled inside the same | ||
265 | // function. | ||
266 | 1014312 | void scalar_update_patch_histogram(int outgoing_pixel, int incoming_pixel, | |
267 | size_t channel_idx, size_t patch_length, | ||
268 | size_t patch_offset) { | ||
269 | 2028624 | const size_t coarse_offset_base = | |
270 | 1014312 | 16 * (patch_length * channel_idx + patch_offset); | |
271 | 2028624 | const size_t fine_offset_base = | |
272 | 1014312 | 16 * patch_length * 16 * channel_idx + 16 * patch_offset; | |
273 | |||
274 | 1014312 | const size_t pixel_new_shift_right_4 = (incoming_pixel >> 4); | |
275 | 1014312 | const size_t mask_new_pixel = (incoming_pixel & 0xF); | |
276 | 1014312 | const size_t pixel_old_shift_right_4 = (outgoing_pixel >> 4); | |
277 | 1014312 | const size_t mask_old_pixel = (outgoing_pixel & 0xF); | |
278 | |||
279 | 2028624 | const size_t fine_new_offset = fine_offset_base + mask_new_pixel + | |
280 | 1014312 | 16 * patch_length * pixel_new_shift_right_4; | |
281 | 2028624 | const size_t coarse_new_offset = | |
282 | 1014312 | coarse_offset_base + pixel_new_shift_right_4; | |
283 | |||
284 | 2028624 | const size_t fine_old_offset = fine_offset_base + mask_old_pixel + | |
285 | 1014312 | 16 * patch_length * pixel_old_shift_right_4; | |
286 | 2028624 | const size_t coarse_old_offset = | |
287 | 1014312 | coarse_offset_base + pixel_old_shift_right_4; | |
288 | |||
289 | 1014312 | patched_coarse[coarse_new_offset]++; | |
290 | 1014312 | patched_coarse[coarse_old_offset]--; | |
291 | 1014312 | patched_fine[fine_new_offset]++; | |
292 | 1014312 | patched_fine[fine_old_offset]--; | |
293 | 1014312 | } | |
294 | |||
295 | 6366 | void vector_update_patch_histogram(uint8x16_t& outgoing_pixels, | |
296 | uint8x16_t& incoming_pixels, | ||
297 | size_t channel_idx, size_t patch_length, | ||
298 | size_t patch_offset) { | ||
299 | KLEIDICV_FORCE_LOOP_UNROLL | ||
300 |
2/2✓ Branch 0 taken 6366 times.
✓ Branch 1 taken 101856 times.
|
108222 | for (int i = 0; i < 16; i++) { |
301 | 203712 | const size_t coarse_offset_incoming = | |
302 | 203712 | 16 * (patch_length * channel_idx + patch_offset + i) + | |
303 | 101856 | (incoming_pixels[i] >> 4); | |
304 | |||
305 | 203712 | const size_t coarse_offset_outgoing = | |
306 | 203712 | 16 * (patch_length * channel_idx + patch_offset + i) + | |
307 | 101856 | (outgoing_pixels[i] >> 4); | |
308 | |||
309 | 203712 | const size_t fine_offset_incoming = | |
310 | 305568 | 16 * (patch_length * (16 * channel_idx + (incoming_pixels[i] >> 4)) + | |
311 | 305568 | patch_offset + i) + | |
312 | 101856 | (incoming_pixels[i] & 0xF); | |
313 | |||
314 | 203712 | const size_t fine_offset_outgoing = | |
315 | 305568 | 16 * (patch_length * (16 * channel_idx + (outgoing_pixels[i] >> 4)) + | |
316 | 305568 | patch_offset + i) + | |
317 | 101856 | (outgoing_pixels[i] & 0xF); | |
318 | |||
319 | 101856 | patched_coarse[coarse_offset_incoming]++; | |
320 | 101856 | patched_coarse[coarse_offset_outgoing]--; | |
321 | 101856 | patched_fine[fine_offset_incoming]++; | |
322 | 101856 | patched_fine[fine_offset_outgoing]--; | |
323 | 101856 | } | |
324 | 6366 | } | |
325 | |||
326 | 8163 | void update_patch_histogram_without_horizontal_borders( | |
327 | Rows<const uint8_t> src_rows, ptrdiff_t c, ptrdiff_t valid_old_h, | ||
328 | ptrdiff_t valid_new_h, size_t total_patch_span, size_t starting_width, | ||
329 | size_t kMargin) { | ||
330 | 8163 | size_t vector_part = 0; | |
331 | if constexpr (is_single_channel) { | ||
332 | 2187 | vector_part = (total_patch_span >> 4) << 4; | |
333 |
2/2✓ Branch 0 taken 6366 times.
✓ Branch 1 taken 2187 times.
|
8553 | for (size_t patch_offset = 0; patch_offset < vector_part; |
334 | 6366 | patch_offset += 16) { | |
335 | 12732 | const ptrdiff_t valid_w = | |
336 | 6366 | static_cast<ptrdiff_t>(starting_width + patch_offset - kMargin); | |
337 | 6366 | auto outgoing_pixels = vld1q_u8(&src_rows.at(valid_old_h, valid_w)[c]); | |
338 | 6366 | auto incoming_pixels = vld1q_u8(&src_rows.at(valid_new_h, valid_w)[c]); | |
339 | 12732 | vector_update_patch_histogram(outgoing_pixels, incoming_pixels, c, | |
340 | 6366 | total_patch_span, patch_offset); | |
341 | 6366 | } | |
342 | } | ||
343 | |||
344 |
4/4✓ Branch 0 taken 2187 times.
✓ Branch 1 taken 25464 times.
✓ Branch 2 taken 5976 times.
✓ Branch 3 taken 358560 times.
|
392187 | for (size_t patch_offset = vector_part; patch_offset < total_patch_span; |
345 | 384024 | ++patch_offset) { | |
346 | 768048 | const ptrdiff_t valid_w = | |
347 | 384024 | static_cast<ptrdiff_t>(starting_width + patch_offset - kMargin); | |
348 | 384024 | auto outgoing_pixel = src_rows.at(valid_old_h, valid_w)[c]; | |
349 | 384024 | auto incoming_pixel = src_rows.at(valid_new_h, valid_w)[c]; | |
350 | 768048 | scalar_update_patch_histogram(outgoing_pixel, incoming_pixel, c, | |
351 | 384024 | total_patch_span, patch_offset); | |
352 | 384024 | } | |
353 | 8163 | } | |
354 | |||
355 | 16326 | void update_patch_histogram_with_horizontal_borders( | |
356 | Rows<const uint8_t> src_rows, ptrdiff_t c, ptrdiff_t valid_old_h, | ||
357 | ptrdiff_t valid_new_h, size_t total_patch_span, size_t starting_width, | ||
358 | size_t kMargin, size_t width, FixedBorderType border_type) { | ||
359 |
4/4✓ Branch 0 taken 4374 times.
✓ Branch 1 taken 164592 times.
✓ Branch 2 taken 11952 times.
✓ Branch 3 taken 465696 times.
|
646614 | for (size_t patch_offset = 0; patch_offset < total_patch_span; |
360 | 630288 | ++patch_offset) { | |
361 | 1260576 | const ptrdiff_t valid_w = get_physical_index( | |
362 | 630288 | starting_width + patch_offset - kMargin, width, border_type); | |
363 | 630288 | auto outgoing_pixel = src_rows.at(valid_old_h, valid_w)[c]; | |
364 | 630288 | auto incoming_pixel = src_rows.at(valid_new_h, valid_w)[c]; | |
365 | 1260576 | scalar_update_patch_histogram(outgoing_pixel, incoming_pixel, c, | |
366 | 630288 | total_patch_span, patch_offset); | |
367 | 630288 | } | |
368 | 16326 | } | |
369 | |||
370 | // `H` and `luc` are used for computing the median value for a single | ||
371 | // output pixel: | ||
372 | // - `H` is a histogram structure holding both the coarse and fine | ||
373 | // bins needed | ||
374 | // for the current element's median calculation. | ||
375 | // - `luc` is a lookup table that stores the last processed offset | ||
376 | // (index) | ||
377 | // for each coarse bin. This allows incremental fine histogram | ||
378 | // updates instead of full recalculation when histogram overlap is | ||
379 | // high. Since neighboring patches in natural images often have | ||
380 | // similar pixel values, reusing previous histogram state can | ||
381 | // significantly reduce processing time. | ||
382 | typedef struct { | ||
383 | uint16_t coarse[16]; | ||
384 | uint16_t fine[16][16]; | ||
385 | } Histogram; | ||
386 | |||
387 | Histogram H; | ||
388 | uint16_t luc[16]; | ||
389 | |||
390 | 25092 | void clear_lookup_table(void) { | |
391 | 25092 | std::memset(&H, 0, sizeof(Histogram)); | |
392 | 25092 | std::memset(luc, 0, 16 * sizeof(uint16_t)); | |
393 | 25092 | } | |
394 | |||
395 | 25092 | void initialize_coarse(uint16_t* px, uint16x8_t& v_coarsel, | |
396 | uint16x8_t& v_coarseh, size_t kMargin) { | ||
397 |
4/4✓ Branch 0 taken 6732 times.
✓ Branch 1 taken 168624 times.
✓ Branch 2 taken 18360 times.
✓ Branch 3 taken 476496 times.
|
670212 | for (size_t i = 0; i < 2 * kMargin; ++i, px += 16) { |
398 | 645120 | v_coarsel = vaddq_u16(v_coarsel, vld1q_u16(px)); | |
399 | 645120 | v_coarseh = vaddq_u16(v_coarseh, vld1q_u16(px + 8)); | |
400 | 645120 | } | |
401 | 25092 | } | |
402 | |||
403 | 497760 | void increment_coarse(uint16_t* px, uint16x8_t& v_coarsel, | |
404 | uint16x8_t& v_coarseh) { | ||
405 | 497760 | v_coarsel = vaddq_u16(v_coarsel, vld1q_u16(px)); | |
406 | 497760 | v_coarseh = vaddq_u16(v_coarseh, vld1q_u16(px + 8)); | |
407 | 497760 | vst1q_u16(H.coarse, v_coarsel); | |
408 | 497760 | vst1q_u16(H.coarse + 8, v_coarseh); | |
409 | 497760 | } | |
410 | |||
411 | 497760 | void decrement_coarse(uint16_t* px, uint16x8_t& v_coarsel, | |
412 | uint16x8_t& v_coarseh) { | ||
413 | 497760 | v_coarsel = vsubq_u16(v_coarsel, vld1q_u16(px)); | |
414 | 497760 | v_coarseh = vsubq_u16(v_coarseh, vld1q_u16(px + 8)); | |
415 | 497760 | } | |
416 | |||
417 | 46404 | void initialize_fine(uint16_t* px, uint16x8_t& v_finel, uint16x8_t& v_fineh, | |
418 | uint16_t& luc, size_t patch_offset, size_t kMargin, | ||
419 | size_t total_patch_span) { | ||
420 |
4/4✓ Branch 0 taken 316942 times.
✓ Branch 1 taken 12674 times.
✓ Branch 2 taken 868610 times.
✓ Branch 3 taken 33730 times.
|
1231956 | for (luc = static_cast<uint16_t>(patch_offset - kMargin); |
421 | 2463912 | luc < static_cast<uint16_t>( | |
422 | 1231956 | std::min(patch_offset + kMargin + 1, total_patch_span)); | |
423 | 1185552 | ++luc, px += 16) { | |
424 | 1185552 | v_finel = vaddq_u16(v_finel, vld1q_u16(px)); | |
425 | 1185552 | v_fineh = vaddq_u16(v_fineh, vld1q_u16(px + 8)); | |
426 | 1185552 | } | |
427 | 46404 | } | |
428 | |||
429 | 451356 | void update_fine(uint16_t* px, uint16x8_t& v_finel, uint16x8_t& v_fineh, | |
430 | uint16_t& luc, size_t patch_offset, size_t kMargin, | ||
431 | size_t total_patch_span) { | ||
432 |
4/4✓ Branch 0 taken 139342 times.
✓ Branch 1 taken 117886 times.
✓ Branch 2 taken 398789 times.
✓ Branch 3 taken 333470 times.
|
989487 | for (; luc < static_cast<uint16_t>(patch_offset + kMargin + 1); ++luc) { |
433 | 538131 | constexpr ptrdiff_t stride = 16; | |
434 | 1076262 | const ptrdiff_t patch_span_limit = | |
435 | 538131 | static_cast<ptrdiff_t>(total_patch_span - 1); | |
436 | 538131 | const ptrdiff_t safe_luc = static_cast<ptrdiff_t>(luc); | |
437 | |||
438 | 1076262 | const ptrdiff_t base_offset = | |
439 | 538131 | stride * std::min(safe_luc, patch_span_limit); | |
440 | 1076262 | const ptrdiff_t old_offset = | |
441 | 1076262 | stride * std::max(safe_luc - static_cast<ptrdiff_t>(2 * kMargin + 1), | |
442 | 538131 | ptrdiff_t{0}); | |
443 | 538131 | const uint16x8_t new_vecl = vld1q_u16(px + base_offset); | |
444 | 538131 | const uint16x8_t new_vech = vld1q_u16(px + base_offset + 8); | |
445 | 538131 | const uint16x8_t old_vecl = vld1q_u16(px + old_offset); | |
446 | 538131 | const uint16x8_t old_vech = vld1q_u16(px + old_offset + 8); | |
447 | 538131 | v_finel = vsubq_u16(vaddq_u16(v_finel, new_vecl), old_vecl); | |
448 | 538131 | v_fineh = vsubq_u16(vaddq_u16(v_fineh, new_vech), old_vech); | |
449 | 538131 | } | |
450 | 451356 | } | |
451 | |||
452 | 497760 | size_t find_coarse_index(size_t& cdf, size_t median_index) { | |
453 | 497760 | size_t coarse_index = 0; | |
454 | 4205933 | while (true) { | |
455 | 4205933 | cdf += H.coarse[coarse_index]; | |
456 |
4/4✓ Branch 0 taken 130560 times.
✓ Branch 1 taken 974526 times.
✓ Branch 2 taken 367200 times.
✓ Branch 3 taken 2733647 times.
|
4205933 | if (cdf > median_index) { |
457 | 497760 | cdf -= H.coarse[coarse_index]; | |
458 | 497760 | break; | |
459 | } | ||
460 | 3708173 | coarse_index++; | |
461 | } | ||
462 | 995520 | return coarse_index; | |
463 | 497760 | } | |
464 | |||
465 | 497760 | uint8_t find_median(size_t& cdf, size_t median_index, size_t coarse_index) { | |
466 | 497760 | uint16_t* segment = H.fine[coarse_index]; | |
467 | 497760 | size_t fine_index = 0; | |
468 | 4340272 | while (true) { | |
469 | 4340272 | cdf += segment[fine_index]; | |
470 |
4/4✓ Branch 0 taken 130560 times.
✓ Branch 1 taken 979159 times.
✓ Branch 2 taken 367200 times.
✓ Branch 3 taken 2863353 times.
|
4340272 | if (cdf > median_index) { |
471 | 497760 | fine_index = (16 * coarse_index + fine_index); | |
472 | 497760 | break; | |
473 | } | ||
474 | 3842512 | fine_index++; | |
475 | } | ||
476 | 995520 | return uint8_t(fine_index); | |
477 | 497760 | } | |
478 | |||
479 | 25092 | void compute_patch_median_from_histogram(size_t h, ptrdiff_t c, | |
480 | size_t total_patch_span, | ||
481 | size_t kMargin, size_t ksize, | ||
482 | Rows<uint8_t> dst) { | ||
483 | 25092 | uint16x8_t v_coarsel = vld1q_u16(H.coarse); | |
484 | 25092 | uint16x8_t v_coarseh = vld1q_u16(H.coarse + 8); | |
485 | |||
486 | // Before starting the main patch loop to compute medians for each element, | ||
487 | // we initialize the coarse histogram buffer with the first (ksize - 1). | ||
488 | // This allows each subsequent iteration in the patch loop to perform only | ||
489 | // one addition and one subtraction. | ||
490 | 25092 | uint16_t* px = patched_coarse + 16 * total_patch_span * c; | |
491 | 25092 | initialize_coarse(px, v_coarsel, v_coarseh, kMargin); | |
492 | |||
493 |
4/4✓ Branch 0 taken 6732 times.
✓ Branch 1 taken 130560 times.
✓ Branch 2 taken 18360 times.
✓ Branch 3 taken 367200 times.
|
522852 | for (size_t patch_offset = kMargin; |
494 | 522852 | patch_offset < total_patch_span - kMargin; patch_offset++) { | |
495 | 497760 | size_t median_index = (ksize * ksize) / 2, cdf = 0; | |
496 | |||
497 | 995520 | px = patched_coarse + | |
498 | 995520 | 16 * (total_patch_span * c + | |
499 | 497760 | std::min(patch_offset + kMargin, total_patch_span - 1)); | |
500 | 497760 | increment_coarse(px, v_coarsel, v_coarseh); | |
501 | |||
502 | // Find median at coarse level | ||
503 | 497760 | size_t coarse_index = find_coarse_index(cdf, median_index); | |
504 | |||
505 | 497760 | uint16x8_t v_finel; | |
506 | 497760 | uint16x8_t v_fineh; | |
507 | // Check whether the fine histogram (H.fine[coarse_index]) for the | ||
508 | // current patch position needs to be freshly initialized or can be | ||
509 | // incrementally updated. This decision hinges on the `luc` (Last Used | ||
510 | // Coordinate) table, which records the last horizontal patch offset | ||
511 | // processed for each coarse bin. | ||
512 | // | ||
513 | // The condition is true in two scenarios: | ||
514 | // | ||
515 | // 1. **First-Time Initialization**: | ||
516 | // - This is the first time we are accessing this coarse bin | ||
517 | // (`coarse_index`) at the current patch position. | ||
518 | // - We compute the full fine histogram from scratch by summing | ||
519 | // over `ksize` rows centered at the patch position. | ||
520 | // - We accumulate the results into the `v_finel` and `v_fineh` | ||
521 | // vector registers. | ||
522 | // - These vectors are then stored into `H.fine[coarse_index]`. | ||
523 | // - The `luc` table is updated to reflect the last index | ||
524 | // processed. | ||
525 | // | ||
526 | // 2. **Window Movement Causes Loss of Overlap**: | ||
527 | // - The sliding window has moved enough that it no longer | ||
528 | // sufficiently overlaps the region | ||
529 | // used to compute the previously cached fine histogram (i.e., | ||
530 | // `luc[coarse_index]` is too far behind). | ||
531 | // - We must reinitialize the fine histogram to ensure accuracy. | ||
532 | // | ||
533 | // Otherwise: | ||
534 | // | ||
535 | // - We reuse the previously computed fine histogram stored in | ||
536 | // `H.fine[coarse_index]`. | ||
537 | // - We only update it incrementally using the `update_fine()` | ||
538 | // function, which: | ||
539 | // - Adds the new values entering the window. | ||
540 | // - Subtracts the values leaving the window. | ||
541 | // - This avoids the need for a full re-scan, leveraging temporal | ||
542 | // locality between neighboring pixels. | ||
543 | // | ||
544 | // This lookup-based optimization significantly improves performance, | ||
545 | // as neighboring filter windows often overlap heavily—especially for | ||
546 | // small strides and moderate kernel sizes. | ||
547 | // | ||
548 | // After this step, the fine histogram is ready. The next phase scans | ||
549 | // `H.fine[coarse_index]` to identify the fine bin where the | ||
550 | // cumulative sum crosses the median threshold. This gives us the | ||
551 | // final median value for the output pixel. | ||
552 |
4/4✓ Branch 0 taken 12674 times.
✓ Branch 1 taken 117886 times.
✓ Branch 2 taken 33730 times.
✓ Branch 3 taken 333470 times.
|
497760 | if (luc[coarse_index] <= patch_offset - kMargin) { |
553 | 46404 | v_finel = vdupq_n_u16(0); | |
554 | 46404 | v_fineh = vdupq_n_u16(0); | |
555 | 92808 | px = patched_fine + static_cast<ptrdiff_t>(16) * | |
556 | 92808 | (static_cast<ptrdiff_t>(total_patch_span) * | |
557 | 92808 | (16 * c + coarse_index) + | |
558 | 92808 | patch_offset - kMargin); | |
559 | |||
560 | 92808 | initialize_fine(px, v_finel, v_fineh, luc[coarse_index], patch_offset, | |
561 | 46404 | kMargin, total_patch_span); | |
562 | |||
563 | 46404 | } else { | |
564 | 451356 | v_finel = vld1q_u16(H.fine[coarse_index]); | |
565 | 451356 | v_fineh = vld1q_u16(H.fine[coarse_index] + 8); | |
566 | 451356 | px = patched_fine + 16 * total_patch_span * (16 * c + coarse_index); | |
567 | 902712 | update_fine(px, v_finel, v_fineh, luc[coarse_index], patch_offset, | |
568 | 451356 | kMargin, total_patch_span); | |
569 | } | ||
570 | |||
571 | 995520 | px = patched_coarse + | |
572 | 497760 | static_cast<ptrdiff_t>(16) * | |
573 | 995520 | (total_patch_span * c + | |
574 | 497760 | std::max(patch_offset - kMargin, static_cast<size_t>(0))); | |
575 | |||
576 | 497760 | vst1q_u16(H.fine[coarse_index], v_finel); | |
577 | 497760 | vst1q_u16(H.fine[coarse_index] + 8, v_fineh); | |
578 | |||
579 | 497760 | decrement_coarse(px, v_coarsel, v_coarseh); | |
580 | |||
581 | // Find median at fine level | ||
582 | 1991040 | dst.at(static_cast<ptrdiff_t>(h), | |
583 | 1493280 | static_cast<ptrdiff_t>(patch_offset - kMargin))[c] = | |
584 | 497760 | find_median(cdf, median_index, coarse_index); | |
585 | 497760 | } | |
586 | 25092 | } | |
587 | }; | ||
588 | |||
589 | template <bool is_single_channel> | ||
590 | 105 | void median_process(Rectangle image_dimensions, Rows<const uint8_t> src_rows, | |
591 | Rows<uint8_t> dst_rows, size_t y_begin, size_t y_end, | ||
592 | size_t kernel_width, size_t kernel_height, | ||
593 | FixedBorderType border_type) { | ||
594 | 105 | size_t kMargin = (kernel_width - 1) / 2; | |
595 | 210 | MedianBlurLargeHist<is_single_channel> median_filter{src_rows.channels(), | |
596 | 105 | kMargin}; | |
597 | |||
598 | // Process left border | ||
599 | 105 | size_t starting_width = 0; | |
600 | 105 | size_t processing_left_width = kMargin; | |
601 | 105 | Point starting_left_coordinates{starting_width, y_begin}; | |
602 | 105 | Point ending_left_coordinates{starting_width + processing_left_width, y_end}; | |
603 | 105 | median_filter.process_pixels_with_horizontal_borders( | |
604 | 105 | image_dimensions, starting_left_coordinates, ending_left_coordinates, | |
605 | 105 | src_rows, dst_rows, kernel_height, border_type); | |
606 | |||
607 | // Process center region | ||
608 | 105 | starting_width = processing_left_width; | |
609 | 105 | size_t processing_center_width = image_dimensions.width() - 2 * kMargin; | |
610 | 105 | Point starting_center_coordinates{starting_width, y_begin}; | |
611 | 210 | Point ending_center_coordinates{starting_width + processing_center_width, | |
612 | 105 | y_end}; | |
613 | 105 | median_filter.process_pixels_without_horizontal_borders( | |
614 | 105 | image_dimensions, starting_center_coordinates, ending_center_coordinates, | |
615 | 105 | src_rows, dst_rows, kernel_height, border_type); | |
616 | |||
617 | // Process right border | ||
618 | 105 | starting_width = processing_left_width + processing_center_width; | |
619 | 315 | size_t processing_right_width = image_dimensions.width() - | |
620 | 210 | processing_left_width - | |
621 | 105 | processing_center_width; | |
622 | 105 | Point starting_right_coordinates{starting_width, y_begin}; | |
623 | 210 | Point ending_right_coordinates{starting_width + processing_right_width, | |
624 | 105 | y_end}; | |
625 | 105 | median_filter.process_pixels_with_horizontal_borders( | |
626 | 105 | image_dimensions, starting_right_coordinates, ending_right_coordinates, | |
627 | 105 | src_rows, dst_rows, kernel_height, border_type); | |
628 | 105 | } | |
629 | |||
630 | 105 | KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t median_blur_large_hist_stripe_u8( | |
631 | const uint8_t* src, size_t src_stride, uint8_t* dst, size_t dst_stride, | ||
632 | size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, | ||
633 | size_t kernel_width, size_t kernel_height, FixedBorderType border_type) { | ||
634 | 105 | Rectangle image_dimensions{width, height}; | |
635 | 105 | Rows<const uint8_t> src_rows{src, src_stride, channels}; | |
636 | 105 | Rows<uint8_t> dst_rows{dst, dst_stride, channels}; | |
637 | |||
638 |
2/2✓ Branch 0 taken 57 times.
✓ Branch 1 taken 48 times.
|
105 | if (channels == 1) { |
639 | 114 | median_process<true>(image_dimensions, src_rows, dst_rows, y_begin, y_end, | |
640 | 57 | kernel_width, kernel_height, border_type); | |
641 | 57 | } else { | |
642 | 96 | median_process<false>(image_dimensions, src_rows, dst_rows, y_begin, y_end, | |
643 | 48 | kernel_width, kernel_height, border_type); | |
644 | } | ||
645 | |||
646 | 105 | return KLEIDICV_OK; | |
647 | 105 | } | |
648 | |||
649 | } // namespace kleidicv::neon | ||
650 |