KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/median_blur_large_hist_neon.cpp
Date: 2026-01-20 20:58:59
Exec Total Coverage
Lines: 383 383 100.0%
Functions: 45 45 100.0%
Branches: 94 94 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include "kleidicv/ctypes.h"
6 #include "kleidicv/filters/median_blur.h"
7 #include "kleidicv/kleidicv.h"
8 #include "kleidicv/neon.h"
9 #include "median_blur_border_handling.h"
10
11 namespace kleidicv::neon {
12
13 template <bool is_single_channel>
14 class MedianBlurLargeHist {
15 public:
16 152 MedianBlurLargeHist(size_t channels, size_t kMargin)
17 304 : patched_coarse{static_cast<uint16_t*>(
18 152 malloc((16 + 256) * (patch_length + 2 * kMargin) * channels *
19 sizeof(uint16_t)))},
20 304 patched_fine{
21 152 &patched_coarse[16 * (patch_length + 2 * kMargin) * channels]},
22 152 H{},
23 152 luc{} {}
24
25 152 ~MedianBlurLargeHist() { free(patched_coarse); }
26
27 152 void process_pixels_without_horizontal_borders(
28 Rectangle image_dimensions, Point starting_coordinates,
29 Point ending_coordinates, Rows<const uint8_t> src_rows,
30 Rows<uint8_t> dst_rows, size_t ksize, FixedBorderType border_type) {
31 152 const size_t kMargin = ksize / 2;
32
33
4/4
✓ Branch 0 taken 88 times.
✓ Branch 1 taken 88 times.
✓ Branch 2 taken 64 times.
✓ Branch 3 taken 64 times.
304 for (size_t w = starting_coordinates.x(); w < ending_coordinates.x();
34 152 w += patch_length) {
35 304 const size_t total_patch_span =
36 152 std::min(ending_coordinates.x() - w, patch_length) + kMargin * 2;
37
38 304 Rows<uint8_t> shifted_dst{
39 152 &dst_rows[0] + static_cast<ptrdiff_t>(w) * dst_rows.channels(),
40 152 dst_rows.stride(), dst_rows.channels()};
41
42 152 clear_patched_histogram(total_patch_span * src_rows.channels());
43
44
4/4
✓ Branch 0 taken 88 times.
✓ Branch 1 taken 88 times.
✓ Branch 2 taken 192 times.
✓ Branch 3 taken 64 times.
432 for (ptrdiff_t c = 0; c < static_cast<ptrdiff_t>(src_rows.channels());
45 280 ++c) {
46 280 clear_lookup_table();
47
48 // We initialize with ksize rows to allow merging of
49 // histogram increment and decrement operations in the main loop.
50 // This extra initial load enables a single update phase and avoids
51 // splitting the logic into separate steps.
52
4/4
✓ Branch 0 taken 2072 times.
✓ Branch 1 taken 88 times.
✓ Branch 2 taken 4992 times.
✓ Branch 3 taken 192 times.
7344 for (size_t r = 0; r < ksize; ++r) {
53 14128 const ptrdiff_t valid_h =
54 14128 get_physical_index(starting_coordinates.y() + r - kMargin,
55 7064 image_dimensions.height(), border_type);
56 7064 initialize_patched_histogram_without_horizontal_borders(
57 7064 src_rows, c, valid_h, total_patch_span, w, kMargin);
58 7064 }
59 560 compute_patch_median_from_histogram(starting_coordinates.y(), c,
60 280 total_patch_span, kMargin, ksize,
61 280 shifted_dst);
62 280 }
63
64
4/4
✓ Branch 0 taken 2904 times.
✓ Branch 1 taken 88 times.
✓ Branch 2 taken 2656 times.
✓ Branch 3 taken 64 times.
5712 for (size_t h = starting_coordinates.y() + 1; h < ending_coordinates.y();
65 5560 ++h) {
66 11120 const ptrdiff_t valid_old_h = get_physical_index(
67 5560 h - kMargin - 1, image_dimensions.height(), border_type);
68
69 11120 const ptrdiff_t valid_new_h = get_physical_index(
70 5560 h + kMargin, image_dimensions.height(), border_type);
71
72
4/4
✓ Branch 0 taken 2904 times.
✓ Branch 1 taken 2904 times.
✓ Branch 2 taken 7968 times.
✓ Branch 3 taken 2656 times.
16432 for (ptrdiff_t c = 0; c < static_cast<ptrdiff_t>(src_rows.channels());
73 10872 ++c) {
74 10872 clear_lookup_table();
75
76 10872 update_patch_histogram_without_horizontal_borders(
77 10872 src_rows, c, valid_old_h, valid_new_h, total_patch_span, w,
78 10872 kMargin);
79
80 21744 compute_patch_median_from_histogram(h, c, total_patch_span, kMargin,
81 10872 ksize, shifted_dst);
82 10872 }
83 5560 }
84 152 }
85 152 }
86
87 304 void process_pixels_with_horizontal_borders(
88 Rectangle image_dimensions, Point starting_coordinates,
89 Point ending_coordinates, Rows<const uint8_t> src_rows,
90 Rows<uint8_t> dst_rows, size_t ksize, FixedBorderType border_type) {
91 304 const size_t kMargin = ksize / 2;
92
93
4/4
✓ Branch 0 taken 176 times.
✓ Branch 1 taken 176 times.
✓ Branch 2 taken 128 times.
✓ Branch 3 taken 128 times.
608 for (size_t w = starting_coordinates.x(); w < ending_coordinates.x();
94 304 w += patch_length) {
95 608 const size_t total_patch_span =
96 304 std::min(ending_coordinates.x() - w, patch_length) + kMargin * 2;
97 608 Rows<uint8_t> shifted_dst{
98 304 &dst_rows[0] + static_cast<ptrdiff_t>(w) * dst_rows.channels(),
99 304 dst_rows.stride(), dst_rows.channels()};
100
101 304 clear_patched_histogram(total_patch_span * src_rows.channels());
102
103
4/4
✓ Branch 0 taken 176 times.
✓ Branch 1 taken 176 times.
✓ Branch 2 taken 384 times.
✓ Branch 3 taken 128 times.
864 for (ptrdiff_t c = 0; c < static_cast<ptrdiff_t>(src_rows.channels());
104 560 ++c) {
105 560 clear_lookup_table();
106 // We initialize with ksize rows to allow merging of
107 // histogram increment and decrement operations in the main loop.
108 // This extra initial load enables a single update phase and avoids
109 // splitting the logic into separate steps.
110
4/4
✓ Branch 0 taken 4144 times.
✓ Branch 1 taken 176 times.
✓ Branch 2 taken 9984 times.
✓ Branch 3 taken 384 times.
14688 for (size_t r = 0; r < ksize; ++r) {
111 28256 const ptrdiff_t valid_h =
112 28256 get_physical_index(starting_coordinates.y() + r - kMargin,
113 14128 image_dimensions.height(), border_type);
114 14128 initialize_patched_histogram_with_horizontal_borders(
115 14128 src_rows, c, valid_h, total_patch_span, w, kMargin,
116 14128 image_dimensions.width(), border_type);
117 14128 }
118 1120 compute_patch_median_from_histogram(starting_coordinates.y(), c,
119 560 total_patch_span, kMargin, ksize,
120 560 shifted_dst);
121 560 }
122
123
4/4
✓ Branch 0 taken 5808 times.
✓ Branch 1 taken 176 times.
✓ Branch 2 taken 5312 times.
✓ Branch 3 taken 128 times.
11424 for (size_t h = starting_coordinates.y() + 1; h < ending_coordinates.y();
124 11120 ++h) {
125 22240 const ptrdiff_t valid_old_h = get_physical_index(
126 11120 h - kMargin - 1, image_dimensions.height(), border_type);
127 22240 const ptrdiff_t valid_new_h = get_physical_index(
128 11120 h + kMargin, image_dimensions.height(), border_type);
129
4/4
✓ Branch 0 taken 5808 times.
✓ Branch 1 taken 5808 times.
✓ Branch 2 taken 15936 times.
✓ Branch 3 taken 5312 times.
32864 for (ptrdiff_t c = 0; c < static_cast<ptrdiff_t>(src_rows.channels());
130 21744 ++c) {
131 21744 clear_lookup_table();
132
133 21744 update_patch_histogram_with_horizontal_borders(
134 21744 src_rows, c, valid_old_h, valid_new_h, total_patch_span, w,
135 21744 kMargin, image_dimensions.width(), border_type);
136
137 43488 compute_patch_median_from_histogram(h, c, total_patch_span, kMargin,
138 21744 ksize, shifted_dst);
139 21744 }
140 11120 }
141 304 }
142 304 }
143
144 private:
145 // Histogram Buffer Layout Explanation
146 // -----------------------------------
147 // patched_coarse:
148 // - Conceptually a 3D array:
149 // coarse[channel_idx][patch_offset][coarse_bin]
150 // - coarse_bin = incoming_pixel >> 4 ∈ [0, 15]
151 // - Flattened as:
152 // coarse_offset = 16 * (patch_length * channel_idx + patch_offset)
153 // + (incoming_pixel >> 4);
154 //
155 // patched_fine:
156 // - Conceptually a 4D array:
157 // fine[channel_idx][coarse_bin][patch_offset][fine_bin]
158 // - coarse_bin = incoming_pixel >> 4 ∈ [0, 15]
159 // - fine_bin = incoming_pixel & 0xF ∈ [0, 15]
160 // - Flattened as:
161 // fine_offset = 16 * (patch_length * (16 * channel_idx + coarse_bin)
162 // + patch_offset) + fine_bin;
163 //
164 // This layout enables fast linear access while preserving the hierarchical
165 // structure of histograms per channel and patch position.
166 constexpr static size_t patch_length = 512;
167 uint16_t* patched_coarse;
168 uint16_t* patched_fine;
169
170 // Clear only the portion of the patched histogram buffers that will be used
171 // for the current patch.
172 // Since these buffers are large, there's no need to zero out the entire
173 // allocation— only the section relevant to the current total patch size is
174 // cleared for efficiency.
175 456 void clear_patched_histogram(size_t total_patch_size) {
176 912 std::memset(patched_coarse, 0,
177 456 16 * total_patch_size * sizeof(patched_coarse[0]));
178 912 std::memset(patched_fine, 0,
179 456 256 * total_patch_size * sizeof(patched_fine[0]));
180 456 }
181
182 903744 void scalar_initialize_patched_histogram(int incoming_pixel,
183 size_t channel_idx,
184 size_t patch_length,
185 size_t patch_offset) {
186 1807488 const size_t coarse_offset =
187 1807488 16 * (patch_length * channel_idx + patch_offset) +
188 903744 (incoming_pixel >> 4);
189 1807488 const size_t fine_offset =
190 1807488 16 * (patch_length * (16 * channel_idx + (incoming_pixel >> 4)) +
191 1807488 patch_offset) +
192 903744 (incoming_pixel & 0xF);
193 903744 patched_coarse[coarse_offset]++;
194 903744 patched_fine[fine_offset]++;
195 903744 }
196
197 5808 void vector_initialize_patched_histogram(uint8x16_t& incoming_pixels,
198 size_t channel_idx,
199 size_t patch_length,
200 size_t patch_offset) {
201 KLEIDICV_FORCE_LOOP_UNROLL
202
2/2
✓ Branch 0 taken 5808 times.
✓ Branch 1 taken 92928 times.
98736 for (int i = 0; i < 16; i++) {
203 185856 const size_t coarse_offset_incoming =
204 185856 16 * (patch_length * channel_idx + patch_offset + i) +
205 92928 (incoming_pixels[i] >> 4);
206
207 185856 const size_t fine_offset_incoming =
208 278784 16 * (patch_length * (16 * channel_idx + (incoming_pixels[i] >> 4)) +
209 278784 patch_offset + i) +
210 92928 (incoming_pixels[i] & 0xF);
211
212 92928 patched_coarse[coarse_offset_incoming]++;
213 92928 patched_fine[fine_offset_incoming]++;
214 92928 }
215 5808 }
216
217 7064 void initialize_patched_histogram_without_horizontal_borders(
218 Rows<const uint8_t> src_rows, ptrdiff_t c, ptrdiff_t valid_h,
219 size_t total_patch_span, size_t starting_width, size_t kMargin) {
220 7064 size_t vector_part = 0;
221 if constexpr (is_single_channel) {
222 2072 vector_part = (total_patch_span >> 4) << 4;
223
2/2
✓ Branch 0 taken 5808 times.
✓ Branch 1 taken 2072 times.
7880 for (size_t patch_offset = 0; patch_offset < vector_part;
224 5808 patch_offset += 16) {
225 11616 const ptrdiff_t valid_w =
226 5808 static_cast<ptrdiff_t>(starting_width + patch_offset - kMargin);
227 5808 auto incoming_pixels = vld1q_u8(&src_rows.at(valid_h, valid_w)[c]);
228 11616 vector_initialize_patched_histogram(incoming_pixels, c,
229 5808 total_patch_span, patch_offset);
230 5808 }
231 }
232
233
4/4
✓ Branch 0 taken 2072 times.
✓ Branch 1 taken 23232 times.
✓ Branch 2 taken 4992 times.
✓ Branch 3 taken 299520 times.
329816 for (size_t patch_offset = vector_part; patch_offset < total_patch_span;
234 322752 ++patch_offset) {
235 645504 const ptrdiff_t valid_w =
236 322752 static_cast<ptrdiff_t>(starting_width + patch_offset - kMargin);
237 322752 auto incoming_pixel = src_rows.at(valid_h, valid_w)[c];
238
239 645504 scalar_initialize_patched_histogram(incoming_pixel, c, total_patch_span,
240 322752 patch_offset);
241 322752 }
242 7064 }
243
244 14128 void initialize_patched_histogram_with_horizontal_borders(
245 Rows<const uint8_t> src_rows, ptrdiff_t c, ptrdiff_t valid_h,
246 size_t total_patch_span, size_t starting_width, size_t kMargin,
247 size_t width, FixedBorderType border_type) {
248
4/4
✓ Branch 0 taken 4144 times.
✓ Branch 1 taken 159936 times.
✓ Branch 2 taken 9984 times.
✓ Branch 3 taken 421056 times.
595120 for (size_t patch_offset = 0; patch_offset < total_patch_span;
249 580992 ++patch_offset) {
250 1161984 ptrdiff_t valid_w = get_physical_index(
251 580992 starting_width + patch_offset - kMargin, width, border_type);
252
253 580992 auto incoming_pixel = src_rows.at(valid_h, valid_w)[c];
254
255 1161984 scalar_initialize_patched_histogram(incoming_pixel, c, total_patch_span,
256 580992 patch_offset);
257 580992 }
258 14128 }
259
260 // During vertical traversal (the main 'height' loop), each sliding window
261 // iteration introduces a new incoming row and removes an outgoing one. The
262 // histogram must be updated accordingly by subtracting the contributions of
263 // the outgoing row and adding those of the incoming row.
264 // Both increment and decrement operations are handled inside the same
265 // function.
266 1351744 void scalar_update_patch_histogram(int outgoing_pixel, int incoming_pixel,
267 size_t channel_idx, size_t patch_length,
268 size_t patch_offset) {
269 2703488 const size_t coarse_offset_base =
270 1351744 16 * (patch_length * channel_idx + patch_offset);
271 2703488 const size_t fine_offset_base =
272 1351744 16 * patch_length * 16 * channel_idx + 16 * patch_offset;
273
274 1351744 const size_t pixel_new_shift_right_4 = (incoming_pixel >> 4);
275 1351744 const size_t mask_new_pixel = (incoming_pixel & 0xF);
276 1351744 const size_t pixel_old_shift_right_4 = (outgoing_pixel >> 4);
277 1351744 const size_t mask_old_pixel = (outgoing_pixel & 0xF);
278
279 2703488 const size_t fine_new_offset = fine_offset_base + mask_new_pixel +
280 1351744 16 * patch_length * pixel_new_shift_right_4;
281 2703488 const size_t coarse_new_offset =
282 1351744 coarse_offset_base + pixel_new_shift_right_4;
283
284 2703488 const size_t fine_old_offset = fine_offset_base + mask_old_pixel +
285 1351744 16 * patch_length * pixel_old_shift_right_4;
286 2703488 const size_t coarse_old_offset =
287 1351744 coarse_offset_base + pixel_old_shift_right_4;
288
289 1351744 patched_coarse[coarse_new_offset]++;
290 1351744 patched_coarse[coarse_old_offset]--;
291 1351744 patched_fine[fine_new_offset]++;
292 1351744 patched_fine[fine_old_offset]--;
293 1351744 }
294
295 8464 void vector_update_patch_histogram(uint8x16_t& outgoing_pixels,
296 uint8x16_t& incoming_pixels,
297 size_t channel_idx, size_t patch_length,
298 size_t patch_offset) {
299 KLEIDICV_FORCE_LOOP_UNROLL
300
2/2
✓ Branch 0 taken 8464 times.
✓ Branch 1 taken 135424 times.
143888 for (int i = 0; i < 16; i++) {
301 270848 const size_t coarse_offset_incoming =
302 270848 16 * (patch_length * channel_idx + patch_offset + i) +
303 135424 (incoming_pixels[i] >> 4);
304
305 270848 const size_t coarse_offset_outgoing =
306 270848 16 * (patch_length * channel_idx + patch_offset + i) +
307 135424 (outgoing_pixels[i] >> 4);
308
309 270848 const size_t fine_offset_incoming =
310 406272 16 * (patch_length * (16 * channel_idx + (incoming_pixels[i] >> 4)) +
311 406272 patch_offset + i) +
312 135424 (incoming_pixels[i] & 0xF);
313
314 270848 const size_t fine_offset_outgoing =
315 406272 16 * (patch_length * (16 * channel_idx + (outgoing_pixels[i] >> 4)) +
316 406272 patch_offset + i) +
317 135424 (outgoing_pixels[i] & 0xF);
318
319 135424 patched_coarse[coarse_offset_incoming]++;
320 135424 patched_coarse[coarse_offset_outgoing]--;
321 135424 patched_fine[fine_offset_incoming]++;
322 135424 patched_fine[fine_offset_outgoing]--;
323 135424 }
324 8464 }
325
326 10872 void update_patch_histogram_without_horizontal_borders(
327 Rows<const uint8_t> src_rows, ptrdiff_t c, ptrdiff_t valid_old_h,
328 ptrdiff_t valid_new_h, size_t total_patch_span, size_t starting_width,
329 size_t kMargin) {
330 10872 size_t vector_part = 0;
331 if constexpr (is_single_channel) {
332 2904 vector_part = (total_patch_span >> 4) << 4;
333
2/2
✓ Branch 0 taken 8464 times.
✓ Branch 1 taken 2904 times.
11368 for (size_t patch_offset = 0; patch_offset < vector_part;
334 8464 patch_offset += 16) {
335 16928 const ptrdiff_t valid_w =
336 8464 static_cast<ptrdiff_t>(starting_width + patch_offset - kMargin);
337 8464 auto outgoing_pixels = vld1q_u8(&src_rows.at(valid_old_h, valid_w)[c]);
338 8464 auto incoming_pixels = vld1q_u8(&src_rows.at(valid_new_h, valid_w)[c]);
339 16928 vector_update_patch_histogram(outgoing_pixels, incoming_pixels, c,
340 8464 total_patch_span, patch_offset);
341 8464 }
342 }
343
344
4/4
✓ Branch 0 taken 2904 times.
✓ Branch 1 taken 33856 times.
✓ Branch 2 taken 7968 times.
✓ Branch 3 taken 478080 times.
522808 for (size_t patch_offset = vector_part; patch_offset < total_patch_span;
345 511936 ++patch_offset) {
346 1023872 const ptrdiff_t valid_w =
347 511936 static_cast<ptrdiff_t>(starting_width + patch_offset - kMargin);
348 511936 auto outgoing_pixel = src_rows.at(valid_old_h, valid_w)[c];
349 511936 auto incoming_pixel = src_rows.at(valid_new_h, valid_w)[c];
350 1023872 scalar_update_patch_histogram(outgoing_pixel, incoming_pixel, c,
351 511936 total_patch_span, patch_offset);
352 511936 }
353 10872 }
354
355 21744 void update_patch_histogram_with_horizontal_borders(
356 Rows<const uint8_t> src_rows, ptrdiff_t c, ptrdiff_t valid_old_h,
357 ptrdiff_t valid_new_h, size_t total_patch_span, size_t starting_width,
358 size_t kMargin, size_t width, FixedBorderType border_type) {
359
4/4
✓ Branch 0 taken 5808 times.
✓ Branch 1 taken 218880 times.
✓ Branch 2 taken 15936 times.
✓ Branch 3 taken 620928 times.
861552 for (size_t patch_offset = 0; patch_offset < total_patch_span;
360 839808 ++patch_offset) {
361 1679616 const ptrdiff_t valid_w = get_physical_index(
362 839808 starting_width + patch_offset - kMargin, width, border_type);
363 839808 auto outgoing_pixel = src_rows.at(valid_old_h, valid_w)[c];
364 839808 auto incoming_pixel = src_rows.at(valid_new_h, valid_w)[c];
365 1679616 scalar_update_patch_histogram(outgoing_pixel, incoming_pixel, c,
366 839808 total_patch_span, patch_offset);
367 839808 }
368 21744 }
369
370 // `H` and `luc` are used for computing the median value for a single
371 // output pixel:
372 // - `H` is a histogram structure holding both the coarse and fine
373 // bins needed
374 // for the current element's median calculation.
375 // - `luc` is a lookup table that stores the last processed offset
376 // (index)
377 // for each coarse bin. This allows incremental fine histogram
378 // updates instead of full recalculation when histogram overlap is
379 // high. Since neighboring patches in natural images often have
380 // similar pixel values, reusing previous histogram state can
381 // significantly reduce processing time.
382 typedef struct {
383 uint16_t coarse[16];
384 uint16_t fine[16][16];
385 } Histogram;
386
387 Histogram H;
388 uint16_t luc[16];
389
390 33456 void clear_lookup_table(void) {
391 33456 std::memset(&H, 0, sizeof(Histogram));
392 33456 std::memset(luc, 0, 16 * sizeof(uint16_t));
393 33456 }
394
395 33456 void initialize_coarse(uint16_t* px, uint16x8_t& v_coarsel,
396 uint16x8_t& v_coarseh, size_t kMargin) {
397
4/4
✓ Branch 0 taken 8976 times.
✓ Branch 1 taken 224832 times.
✓ Branch 2 taken 24480 times.
✓ Branch 3 taken 635328 times.
893616 for (size_t i = 0; i < 2 * kMargin; ++i, px += 16) {
398 860160 v_coarsel = vaddq_u16(v_coarsel, vld1q_u16(px));
399 860160 v_coarseh = vaddq_u16(v_coarseh, vld1q_u16(px + 8));
400 860160 }
401 33456 }
402
403 663680 void increment_coarse(uint16_t* px, uint16x8_t& v_coarsel,
404 uint16x8_t& v_coarseh) {
405 663680 v_coarsel = vaddq_u16(v_coarsel, vld1q_u16(px));
406 663680 v_coarseh = vaddq_u16(v_coarseh, vld1q_u16(px + 8));
407 663680 vst1q_u16(H.coarse, v_coarsel);
408 663680 vst1q_u16(H.coarse + 8, v_coarseh);
409 663680 }
410
411 663680 void decrement_coarse(uint16_t* px, uint16x8_t& v_coarsel,
412 uint16x8_t& v_coarseh) {
413 663680 v_coarsel = vsubq_u16(v_coarsel, vld1q_u16(px));
414 663680 v_coarseh = vsubq_u16(v_coarseh, vld1q_u16(px + 8));
415 663680 }
416
417 60314 void initialize_fine(uint16_t* px, uint16x8_t& v_finel, uint16x8_t& v_fineh,
418 uint16_t& luc, size_t patch_offset, size_t kMargin,
419 size_t total_patch_span) {
420
4/4
✓ Branch 0 taken 399622 times.
✓ Branch 1 taken 16106 times.
✓ Branch 2 taken 1141110 times.
✓ Branch 3 taken 44208 times.
1601046 for (luc = static_cast<uint16_t>(patch_offset - kMargin);
421 3202092 luc < static_cast<uint16_t>(
422 1601046 std::min(patch_offset + kMargin + 1, total_patch_span));
423 1540732 ++luc, px += 16) {
424 1540732 v_finel = vaddq_u16(v_finel, vld1q_u16(px));
425 1540732 v_fineh = vaddq_u16(v_fineh, vld1q_u16(px + 8));
426 1540732 }
427 60314 }
428
429 603366 void update_fine(uint16_t* px, uint16x8_t& v_finel, uint16x8_t& v_fineh,
430 uint16_t& luc, size_t patch_offset, size_t kMargin,
431 size_t total_patch_span) {
432
4/4
✓ Branch 0 taken 190060 times.
✓ Branch 1 taken 157974 times.
✓ Branch 2 taken 531569 times.
✓ Branch 3 taken 445392 times.
1324995 for (; luc < static_cast<uint16_t>(patch_offset + kMargin + 1); ++luc) {
433 721629 constexpr ptrdiff_t stride = 16;
434 1443258 const ptrdiff_t patch_span_limit =
435 721629 static_cast<ptrdiff_t>(total_patch_span - 1);
436 721629 const ptrdiff_t safe_luc = static_cast<ptrdiff_t>(luc);
437
438 1443258 const ptrdiff_t base_offset =
439 721629 stride * std::min(safe_luc, patch_span_limit);
440 1443258 const ptrdiff_t old_offset =
441 1443258 stride * std::max(safe_luc - static_cast<ptrdiff_t>(2 * kMargin + 1),
442 721629 ptrdiff_t{0});
443 721629 const uint16x8_t new_vecl = vld1q_u16(px + base_offset);
444 721629 const uint16x8_t new_vech = vld1q_u16(px + base_offset + 8);
445 721629 const uint16x8_t old_vecl = vld1q_u16(px + old_offset);
446 721629 const uint16x8_t old_vech = vld1q_u16(px + old_offset + 8);
447 721629 v_finel = vsubq_u16(vaddq_u16(v_finel, new_vecl), old_vecl);
448 721629 v_fineh = vsubq_u16(vaddq_u16(v_fineh, new_vech), old_vech);
449 721629 }
450 603366 }
451
452 663680 size_t find_coarse_index(size_t& cdf, size_t median_index) {
453 663680 size_t coarse_index = 0;
454 5675833 while (true) {
455 5675833 cdf += H.coarse[coarse_index];
456
4/4
✓ Branch 0 taken 174080 times.
✓ Branch 1 taken 1314003 times.
✓ Branch 2 taken 489600 times.
✓ Branch 3 taken 3698150 times.
5675833 if (cdf > median_index) {
457 663680 cdf -= H.coarse[coarse_index];
458 663680 break;
459 }
460 5012153 coarse_index++;
461 }
462 1327360 return coarse_index;
463 663680 }
464
465 663680 uint8_t find_median(size_t& cdf, size_t median_index, size_t coarse_index) {
466 663680 uint16_t* segment = H.fine[coarse_index];
467 663680 size_t fine_index = 0;
468 5294917 while (true) {
469 5294917 cdf += segment[fine_index];
470
4/4
✓ Branch 0 taken 174080 times.
✓ Branch 1 taken 1207491 times.
✓ Branch 2 taken 489600 times.
✓ Branch 3 taken 3423746 times.
5294917 if (cdf > median_index) {
471 663680 fine_index = (16 * coarse_index + fine_index);
472 663680 break;
473 }
474 4631237 fine_index++;
475 }
476 1327360 return uint8_t(fine_index);
477 663680 }
478
479 33456 void compute_patch_median_from_histogram(size_t h, ptrdiff_t c,
480 size_t total_patch_span,
481 size_t kMargin, size_t ksize,
482 Rows<uint8_t> dst) {
483 33456 uint16x8_t v_coarsel = vld1q_u16(H.coarse);
484 33456 uint16x8_t v_coarseh = vld1q_u16(H.coarse + 8);
485
486 // Before starting the main patch loop to compute medians for each element,
487 // we initialize the coarse histogram buffer with the first (ksize - 1).
488 // This allows each subsequent iteration in the patch loop to perform only
489 // one addition and one subtraction.
490 33456 uint16_t* px = patched_coarse + 16 * total_patch_span * c;
491 33456 initialize_coarse(px, v_coarsel, v_coarseh, kMargin);
492
493
4/4
✓ Branch 0 taken 8976 times.
✓ Branch 1 taken 174080 times.
✓ Branch 2 taken 24480 times.
✓ Branch 3 taken 489600 times.
697136 for (size_t patch_offset = kMargin;
494 697136 patch_offset < total_patch_span - kMargin; patch_offset++) {
495 663680 size_t median_index = (ksize * ksize) / 2, cdf = 0;
496
497 1327360 px = patched_coarse +
498 1327360 16 * (total_patch_span * c +
499 663680 std::min(patch_offset + kMargin, total_patch_span - 1));
500 663680 increment_coarse(px, v_coarsel, v_coarseh);
501
502 // Find median at coarse level
503 663680 size_t coarse_index = find_coarse_index(cdf, median_index);
504
505 663680 uint16x8_t v_finel;
506 663680 uint16x8_t v_fineh;
507 // Check whether the fine histogram (H.fine[coarse_index]) for the
508 // current patch position needs to be freshly initialized or can be
509 // incrementally updated. This decision hinges on the `luc` (Last Used
510 // Coordinate) table, which records the last horizontal patch offset
511 // processed for each coarse bin.
512 //
513 // The condition is true in two scenarios:
514 //
515 // 1. **First-Time Initialization**:
516 // - This is the first time we are accessing this coarse bin
517 // (`coarse_index`) at the current patch position.
518 // - We compute the full fine histogram from scratch by summing
519 // over `ksize` rows centered at the patch position.
520 // - We accumulate the results into the `v_finel` and `v_fineh`
521 // vector registers.
522 // - These vectors are then stored into `H.fine[coarse_index]`.
523 // - The `luc` table is updated to reflect the last index
524 // processed.
525 //
526 // 2. **Window Movement Causes Loss of Overlap**:
527 // - The sliding window has moved enough that it no longer
528 // sufficiently overlaps the region
529 // used to compute the previously cached fine histogram (i.e.,
530 // `luc[coarse_index]` is too far behind).
531 // - We must reinitialize the fine histogram to ensure accuracy.
532 //
533 // Otherwise:
534 //
535 // - We reuse the previously computed fine histogram stored in
536 // `H.fine[coarse_index]`.
537 // - We only update it incrementally using the `update_fine()`
538 // function, which:
539 // - Adds the new values entering the window.
540 // - Subtracts the values leaving the window.
541 // - This avoids the need for a full re-scan, leveraging temporal
542 // locality between neighboring pixels.
543 //
544 // This lookup-based optimization significantly improves performance,
545 // as neighboring filter windows often overlap heavily—especially for
546 // small strides and moderate kernel sizes.
547 //
548 // After this step, the fine histogram is ready. The next phase scans
549 // `H.fine[coarse_index]` to identify the fine bin where the
550 // cumulative sum crosses the median threshold. This gives us the
551 // final median value for the output pixel.
552
4/4
✓ Branch 0 taken 16106 times.
✓ Branch 1 taken 157974 times.
✓ Branch 2 taken 44208 times.
✓ Branch 3 taken 445392 times.
663680 if (luc[coarse_index] <= patch_offset - kMargin) {
553 60314 v_finel = vdupq_n_u16(0);
554 60314 v_fineh = vdupq_n_u16(0);
555 120628 px = patched_fine + static_cast<ptrdiff_t>(16) *
556 120628 (static_cast<ptrdiff_t>(total_patch_span) *
557 120628 (16 * c + coarse_index) +
558 120628 patch_offset - kMargin);
559
560 120628 initialize_fine(px, v_finel, v_fineh, luc[coarse_index], patch_offset,
561 60314 kMargin, total_patch_span);
562
563 60314 } else {
564 603366 v_finel = vld1q_u16(H.fine[coarse_index]);
565 603366 v_fineh = vld1q_u16(H.fine[coarse_index] + 8);
566 603366 px = patched_fine + 16 * total_patch_span * (16 * c + coarse_index);
567 1206732 update_fine(px, v_finel, v_fineh, luc[coarse_index], patch_offset,
568 603366 kMargin, total_patch_span);
569 }
570
571 1327360 px = patched_coarse +
572 663680 static_cast<ptrdiff_t>(16) *
573 1327360 (total_patch_span * c +
574 663680 std::max(patch_offset - kMargin, static_cast<size_t>(0)));
575
576 663680 vst1q_u16(H.fine[coarse_index], v_finel);
577 663680 vst1q_u16(H.fine[coarse_index] + 8, v_fineh);
578
579 663680 decrement_coarse(px, v_coarsel, v_coarseh);
580
581 // Find median at fine level
582 2654720 dst.at(static_cast<ptrdiff_t>(h),
583 1991040 static_cast<ptrdiff_t>(patch_offset - kMargin))[c] =
584 663680 find_median(cdf, median_index, coarse_index);
585 663680 }
586 33456 }
587 };
588
589 template <bool is_single_channel>
590 152 void median_process(Rectangle image_dimensions, Rows<const uint8_t> src_rows,
591 Rows<uint8_t> dst_rows, size_t y_begin, size_t y_end,
592 size_t kernel_width, size_t kernel_height,
593 FixedBorderType border_type) {
594 152 size_t kMargin = (kernel_width - 1) / 2;
595 304 MedianBlurLargeHist<is_single_channel> median_filter{src_rows.channels(),
596 152 kMargin};
597
598 // Process left border
599 152 size_t starting_width = 0;
600 152 size_t processing_left_width = kMargin;
601 152 Point starting_left_coordinates{starting_width, y_begin};
602 152 Point ending_left_coordinates{starting_width + processing_left_width, y_end};
603 152 median_filter.process_pixels_with_horizontal_borders(
604 152 image_dimensions, starting_left_coordinates, ending_left_coordinates,
605 152 src_rows, dst_rows, kernel_height, border_type);
606
607 // Process center region
608 152 starting_width = processing_left_width;
609 152 size_t processing_center_width = image_dimensions.width() - 2 * kMargin;
610 152 Point starting_center_coordinates{starting_width, y_begin};
611 304 Point ending_center_coordinates{starting_width + processing_center_width,
612 152 y_end};
613 152 median_filter.process_pixels_without_horizontal_borders(
614 152 image_dimensions, starting_center_coordinates, ending_center_coordinates,
615 152 src_rows, dst_rows, kernel_height, border_type);
616
617 // Process right border
618 152 starting_width = processing_left_width + processing_center_width;
619 456 size_t processing_right_width = image_dimensions.width() -
620 304 processing_left_width -
621 152 processing_center_width;
622 152 Point starting_right_coordinates{starting_width, y_begin};
623 304 Point ending_right_coordinates{starting_width + processing_right_width,
624 152 y_end};
625 152 median_filter.process_pixels_with_horizontal_borders(
626 152 image_dimensions, starting_right_coordinates, ending_right_coordinates,
627 152 src_rows, dst_rows, kernel_height, border_type);
628 152 }
629
630 152 KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t median_blur_large_hist_stripe_u8(
631 const uint8_t* src, size_t src_stride, uint8_t* dst, size_t dst_stride,
632 size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
633 size_t kernel_width, size_t kernel_height, FixedBorderType border_type) {
634 152 Rectangle image_dimensions{width, height};
635 152 Rows<const uint8_t> src_rows{src, src_stride, channels};
636 152 Rows<uint8_t> dst_rows{dst, dst_stride, channels};
637
638
2/2
✓ Branch 0 taken 88 times.
✓ Branch 1 taken 64 times.
152 if (channels == 1) {
639 176 median_process<true>(image_dimensions, src_rows, dst_rows, y_begin, y_end,
640 88 kernel_width, kernel_height, border_type);
641 88 } else {
642 128 median_process<false>(image_dimensions, src_rows, dst_rows, y_begin, y_end,
643 64 kernel_width, kernel_height, border_type);
644 }
645
646 152 return KLEIDICV_OK;
647 152 }
648
649 } // namespace kleidicv::neon
650