KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/median_blur_large_hist_neon.cpp
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 383 383 100.0%
Functions: 45 45 100.0%
Branches: 94 94 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include "kleidicv/ctypes.h"
6 #include "kleidicv/filters/median_blur.h"
7 #include "kleidicv/kleidicv.h"
8 #include "kleidicv/neon.h"
9 #include "median_blur_border_handling.h"
10
11 namespace kleidicv::neon {
12
13 template <bool is_single_channel>
14 class MedianBlurLargeHist {
15 public:
16 105 MedianBlurLargeHist(size_t channels, size_t kMargin)
17 210 : patched_coarse{static_cast<uint16_t*>(
18 105 malloc((16 + 256) * (patch_length + 2 * kMargin) * channels *
19 sizeof(uint16_t)))},
20 210 patched_fine{
21 105 &patched_coarse[16 * (patch_length + 2 * kMargin) * channels]},
22 105 H{},
23 105 luc{} {}
24
25 105 ~MedianBlurLargeHist() { free(patched_coarse); }
26
27 105 void process_pixels_without_horizontal_borders(
28 Rectangle image_dimensions, Point starting_coordinates,
29 Point ending_coordinates, Rows<const uint8_t> src_rows,
30 Rows<uint8_t> dst_rows, size_t ksize, FixedBorderType border_type) {
31 105 const size_t kMargin = ksize / 2;
32
33
4/4
✓ Branch 0 taken 57 times.
✓ Branch 1 taken 57 times.
✓ Branch 2 taken 48 times.
✓ Branch 3 taken 48 times.
210 for (size_t w = starting_coordinates.x(); w < ending_coordinates.x();
34 105 w += patch_length) {
35 210 const size_t total_patch_span =
36 105 std::min(ending_coordinates.x() - w, patch_length) + kMargin * 2;
37
38 210 Rows<uint8_t> shifted_dst{
39 105 &dst_rows[0] + static_cast<ptrdiff_t>(w) * dst_rows.channels(),
40 105 dst_rows.stride(), dst_rows.channels()};
41
42 105 clear_patched_histogram(total_patch_span * src_rows.channels());
43
44
4/4
✓ Branch 0 taken 57 times.
✓ Branch 1 taken 57 times.
✓ Branch 2 taken 144 times.
✓ Branch 3 taken 48 times.
306 for (ptrdiff_t c = 0; c < static_cast<ptrdiff_t>(src_rows.channels());
45 201 ++c) {
46 201 clear_lookup_table();
47
48 // We initialize with ksize rows to allow merging of
49 // histogram increment and decrement operations in the main loop.
50 // This extra initial load enables a single update phase and avoids
51 // splitting the logic into separate steps.
52
4/4
✓ Branch 0 taken 1401 times.
✓ Branch 1 taken 57 times.
✓ Branch 2 taken 3744 times.
✓ Branch 3 taken 144 times.
5346 for (size_t r = 0; r < ksize; ++r) {
53 10290 const ptrdiff_t valid_h =
54 10290 get_physical_index(starting_coordinates.y() + r - kMargin,
55 5145 image_dimensions.height(), border_type);
56 5145 initialize_patched_histogram_without_horizontal_borders(
57 5145 src_rows, c, valid_h, total_patch_span, w, kMargin);
58 5145 }
59 402 compute_patch_median_from_histogram(starting_coordinates.y(), c,
60 201 total_patch_span, kMargin, ksize,
61 201 shifted_dst);
62 201 }
63
64
4/4
✓ Branch 0 taken 2187 times.
✓ Branch 1 taken 57 times.
✓ Branch 2 taken 1992 times.
✓ Branch 3 taken 48 times.
4284 for (size_t h = starting_coordinates.y() + 1; h < ending_coordinates.y();
65 4179 ++h) {
66 8358 const ptrdiff_t valid_old_h = get_physical_index(
67 4179 h - kMargin - 1, image_dimensions.height(), border_type);
68
69 8358 const ptrdiff_t valid_new_h = get_physical_index(
70 4179 h + kMargin, image_dimensions.height(), border_type);
71
72
4/4
✓ Branch 0 taken 2187 times.
✓ Branch 1 taken 2187 times.
✓ Branch 2 taken 5976 times.
✓ Branch 3 taken 1992 times.
12342 for (ptrdiff_t c = 0; c < static_cast<ptrdiff_t>(src_rows.channels());
73 8163 ++c) {
74 8163 clear_lookup_table();
75
76 8163 update_patch_histogram_without_horizontal_borders(
77 8163 src_rows, c, valid_old_h, valid_new_h, total_patch_span, w,
78 8163 kMargin);
79
80 16326 compute_patch_median_from_histogram(h, c, total_patch_span, kMargin,
81 8163 ksize, shifted_dst);
82 8163 }
83 4179 }
84 105 }
85 105 }
86
87 210 void process_pixels_with_horizontal_borders(
88 Rectangle image_dimensions, Point starting_coordinates,
89 Point ending_coordinates, Rows<const uint8_t> src_rows,
90 Rows<uint8_t> dst_rows, size_t ksize, FixedBorderType border_type) {
91 210 const size_t kMargin = ksize / 2;
92
93
4/4
✓ Branch 0 taken 114 times.
✓ Branch 1 taken 114 times.
✓ Branch 2 taken 96 times.
✓ Branch 3 taken 96 times.
420 for (size_t w = starting_coordinates.x(); w < ending_coordinates.x();
94 210 w += patch_length) {
95 420 const size_t total_patch_span =
96 210 std::min(ending_coordinates.x() - w, patch_length) + kMargin * 2;
97 420 Rows<uint8_t> shifted_dst{
98 210 &dst_rows[0] + static_cast<ptrdiff_t>(w) * dst_rows.channels(),
99 210 dst_rows.stride(), dst_rows.channels()};
100
101 210 clear_patched_histogram(total_patch_span * src_rows.channels());
102
103
4/4
✓ Branch 0 taken 114 times.
✓ Branch 1 taken 114 times.
✓ Branch 2 taken 288 times.
✓ Branch 3 taken 96 times.
612 for (ptrdiff_t c = 0; c < static_cast<ptrdiff_t>(src_rows.channels());
104 402 ++c) {
105 402 clear_lookup_table();
106 // We initialize with ksize rows to allow merging of
107 // histogram increment and decrement operations in the main loop.
108 // This extra initial load enables a single update phase and avoids
109 // splitting the logic into separate steps.
110
4/4
✓ Branch 0 taken 2802 times.
✓ Branch 1 taken 114 times.
✓ Branch 2 taken 7488 times.
✓ Branch 3 taken 288 times.
10692 for (size_t r = 0; r < ksize; ++r) {
111 20580 const ptrdiff_t valid_h =
112 20580 get_physical_index(starting_coordinates.y() + r - kMargin,
113 10290 image_dimensions.height(), border_type);
114 10290 initialize_patched_histogram_with_horizontal_borders(
115 10290 src_rows, c, valid_h, total_patch_span, w, kMargin,
116 10290 image_dimensions.width(), border_type);
117 10290 }
118 804 compute_patch_median_from_histogram(starting_coordinates.y(), c,
119 402 total_patch_span, kMargin, ksize,
120 402 shifted_dst);
121 402 }
122
123
4/4
✓ Branch 0 taken 4374 times.
✓ Branch 1 taken 114 times.
✓ Branch 2 taken 3984 times.
✓ Branch 3 taken 96 times.
8568 for (size_t h = starting_coordinates.y() + 1; h < ending_coordinates.y();
124 8358 ++h) {
125 16716 const ptrdiff_t valid_old_h = get_physical_index(
126 8358 h - kMargin - 1, image_dimensions.height(), border_type);
127 16716 const ptrdiff_t valid_new_h = get_physical_index(
128 8358 h + kMargin, image_dimensions.height(), border_type);
129
4/4
✓ Branch 0 taken 4374 times.
✓ Branch 1 taken 4374 times.
✓ Branch 2 taken 11952 times.
✓ Branch 3 taken 3984 times.
24684 for (ptrdiff_t c = 0; c < static_cast<ptrdiff_t>(src_rows.channels());
130 16326 ++c) {
131 16326 clear_lookup_table();
132
133 16326 update_patch_histogram_with_horizontal_borders(
134 16326 src_rows, c, valid_old_h, valid_new_h, total_patch_span, w,
135 16326 kMargin, image_dimensions.width(), border_type);
136
137 32652 compute_patch_median_from_histogram(h, c, total_patch_span, kMargin,
138 16326 ksize, shifted_dst);
139 16326 }
140 8358 }
141 210 }
142 210 }
143
144 private:
145 // Histogram Buffer Layout Explanation
146 // -----------------------------------
147 // patched_coarse:
148 // - Conceptually a 3D array:
149 // coarse[channel_idx][patch_offset][coarse_bin]
150 // - coarse_bin = incoming_pixel >> 4 ∈ [0, 15]
151 // - Flattened as:
152 // coarse_offset = 16 * (patch_length * channel_idx + patch_offset)
153 // + (incoming_pixel >> 4);
154 //
155 // patched_fine:
156 // - Conceptually a 4D array:
157 // fine[channel_idx][coarse_bin][patch_offset][fine_bin]
158 // - coarse_bin = incoming_pixel >> 4 ∈ [0, 15]
159 // - fine_bin = incoming_pixel & 0xF ∈ [0, 15]
160 // - Flattened as:
161 // fine_offset = 16 * (patch_length * (16 * channel_idx + coarse_bin)
162 // + patch_offset) + fine_bin;
163 //
164 // This layout enables fast linear access while preserving the hierarchical
165 // structure of histograms per channel and patch position.
166 constexpr static size_t patch_length = 512;
167 uint16_t* patched_coarse;
168 uint16_t* patched_fine;
169
170 // Clear only the portion of the patched histogram buffers that will be used
171 // for the current patch.
172 // Since these buffers are large, there's no need to zero out the entire
173 // allocation— only the section relevant to the current total patch size is
174 // cleared for efficiency.
175 315 void clear_patched_histogram(size_t total_patch_size) {
176 630 std::memset(patched_coarse, 0,
177 315 16 * total_patch_size * sizeof(patched_coarse[0]));
178 630 std::memset(patched_fine, 0,
179 315 256 * total_patch_size * sizeof(patched_fine[0]));
180 315 }
181
182 669240 void scalar_initialize_patched_histogram(int incoming_pixel,
183 size_t channel_idx,
184 size_t patch_length,
185 size_t patch_offset) {
186 1338480 const size_t coarse_offset =
187 1338480 16 * (patch_length * channel_idx + patch_offset) +
188 669240 (incoming_pixel >> 4);
189 1338480 const size_t fine_offset =
190 1338480 16 * (patch_length * (16 * channel_idx + (incoming_pixel >> 4)) +
191 1338480 patch_offset) +
192 669240 (incoming_pixel & 0xF);
193 669240 patched_coarse[coarse_offset]++;
194 669240 patched_fine[fine_offset]++;
195 669240 }
196
197 4050 void vector_initialize_patched_histogram(uint8x16_t& incoming_pixels,
198 size_t channel_idx,
199 size_t patch_length,
200 size_t patch_offset) {
201 KLEIDICV_FORCE_LOOP_UNROLL
202
2/2
✓ Branch 0 taken 4050 times.
✓ Branch 1 taken 64800 times.
68850 for (int i = 0; i < 16; i++) {
203 129600 const size_t coarse_offset_incoming =
204 129600 16 * (patch_length * channel_idx + patch_offset + i) +
205 64800 (incoming_pixels[i] >> 4);
206
207 129600 const size_t fine_offset_incoming =
208 194400 16 * (patch_length * (16 * channel_idx + (incoming_pixels[i] >> 4)) +
209 194400 patch_offset + i) +
210 64800 (incoming_pixels[i] & 0xF);
211
212 64800 patched_coarse[coarse_offset_incoming]++;
213 64800 patched_fine[fine_offset_incoming]++;
214 64800 }
215 4050 }
216
217 5145 void initialize_patched_histogram_without_horizontal_borders(
218 Rows<const uint8_t> src_rows, ptrdiff_t c, ptrdiff_t valid_h,
219 size_t total_patch_span, size_t starting_width, size_t kMargin) {
220 5145 size_t vector_part = 0;
221 if constexpr (is_single_channel) {
222 1401 vector_part = (total_patch_span >> 4) << 4;
223
2/2
✓ Branch 0 taken 4050 times.
✓ Branch 1 taken 1401 times.
5451 for (size_t patch_offset = 0; patch_offset < vector_part;
224 4050 patch_offset += 16) {
225 8100 const ptrdiff_t valid_w =
226 4050 static_cast<ptrdiff_t>(starting_width + patch_offset - kMargin);
227 4050 auto incoming_pixels = vld1q_u8(&src_rows.at(valid_h, valid_w)[c]);
228 8100 vector_initialize_patched_histogram(incoming_pixels, c,
229 4050 total_patch_span, patch_offset);
230 4050 }
231 }
232
233
4/4
✓ Branch 0 taken 1401 times.
✓ Branch 1 taken 16200 times.
✓ Branch 2 taken 3744 times.
✓ Branch 3 taken 224640 times.
245985 for (size_t patch_offset = vector_part; patch_offset < total_patch_span;
234 240840 ++patch_offset) {
235 481680 const ptrdiff_t valid_w =
236 240840 static_cast<ptrdiff_t>(starting_width + patch_offset - kMargin);
237 240840 auto incoming_pixel = src_rows.at(valid_h, valid_w)[c];
238
239 481680 scalar_initialize_patched_histogram(incoming_pixel, c, total_patch_span,
240 240840 patch_offset);
241 240840 }
242 5145 }
243
244 10290 void initialize_patched_histogram_with_horizontal_borders(
245 Rows<const uint8_t> src_rows, ptrdiff_t c, ptrdiff_t valid_h,
246 size_t total_patch_span, size_t starting_width, size_t kMargin,
247 size_t width, FixedBorderType border_type) {
248
4/4
✓ Branch 0 taken 2802 times.
✓ Branch 1 taken 112608 times.
✓ Branch 2 taken 7488 times.
✓ Branch 3 taken 315792 times.
438690 for (size_t patch_offset = 0; patch_offset < total_patch_span;
249 428400 ++patch_offset) {
250 856800 ptrdiff_t valid_w = get_physical_index(
251 428400 starting_width + patch_offset - kMargin, width, border_type);
252
253 428400 auto incoming_pixel = src_rows.at(valid_h, valid_w)[c];
254
255 856800 scalar_initialize_patched_histogram(incoming_pixel, c, total_patch_span,
256 428400 patch_offset);
257 428400 }
258 10290 }
259
260 // During vertical traversal (the main 'height' loop), each sliding window
261 // iteration introduces a new incoming row and removes an outgoing one. The
262 // histogram must be updated accordingly by subtracting the contributions of
263 // the outgoing row and adding those of the incoming row.
264 // Both increment and decrement operations are handled inside the same
265 // function.
266 1014312 void scalar_update_patch_histogram(int outgoing_pixel, int incoming_pixel,
267 size_t channel_idx, size_t patch_length,
268 size_t patch_offset) {
269 2028624 const size_t coarse_offset_base =
270 1014312 16 * (patch_length * channel_idx + patch_offset);
271 2028624 const size_t fine_offset_base =
272 1014312 16 * patch_length * 16 * channel_idx + 16 * patch_offset;
273
274 1014312 const size_t pixel_new_shift_right_4 = (incoming_pixel >> 4);
275 1014312 const size_t mask_new_pixel = (incoming_pixel & 0xF);
276 1014312 const size_t pixel_old_shift_right_4 = (outgoing_pixel >> 4);
277 1014312 const size_t mask_old_pixel = (outgoing_pixel & 0xF);
278
279 2028624 const size_t fine_new_offset = fine_offset_base + mask_new_pixel +
280 1014312 16 * patch_length * pixel_new_shift_right_4;
281 2028624 const size_t coarse_new_offset =
282 1014312 coarse_offset_base + pixel_new_shift_right_4;
283
284 2028624 const size_t fine_old_offset = fine_offset_base + mask_old_pixel +
285 1014312 16 * patch_length * pixel_old_shift_right_4;
286 2028624 const size_t coarse_old_offset =
287 1014312 coarse_offset_base + pixel_old_shift_right_4;
288
289 1014312 patched_coarse[coarse_new_offset]++;
290 1014312 patched_coarse[coarse_old_offset]--;
291 1014312 patched_fine[fine_new_offset]++;
292 1014312 patched_fine[fine_old_offset]--;
293 1014312 }
294
295 6366 void vector_update_patch_histogram(uint8x16_t& outgoing_pixels,
296 uint8x16_t& incoming_pixels,
297 size_t channel_idx, size_t patch_length,
298 size_t patch_offset) {
299 KLEIDICV_FORCE_LOOP_UNROLL
300
2/2
✓ Branch 0 taken 6366 times.
✓ Branch 1 taken 101856 times.
108222 for (int i = 0; i < 16; i++) {
301 203712 const size_t coarse_offset_incoming =
302 203712 16 * (patch_length * channel_idx + patch_offset + i) +
303 101856 (incoming_pixels[i] >> 4);
304
305 203712 const size_t coarse_offset_outgoing =
306 203712 16 * (patch_length * channel_idx + patch_offset + i) +
307 101856 (outgoing_pixels[i] >> 4);
308
309 203712 const size_t fine_offset_incoming =
310 305568 16 * (patch_length * (16 * channel_idx + (incoming_pixels[i] >> 4)) +
311 305568 patch_offset + i) +
312 101856 (incoming_pixels[i] & 0xF);
313
314 203712 const size_t fine_offset_outgoing =
315 305568 16 * (patch_length * (16 * channel_idx + (outgoing_pixels[i] >> 4)) +
316 305568 patch_offset + i) +
317 101856 (outgoing_pixels[i] & 0xF);
318
319 101856 patched_coarse[coarse_offset_incoming]++;
320 101856 patched_coarse[coarse_offset_outgoing]--;
321 101856 patched_fine[fine_offset_incoming]++;
322 101856 patched_fine[fine_offset_outgoing]--;
323 101856 }
324 6366 }
325
326 8163 void update_patch_histogram_without_horizontal_borders(
327 Rows<const uint8_t> src_rows, ptrdiff_t c, ptrdiff_t valid_old_h,
328 ptrdiff_t valid_new_h, size_t total_patch_span, size_t starting_width,
329 size_t kMargin) {
330 8163 size_t vector_part = 0;
331 if constexpr (is_single_channel) {
332 2187 vector_part = (total_patch_span >> 4) << 4;
333
2/2
✓ Branch 0 taken 6366 times.
✓ Branch 1 taken 2187 times.
8553 for (size_t patch_offset = 0; patch_offset < vector_part;
334 6366 patch_offset += 16) {
335 12732 const ptrdiff_t valid_w =
336 6366 static_cast<ptrdiff_t>(starting_width + patch_offset - kMargin);
337 6366 auto outgoing_pixels = vld1q_u8(&src_rows.at(valid_old_h, valid_w)[c]);
338 6366 auto incoming_pixels = vld1q_u8(&src_rows.at(valid_new_h, valid_w)[c]);
339 12732 vector_update_patch_histogram(outgoing_pixels, incoming_pixels, c,
340 6366 total_patch_span, patch_offset);
341 6366 }
342 }
343
344
4/4
✓ Branch 0 taken 2187 times.
✓ Branch 1 taken 25464 times.
✓ Branch 2 taken 5976 times.
✓ Branch 3 taken 358560 times.
392187 for (size_t patch_offset = vector_part; patch_offset < total_patch_span;
345 384024 ++patch_offset) {
346 768048 const ptrdiff_t valid_w =
347 384024 static_cast<ptrdiff_t>(starting_width + patch_offset - kMargin);
348 384024 auto outgoing_pixel = src_rows.at(valid_old_h, valid_w)[c];
349 384024 auto incoming_pixel = src_rows.at(valid_new_h, valid_w)[c];
350 768048 scalar_update_patch_histogram(outgoing_pixel, incoming_pixel, c,
351 384024 total_patch_span, patch_offset);
352 384024 }
353 8163 }
354
355 16326 void update_patch_histogram_with_horizontal_borders(
356 Rows<const uint8_t> src_rows, ptrdiff_t c, ptrdiff_t valid_old_h,
357 ptrdiff_t valid_new_h, size_t total_patch_span, size_t starting_width,
358 size_t kMargin, size_t width, FixedBorderType border_type) {
359
4/4
✓ Branch 0 taken 4374 times.
✓ Branch 1 taken 164592 times.
✓ Branch 2 taken 11952 times.
✓ Branch 3 taken 465696 times.
646614 for (size_t patch_offset = 0; patch_offset < total_patch_span;
360 630288 ++patch_offset) {
361 1260576 const ptrdiff_t valid_w = get_physical_index(
362 630288 starting_width + patch_offset - kMargin, width, border_type);
363 630288 auto outgoing_pixel = src_rows.at(valid_old_h, valid_w)[c];
364 630288 auto incoming_pixel = src_rows.at(valid_new_h, valid_w)[c];
365 1260576 scalar_update_patch_histogram(outgoing_pixel, incoming_pixel, c,
366 630288 total_patch_span, patch_offset);
367 630288 }
368 16326 }
369
370 // `H` and `luc` are used for computing the median value for a single
371 // output pixel:
372 // - `H` is a histogram structure holding both the coarse and fine
373 // bins needed
374 // for the current element's median calculation.
375 // - `luc` is a lookup table that stores the last processed offset
376 // (index)
377 // for each coarse bin. This allows incremental fine histogram
378 // updates instead of full recalculation when histogram overlap is
379 // high. Since neighboring patches in natural images often have
380 // similar pixel values, reusing previous histogram state can
381 // significantly reduce processing time.
382 typedef struct {
383 uint16_t coarse[16];
384 uint16_t fine[16][16];
385 } Histogram;
386
387 Histogram H;
388 uint16_t luc[16];
389
390 25092 void clear_lookup_table(void) {
391 25092 std::memset(&H, 0, sizeof(Histogram));
392 25092 std::memset(luc, 0, 16 * sizeof(uint16_t));
393 25092 }
394
395 25092 void initialize_coarse(uint16_t* px, uint16x8_t& v_coarsel,
396 uint16x8_t& v_coarseh, size_t kMargin) {
397
4/4
✓ Branch 0 taken 6732 times.
✓ Branch 1 taken 168624 times.
✓ Branch 2 taken 18360 times.
✓ Branch 3 taken 476496 times.
670212 for (size_t i = 0; i < 2 * kMargin; ++i, px += 16) {
398 645120 v_coarsel = vaddq_u16(v_coarsel, vld1q_u16(px));
399 645120 v_coarseh = vaddq_u16(v_coarseh, vld1q_u16(px + 8));
400 645120 }
401 25092 }
402
403 497760 void increment_coarse(uint16_t* px, uint16x8_t& v_coarsel,
404 uint16x8_t& v_coarseh) {
405 497760 v_coarsel = vaddq_u16(v_coarsel, vld1q_u16(px));
406 497760 v_coarseh = vaddq_u16(v_coarseh, vld1q_u16(px + 8));
407 497760 vst1q_u16(H.coarse, v_coarsel);
408 497760 vst1q_u16(H.coarse + 8, v_coarseh);
409 497760 }
410
411 497760 void decrement_coarse(uint16_t* px, uint16x8_t& v_coarsel,
412 uint16x8_t& v_coarseh) {
413 497760 v_coarsel = vsubq_u16(v_coarsel, vld1q_u16(px));
414 497760 v_coarseh = vsubq_u16(v_coarseh, vld1q_u16(px + 8));
415 497760 }
416
417 46404 void initialize_fine(uint16_t* px, uint16x8_t& v_finel, uint16x8_t& v_fineh,
418 uint16_t& luc, size_t patch_offset, size_t kMargin,
419 size_t total_patch_span) {
420
4/4
✓ Branch 0 taken 316942 times.
✓ Branch 1 taken 12674 times.
✓ Branch 2 taken 868610 times.
✓ Branch 3 taken 33730 times.
1231956 for (luc = static_cast<uint16_t>(patch_offset - kMargin);
421 2463912 luc < static_cast<uint16_t>(
422 1231956 std::min(patch_offset + kMargin + 1, total_patch_span));
423 1185552 ++luc, px += 16) {
424 1185552 v_finel = vaddq_u16(v_finel, vld1q_u16(px));
425 1185552 v_fineh = vaddq_u16(v_fineh, vld1q_u16(px + 8));
426 1185552 }
427 46404 }
428
429 451356 void update_fine(uint16_t* px, uint16x8_t& v_finel, uint16x8_t& v_fineh,
430 uint16_t& luc, size_t patch_offset, size_t kMargin,
431 size_t total_patch_span) {
432
4/4
✓ Branch 0 taken 139342 times.
✓ Branch 1 taken 117886 times.
✓ Branch 2 taken 398789 times.
✓ Branch 3 taken 333470 times.
989487 for (; luc < static_cast<uint16_t>(patch_offset + kMargin + 1); ++luc) {
433 538131 constexpr ptrdiff_t stride = 16;
434 1076262 const ptrdiff_t patch_span_limit =
435 538131 static_cast<ptrdiff_t>(total_patch_span - 1);
436 538131 const ptrdiff_t safe_luc = static_cast<ptrdiff_t>(luc);
437
438 1076262 const ptrdiff_t base_offset =
439 538131 stride * std::min(safe_luc, patch_span_limit);
440 1076262 const ptrdiff_t old_offset =
441 1076262 stride * std::max(safe_luc - static_cast<ptrdiff_t>(2 * kMargin + 1),
442 538131 ptrdiff_t{0});
443 538131 const uint16x8_t new_vecl = vld1q_u16(px + base_offset);
444 538131 const uint16x8_t new_vech = vld1q_u16(px + base_offset + 8);
445 538131 const uint16x8_t old_vecl = vld1q_u16(px + old_offset);
446 538131 const uint16x8_t old_vech = vld1q_u16(px + old_offset + 8);
447 538131 v_finel = vsubq_u16(vaddq_u16(v_finel, new_vecl), old_vecl);
448 538131 v_fineh = vsubq_u16(vaddq_u16(v_fineh, new_vech), old_vech);
449 538131 }
450 451356 }
451
452 497760 size_t find_coarse_index(size_t& cdf, size_t median_index) {
453 497760 size_t coarse_index = 0;
454 4205933 while (true) {
455 4205933 cdf += H.coarse[coarse_index];
456
4/4
✓ Branch 0 taken 130560 times.
✓ Branch 1 taken 974526 times.
✓ Branch 2 taken 367200 times.
✓ Branch 3 taken 2733647 times.
4205933 if (cdf > median_index) {
457 497760 cdf -= H.coarse[coarse_index];
458 497760 break;
459 }
460 3708173 coarse_index++;
461 }
462 995520 return coarse_index;
463 497760 }
464
465 497760 uint8_t find_median(size_t& cdf, size_t median_index, size_t coarse_index) {
466 497760 uint16_t* segment = H.fine[coarse_index];
467 497760 size_t fine_index = 0;
468 4340272 while (true) {
469 4340272 cdf += segment[fine_index];
470
4/4
✓ Branch 0 taken 130560 times.
✓ Branch 1 taken 979159 times.
✓ Branch 2 taken 367200 times.
✓ Branch 3 taken 2863353 times.
4340272 if (cdf > median_index) {
471 497760 fine_index = (16 * coarse_index + fine_index);
472 497760 break;
473 }
474 3842512 fine_index++;
475 }
476 995520 return uint8_t(fine_index);
477 497760 }
478
479 25092 void compute_patch_median_from_histogram(size_t h, ptrdiff_t c,
480 size_t total_patch_span,
481 size_t kMargin, size_t ksize,
482 Rows<uint8_t> dst) {
483 25092 uint16x8_t v_coarsel = vld1q_u16(H.coarse);
484 25092 uint16x8_t v_coarseh = vld1q_u16(H.coarse + 8);
485
486 // Before starting the main patch loop to compute medians for each element,
487 // we initialize the coarse histogram buffer with the first (ksize - 1).
488 // This allows each subsequent iteration in the patch loop to perform only
489 // one addition and one subtraction.
490 25092 uint16_t* px = patched_coarse + 16 * total_patch_span * c;
491 25092 initialize_coarse(px, v_coarsel, v_coarseh, kMargin);
492
493
4/4
✓ Branch 0 taken 6732 times.
✓ Branch 1 taken 130560 times.
✓ Branch 2 taken 18360 times.
✓ Branch 3 taken 367200 times.
522852 for (size_t patch_offset = kMargin;
494 522852 patch_offset < total_patch_span - kMargin; patch_offset++) {
495 497760 size_t median_index = (ksize * ksize) / 2, cdf = 0;
496
497 995520 px = patched_coarse +
498 995520 16 * (total_patch_span * c +
499 497760 std::min(patch_offset + kMargin, total_patch_span - 1));
500 497760 increment_coarse(px, v_coarsel, v_coarseh);
501
502 // Find median at coarse level
503 497760 size_t coarse_index = find_coarse_index(cdf, median_index);
504
505 497760 uint16x8_t v_finel;
506 497760 uint16x8_t v_fineh;
507 // Check whether the fine histogram (H.fine[coarse_index]) for the
508 // current patch position needs to be freshly initialized or can be
509 // incrementally updated. This decision hinges on the `luc` (Last Used
510 // Coordinate) table, which records the last horizontal patch offset
511 // processed for each coarse bin.
512 //
513 // The condition is true in two scenarios:
514 //
515 // 1. **First-Time Initialization**:
516 // - This is the first time we are accessing this coarse bin
517 // (`coarse_index`) at the current patch position.
518 // - We compute the full fine histogram from scratch by summing
519 // over `ksize` rows centered at the patch position.
520 // - We accumulate the results into the `v_finel` and `v_fineh`
521 // vector registers.
522 // - These vectors are then stored into `H.fine[coarse_index]`.
523 // - The `luc` table is updated to reflect the last index
524 // processed.
525 //
526 // 2. **Window Movement Causes Loss of Overlap**:
527 // - The sliding window has moved enough that it no longer
528 // sufficiently overlaps the region
529 // used to compute the previously cached fine histogram (i.e.,
530 // `luc[coarse_index]` is too far behind).
531 // - We must reinitialize the fine histogram to ensure accuracy.
532 //
533 // Otherwise:
534 //
535 // - We reuse the previously computed fine histogram stored in
536 // `H.fine[coarse_index]`.
537 // - We only update it incrementally using the `update_fine()`
538 // function, which:
539 // - Adds the new values entering the window.
540 // - Subtracts the values leaving the window.
541 // - This avoids the need for a full re-scan, leveraging temporal
542 // locality between neighboring pixels.
543 //
544 // This lookup-based optimization significantly improves performance,
545 // as neighboring filter windows often overlap heavily—especially for
546 // small strides and moderate kernel sizes.
547 //
548 // After this step, the fine histogram is ready. The next phase scans
549 // `H.fine[coarse_index]` to identify the fine bin where the
550 // cumulative sum crosses the median threshold. This gives us the
551 // final median value for the output pixel.
552
4/4
✓ Branch 0 taken 12674 times.
✓ Branch 1 taken 117886 times.
✓ Branch 2 taken 33730 times.
✓ Branch 3 taken 333470 times.
497760 if (luc[coarse_index] <= patch_offset - kMargin) {
553 46404 v_finel = vdupq_n_u16(0);
554 46404 v_fineh = vdupq_n_u16(0);
555 92808 px = patched_fine + static_cast<ptrdiff_t>(16) *
556 92808 (static_cast<ptrdiff_t>(total_patch_span) *
557 92808 (16 * c + coarse_index) +
558 92808 patch_offset - kMargin);
559
560 92808 initialize_fine(px, v_finel, v_fineh, luc[coarse_index], patch_offset,
561 46404 kMargin, total_patch_span);
562
563 46404 } else {
564 451356 v_finel = vld1q_u16(H.fine[coarse_index]);
565 451356 v_fineh = vld1q_u16(H.fine[coarse_index] + 8);
566 451356 px = patched_fine + 16 * total_patch_span * (16 * c + coarse_index);
567 902712 update_fine(px, v_finel, v_fineh, luc[coarse_index], patch_offset,
568 451356 kMargin, total_patch_span);
569 }
570
571 995520 px = patched_coarse +
572 497760 static_cast<ptrdiff_t>(16) *
573 995520 (total_patch_span * c +
574 497760 std::max(patch_offset - kMargin, static_cast<size_t>(0)));
575
576 497760 vst1q_u16(H.fine[coarse_index], v_finel);
577 497760 vst1q_u16(H.fine[coarse_index] + 8, v_fineh);
578
579 497760 decrement_coarse(px, v_coarsel, v_coarseh);
580
581 // Find median at fine level
582 1991040 dst.at(static_cast<ptrdiff_t>(h),
583 1493280 static_cast<ptrdiff_t>(patch_offset - kMargin))[c] =
584 497760 find_median(cdf, median_index, coarse_index);
585 497760 }
586 25092 }
587 };
588
589 template <bool is_single_channel>
590 105 void median_process(Rectangle image_dimensions, Rows<const uint8_t> src_rows,
591 Rows<uint8_t> dst_rows, size_t y_begin, size_t y_end,
592 size_t kernel_width, size_t kernel_height,
593 FixedBorderType border_type) {
594 105 size_t kMargin = (kernel_width - 1) / 2;
595 210 MedianBlurLargeHist<is_single_channel> median_filter{src_rows.channels(),
596 105 kMargin};
597
598 // Process left border
599 105 size_t starting_width = 0;
600 105 size_t processing_left_width = kMargin;
601 105 Point starting_left_coordinates{starting_width, y_begin};
602 105 Point ending_left_coordinates{starting_width + processing_left_width, y_end};
603 105 median_filter.process_pixels_with_horizontal_borders(
604 105 image_dimensions, starting_left_coordinates, ending_left_coordinates,
605 105 src_rows, dst_rows, kernel_height, border_type);
606
607 // Process center region
608 105 starting_width = processing_left_width;
609 105 size_t processing_center_width = image_dimensions.width() - 2 * kMargin;
610 105 Point starting_center_coordinates{starting_width, y_begin};
611 210 Point ending_center_coordinates{starting_width + processing_center_width,
612 105 y_end};
613 105 median_filter.process_pixels_without_horizontal_borders(
614 105 image_dimensions, starting_center_coordinates, ending_center_coordinates,
615 105 src_rows, dst_rows, kernel_height, border_type);
616
617 // Process right border
618 105 starting_width = processing_left_width + processing_center_width;
619 315 size_t processing_right_width = image_dimensions.width() -
620 210 processing_left_width -
621 105 processing_center_width;
622 105 Point starting_right_coordinates{starting_width, y_begin};
623 210 Point ending_right_coordinates{starting_width + processing_right_width,
624 105 y_end};
625 105 median_filter.process_pixels_with_horizontal_borders(
626 105 image_dimensions, starting_right_coordinates, ending_right_coordinates,
627 105 src_rows, dst_rows, kernel_height, border_type);
628 105 }
629
630 105 KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t median_blur_large_hist_stripe_u8(
631 const uint8_t* src, size_t src_stride, uint8_t* dst, size_t dst_stride,
632 size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
633 size_t kernel_width, size_t kernel_height, FixedBorderType border_type) {
634 105 Rectangle image_dimensions{width, height};
635 105 Rows<const uint8_t> src_rows{src, src_stride, channels};
636 105 Rows<uint8_t> dst_rows{dst, dst_stride, channels};
637
638
2/2
✓ Branch 0 taken 57 times.
✓ Branch 1 taken 48 times.
105 if (channels == 1) {
639 114 median_process<true>(image_dimensions, src_rows, dst_rows, y_begin, y_end,
640 57 kernel_width, kernel_height, border_type);
641 57 } else {
642 96 median_process<false>(image_dimensions, src_rows, dst_rows, y_begin, y_end,
643 48 kernel_width, kernel_height, border_type);
644 }
645
646 105 return KLEIDICV_OK;
647 105 }
648
649 } // namespace kleidicv::neon
650