KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/median_blur_large_hist_neon.cpp
Date: 2025-11-25 17:23:32
Exec Total Coverage
Lines: 383 383 100.0%
Functions: 45 45 100.0%
Branches: 94 94 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include "kleidicv/ctypes.h"
6 #include "kleidicv/filters/median_blur.h"
7 #include "kleidicv/kleidicv.h"
8 #include "kleidicv/neon.h"
9 #include "median_blur_border_handling.h"
10
11 namespace kleidicv::neon {
12
13 template <bool is_single_channel>
14 class MedianBlurLargeHist {
15 public:
16 140 MedianBlurLargeHist(size_t channels, size_t kMargin)
17 280 : patched_coarse{static_cast<uint16_t*>(
18 140 malloc((16 + 256) * (patch_length + 2 * kMargin) * channels *
19 sizeof(uint16_t)))},
20 280 patched_fine{
21 140 &patched_coarse[16 * (patch_length + 2 * kMargin) * channels]},
22 140 H{},
23 140 luc{} {}
24
25 140 ~MedianBlurLargeHist() { free(patched_coarse); }
26
27 140 void process_pixels_without_horizontal_borders(
28 Rectangle image_dimensions, Point starting_coordinates,
29 Point ending_coordinates, Rows<const uint8_t> src_rows,
30 Rows<uint8_t> dst_rows, size_t ksize, FixedBorderType border_type) {
31 140 const size_t kMargin = ksize / 2;
32
33
4/4
✓ Branch 0 taken 76 times.
✓ Branch 1 taken 76 times.
✓ Branch 2 taken 64 times.
✓ Branch 3 taken 64 times.
280 for (size_t w = starting_coordinates.x(); w < ending_coordinates.x();
34 140 w += patch_length) {
35 280 const size_t total_patch_span =
36 140 std::min(ending_coordinates.x() - w, patch_length) + kMargin * 2;
37
38 280 Rows<uint8_t> shifted_dst{
39 140 &dst_rows[0] + static_cast<ptrdiff_t>(w) * dst_rows.channels(),
40 140 dst_rows.stride(), dst_rows.channels()};
41
42 140 clear_patched_histogram(total_patch_span * src_rows.channels());
43
44
4/4
✓ Branch 0 taken 76 times.
✓ Branch 1 taken 76 times.
✓ Branch 2 taken 192 times.
✓ Branch 3 taken 64 times.
408 for (ptrdiff_t c = 0; c < static_cast<ptrdiff_t>(src_rows.channels());
45 268 ++c) {
46 268 clear_lookup_table();
47
48 // We initialize with ksize rows to allow merging of
49 // histogram increment and decrement operations in the main loop.
50 // This extra initial load enables a single update phase and avoids
51 // splitting the logic into separate steps.
52
4/4
✓ Branch 0 taken 1868 times.
✓ Branch 1 taken 76 times.
✓ Branch 2 taken 4992 times.
✓ Branch 3 taken 192 times.
7128 for (size_t r = 0; r < ksize; ++r) {
53 13720 const ptrdiff_t valid_h =
54 13720 get_physical_index(starting_coordinates.y() + r - kMargin,
55 6860 image_dimensions.height(), border_type);
56 6860 initialize_patched_histogram_without_horizontal_borders(
57 6860 src_rows, c, valid_h, total_patch_span, w, kMargin);
58 6860 }
59 536 compute_patch_median_from_histogram(starting_coordinates.y(), c,
60 268 total_patch_span, kMargin, ksize,
61 268 shifted_dst);
62 268 }
63
64
4/4
✓ Branch 0 taken 2916 times.
✓ Branch 1 taken 76 times.
✓ Branch 2 taken 2656 times.
✓ Branch 3 taken 64 times.
5712 for (size_t h = starting_coordinates.y() + 1; h < ending_coordinates.y();
65 5572 ++h) {
66 11144 const ptrdiff_t valid_old_h = get_physical_index(
67 5572 h - kMargin - 1, image_dimensions.height(), border_type);
68
69 11144 const ptrdiff_t valid_new_h = get_physical_index(
70 5572 h + kMargin, image_dimensions.height(), border_type);
71
72
4/4
✓ Branch 0 taken 2916 times.
✓ Branch 1 taken 2916 times.
✓ Branch 2 taken 7968 times.
✓ Branch 3 taken 2656 times.
16456 for (ptrdiff_t c = 0; c < static_cast<ptrdiff_t>(src_rows.channels());
73 10884 ++c) {
74 10884 clear_lookup_table();
75
76 10884 update_patch_histogram_without_horizontal_borders(
77 10884 src_rows, c, valid_old_h, valid_new_h, total_patch_span, w,
78 10884 kMargin);
79
80 21768 compute_patch_median_from_histogram(h, c, total_patch_span, kMargin,
81 10884 ksize, shifted_dst);
82 10884 }
83 5572 }
84 140 }
85 140 }
86
87 280 void process_pixels_with_horizontal_borders(
88 Rectangle image_dimensions, Point starting_coordinates,
89 Point ending_coordinates, Rows<const uint8_t> src_rows,
90 Rows<uint8_t> dst_rows, size_t ksize, FixedBorderType border_type) {
91 280 const size_t kMargin = ksize / 2;
92
93
4/4
✓ Branch 0 taken 152 times.
✓ Branch 1 taken 152 times.
✓ Branch 2 taken 128 times.
✓ Branch 3 taken 128 times.
560 for (size_t w = starting_coordinates.x(); w < ending_coordinates.x();
94 280 w += patch_length) {
95 560 const size_t total_patch_span =
96 280 std::min(ending_coordinates.x() - w, patch_length) + kMargin * 2;
97 560 Rows<uint8_t> shifted_dst{
98 280 &dst_rows[0] + static_cast<ptrdiff_t>(w) * dst_rows.channels(),
99 280 dst_rows.stride(), dst_rows.channels()};
100
101 280 clear_patched_histogram(total_patch_span * src_rows.channels());
102
103
4/4
✓ Branch 0 taken 152 times.
✓ Branch 1 taken 152 times.
✓ Branch 2 taken 384 times.
✓ Branch 3 taken 128 times.
816 for (ptrdiff_t c = 0; c < static_cast<ptrdiff_t>(src_rows.channels());
104 536 ++c) {
105 536 clear_lookup_table();
106 // We initialize with ksize rows to allow merging of
107 // histogram increment and decrement operations in the main loop.
108 // This extra initial load enables a single update phase and avoids
109 // splitting the logic into separate steps.
110
4/4
✓ Branch 0 taken 3736 times.
✓ Branch 1 taken 152 times.
✓ Branch 2 taken 9984 times.
✓ Branch 3 taken 384 times.
14256 for (size_t r = 0; r < ksize; ++r) {
111 27440 const ptrdiff_t valid_h =
112 27440 get_physical_index(starting_coordinates.y() + r - kMargin,
113 13720 image_dimensions.height(), border_type);
114 13720 initialize_patched_histogram_with_horizontal_borders(
115 13720 src_rows, c, valid_h, total_patch_span, w, kMargin,
116 13720 image_dimensions.width(), border_type);
117 13720 }
118 1072 compute_patch_median_from_histogram(starting_coordinates.y(), c,
119 536 total_patch_span, kMargin, ksize,
120 536 shifted_dst);
121 536 }
122
123
4/4
✓ Branch 0 taken 5832 times.
✓ Branch 1 taken 152 times.
✓ Branch 2 taken 5312 times.
✓ Branch 3 taken 128 times.
11424 for (size_t h = starting_coordinates.y() + 1; h < ending_coordinates.y();
124 11144 ++h) {
125 22288 const ptrdiff_t valid_old_h = get_physical_index(
126 11144 h - kMargin - 1, image_dimensions.height(), border_type);
127 22288 const ptrdiff_t valid_new_h = get_physical_index(
128 11144 h + kMargin, image_dimensions.height(), border_type);
129
4/4
✓ Branch 0 taken 5832 times.
✓ Branch 1 taken 5832 times.
✓ Branch 2 taken 15936 times.
✓ Branch 3 taken 5312 times.
32912 for (ptrdiff_t c = 0; c < static_cast<ptrdiff_t>(src_rows.channels());
130 21768 ++c) {
131 21768 clear_lookup_table();
132
133 21768 update_patch_histogram_with_horizontal_borders(
134 21768 src_rows, c, valid_old_h, valid_new_h, total_patch_span, w,
135 21768 kMargin, image_dimensions.width(), border_type);
136
137 43536 compute_patch_median_from_histogram(h, c, total_patch_span, kMargin,
138 21768 ksize, shifted_dst);
139 21768 }
140 11144 }
141 280 }
142 280 }
143
144 private:
145 // Histogram Buffer Layout Explanation
146 // -----------------------------------
147 // patched_coarse:
148 // - Conceptually a 3D array:
149 // coarse[channel_idx][patch_offset][coarse_bin]
150 // - coarse_bin = incoming_pixel >> 4 ∈ [0, 15]
151 // - Flattened as:
152 // coarse_offset = 16 * (patch_length * channel_idx + patch_offset)
153 // + (incoming_pixel >> 4);
154 //
155 // patched_fine:
156 // - Conceptually a 4D array:
157 // fine[channel_idx][coarse_bin][patch_offset][fine_bin]
158 // - coarse_bin = incoming_pixel >> 4 ∈ [0, 15]
159 // - fine_bin = incoming_pixel & 0xF ∈ [0, 15]
160 // - Flattened as:
161 // fine_offset = 16 * (patch_length * (16 * channel_idx + coarse_bin)
162 // + patch_offset) + fine_bin;
163 //
164 // This layout enables fast linear access while preserving the hierarchical
165 // structure of histograms per channel and patch position.
166 constexpr static size_t patch_length = 512;
167 uint16_t* patched_coarse;
168 uint16_t* patched_fine;
169
170 // Clear only the portion of the patched histogram buffers that will be used
171 // for the current patch.
172 // Since these buffers are large, there's no need to zero out the entire
173 // allocation— only the section relevant to the current total patch size is
174 // cleared for efficiency.
175 420 void clear_patched_histogram(size_t total_patch_size) {
176 840 std::memset(patched_coarse, 0,
177 420 16 * total_patch_size * sizeof(patched_coarse[0]));
178 840 std::memset(patched_fine, 0,
179 420 256 * total_patch_size * sizeof(patched_fine[0]));
180 420 }
181
182 892320 void scalar_initialize_patched_histogram(int incoming_pixel,
183 size_t channel_idx,
184 size_t patch_length,
185 size_t patch_offset) {
186 1784640 const size_t coarse_offset =
187 1784640 16 * (patch_length * channel_idx + patch_offset) +
188 892320 (incoming_pixel >> 4);
189 1784640 const size_t fine_offset =
190 1784640 16 * (patch_length * (16 * channel_idx + (incoming_pixel >> 4)) +
191 1784640 patch_offset) +
192 892320 (incoming_pixel & 0xF);
193 892320 patched_coarse[coarse_offset]++;
194 892320 patched_fine[fine_offset]++;
195 892320 }
196
197 5400 void vector_initialize_patched_histogram(uint8x16_t& incoming_pixels,
198 size_t channel_idx,
199 size_t patch_length,
200 size_t patch_offset) {
201 KLEIDICV_FORCE_LOOP_UNROLL
202
2/2
✓ Branch 0 taken 5400 times.
✓ Branch 1 taken 86400 times.
91800 for (int i = 0; i < 16; i++) {
203 172800 const size_t coarse_offset_incoming =
204 172800 16 * (patch_length * channel_idx + patch_offset + i) +
205 86400 (incoming_pixels[i] >> 4);
206
207 172800 const size_t fine_offset_incoming =
208 259200 16 * (patch_length * (16 * channel_idx + (incoming_pixels[i] >> 4)) +
209 259200 patch_offset + i) +
210 86400 (incoming_pixels[i] & 0xF);
211
212 86400 patched_coarse[coarse_offset_incoming]++;
213 86400 patched_fine[fine_offset_incoming]++;
214 86400 }
215 5400 }
216
217 6860 void initialize_patched_histogram_without_horizontal_borders(
218 Rows<const uint8_t> src_rows, ptrdiff_t c, ptrdiff_t valid_h,
219 size_t total_patch_span, size_t starting_width, size_t kMargin) {
220 6860 size_t vector_part = 0;
221 if constexpr (is_single_channel) {
222 1868 vector_part = (total_patch_span >> 4) << 4;
223
2/2
✓ Branch 0 taken 5400 times.
✓ Branch 1 taken 1868 times.
7268 for (size_t patch_offset = 0; patch_offset < vector_part;
224 5400 patch_offset += 16) {
225 10800 const ptrdiff_t valid_w =
226 5400 static_cast<ptrdiff_t>(starting_width + patch_offset - kMargin);
227 5400 auto incoming_pixels = vld1q_u8(&src_rows.at(valid_h, valid_w)[c]);
228 10800 vector_initialize_patched_histogram(incoming_pixels, c,
229 5400 total_patch_span, patch_offset);
230 5400 }
231 }
232
233
4/4
✓ Branch 0 taken 1868 times.
✓ Branch 1 taken 21600 times.
✓ Branch 2 taken 4992 times.
✓ Branch 3 taken 299520 times.
327980 for (size_t patch_offset = vector_part; patch_offset < total_patch_span;
234 321120 ++patch_offset) {
235 642240 const ptrdiff_t valid_w =
236 321120 static_cast<ptrdiff_t>(starting_width + patch_offset - kMargin);
237 321120 auto incoming_pixel = src_rows.at(valid_h, valid_w)[c];
238
239 642240 scalar_initialize_patched_histogram(incoming_pixel, c, total_patch_span,
240 321120 patch_offset);
241 321120 }
242 6860 }
243
244 13720 void initialize_patched_histogram_with_horizontal_borders(
245 Rows<const uint8_t> src_rows, ptrdiff_t c, ptrdiff_t valid_h,
246 size_t total_patch_span, size_t starting_width, size_t kMargin,
247 size_t width, FixedBorderType border_type) {
248
4/4
✓ Branch 0 taken 3736 times.
✓ Branch 1 taken 150144 times.
✓ Branch 2 taken 9984 times.
✓ Branch 3 taken 421056 times.
584920 for (size_t patch_offset = 0; patch_offset < total_patch_span;
249 571200 ++patch_offset) {
250 1142400 ptrdiff_t valid_w = get_physical_index(
251 571200 starting_width + patch_offset - kMargin, width, border_type);
252
253 571200 auto incoming_pixel = src_rows.at(valid_h, valid_w)[c];
254
255 1142400 scalar_initialize_patched_histogram(incoming_pixel, c, total_patch_span,
256 571200 patch_offset);
257 571200 }
258 13720 }
259
260 // During vertical traversal (the main 'height' loop), each sliding window
261 // iteration introduces a new incoming row and removes an outgoing one. The
262 // histogram must be updated accordingly by subtracting the contributions of
263 // the outgoing row and adding those of the incoming row.
264 // Both increment and decrement operations are handled inside the same
265 // function.
266 1352416 void scalar_update_patch_histogram(int outgoing_pixel, int incoming_pixel,
267 size_t channel_idx, size_t patch_length,
268 size_t patch_offset) {
269 2704832 const size_t coarse_offset_base =
270 1352416 16 * (patch_length * channel_idx + patch_offset);
271 2704832 const size_t fine_offset_base =
272 1352416 16 * patch_length * 16 * channel_idx + 16 * patch_offset;
273
274 1352416 const size_t pixel_new_shift_right_4 = (incoming_pixel >> 4);
275 1352416 const size_t mask_new_pixel = (incoming_pixel & 0xF);
276 1352416 const size_t pixel_old_shift_right_4 = (outgoing_pixel >> 4);
277 1352416 const size_t mask_old_pixel = (outgoing_pixel & 0xF);
278
279 2704832 const size_t fine_new_offset = fine_offset_base + mask_new_pixel +
280 1352416 16 * patch_length * pixel_new_shift_right_4;
281 2704832 const size_t coarse_new_offset =
282 1352416 coarse_offset_base + pixel_new_shift_right_4;
283
284 2704832 const size_t fine_old_offset = fine_offset_base + mask_old_pixel +
285 1352416 16 * patch_length * pixel_old_shift_right_4;
286 2704832 const size_t coarse_old_offset =
287 1352416 coarse_offset_base + pixel_old_shift_right_4;
288
289 1352416 patched_coarse[coarse_new_offset]++;
290 1352416 patched_coarse[coarse_old_offset]--;
291 1352416 patched_fine[fine_new_offset]++;
292 1352416 patched_fine[fine_old_offset]--;
293 1352416 }
294
295 8488 void vector_update_patch_histogram(uint8x16_t& outgoing_pixels,
296 uint8x16_t& incoming_pixels,
297 size_t channel_idx, size_t patch_length,
298 size_t patch_offset) {
299 KLEIDICV_FORCE_LOOP_UNROLL
300
2/2
✓ Branch 0 taken 8488 times.
✓ Branch 1 taken 135808 times.
144296 for (int i = 0; i < 16; i++) {
301 271616 const size_t coarse_offset_incoming =
302 271616 16 * (patch_length * channel_idx + patch_offset + i) +
303 135808 (incoming_pixels[i] >> 4);
304
305 271616 const size_t coarse_offset_outgoing =
306 271616 16 * (patch_length * channel_idx + patch_offset + i) +
307 135808 (outgoing_pixels[i] >> 4);
308
309 271616 const size_t fine_offset_incoming =
310 407424 16 * (patch_length * (16 * channel_idx + (incoming_pixels[i] >> 4)) +
311 407424 patch_offset + i) +
312 135808 (incoming_pixels[i] & 0xF);
313
314 271616 const size_t fine_offset_outgoing =
315 407424 16 * (patch_length * (16 * channel_idx + (outgoing_pixels[i] >> 4)) +
316 407424 patch_offset + i) +
317 135808 (outgoing_pixels[i] & 0xF);
318
319 135808 patched_coarse[coarse_offset_incoming]++;
320 135808 patched_coarse[coarse_offset_outgoing]--;
321 135808 patched_fine[fine_offset_incoming]++;
322 135808 patched_fine[fine_offset_outgoing]--;
323 135808 }
324 8488 }
325
326 10884 void update_patch_histogram_without_horizontal_borders(
327 Rows<const uint8_t> src_rows, ptrdiff_t c, ptrdiff_t valid_old_h,
328 ptrdiff_t valid_new_h, size_t total_patch_span, size_t starting_width,
329 size_t kMargin) {
330 10884 size_t vector_part = 0;
331 if constexpr (is_single_channel) {
332 2916 vector_part = (total_patch_span >> 4) << 4;
333
2/2
✓ Branch 0 taken 8488 times.
✓ Branch 1 taken 2916 times.
11404 for (size_t patch_offset = 0; patch_offset < vector_part;
334 8488 patch_offset += 16) {
335 16976 const ptrdiff_t valid_w =
336 8488 static_cast<ptrdiff_t>(starting_width + patch_offset - kMargin);
337 8488 auto outgoing_pixels = vld1q_u8(&src_rows.at(valid_old_h, valid_w)[c]);
338 8488 auto incoming_pixels = vld1q_u8(&src_rows.at(valid_new_h, valid_w)[c]);
339 16976 vector_update_patch_histogram(outgoing_pixels, incoming_pixels, c,
340 8488 total_patch_span, patch_offset);
341 8488 }
342 }
343
344
4/4
✓ Branch 0 taken 2916 times.
✓ Branch 1 taken 33952 times.
✓ Branch 2 taken 7968 times.
✓ Branch 3 taken 478080 times.
522916 for (size_t patch_offset = vector_part; patch_offset < total_patch_span;
345 512032 ++patch_offset) {
346 1024064 const ptrdiff_t valid_w =
347 512032 static_cast<ptrdiff_t>(starting_width + patch_offset - kMargin);
348 512032 auto outgoing_pixel = src_rows.at(valid_old_h, valid_w)[c];
349 512032 auto incoming_pixel = src_rows.at(valid_new_h, valid_w)[c];
350 1024064 scalar_update_patch_histogram(outgoing_pixel, incoming_pixel, c,
351 512032 total_patch_span, patch_offset);
352 512032 }
353 10884 }
354
355 21768 void update_patch_histogram_with_horizontal_borders(
356 Rows<const uint8_t> src_rows, ptrdiff_t c, ptrdiff_t valid_old_h,
357 ptrdiff_t valid_new_h, size_t total_patch_span, size_t starting_width,
358 size_t kMargin, size_t width, FixedBorderType border_type) {
359
4/4
✓ Branch 0 taken 5832 times.
✓ Branch 1 taken 219456 times.
✓ Branch 2 taken 15936 times.
✓ Branch 3 taken 620928 times.
862152 for (size_t patch_offset = 0; patch_offset < total_patch_span;
360 840384 ++patch_offset) {
361 1680768 const ptrdiff_t valid_w = get_physical_index(
362 840384 starting_width + patch_offset - kMargin, width, border_type);
363 840384 auto outgoing_pixel = src_rows.at(valid_old_h, valid_w)[c];
364 840384 auto incoming_pixel = src_rows.at(valid_new_h, valid_w)[c];
365 1680768 scalar_update_patch_histogram(outgoing_pixel, incoming_pixel, c,
366 840384 total_patch_span, patch_offset);
367 840384 }
368 21768 }
369
370 // `H` and `luc` are used for computing the median value for a single
371 // output pixel:
372 // - `H` is a histogram structure holding both the coarse and fine
373 // bins needed
374 // for the current element's median calculation.
375 // - `luc` is a lookup table that stores the last processed offset
376 // (index)
377 // for each coarse bin. This allows incremental fine histogram
378 // updates instead of full recalculation when histogram overlap is
379 // high. Since neighboring patches in natural images often have
380 // similar pixel values, reusing previous histogram state can
381 // significantly reduce processing time.
382 typedef struct {
383 uint16_t coarse[16];
384 uint16_t fine[16][16];
385 } Histogram;
386
387 Histogram H;
388 uint16_t luc[16];
389
390 33456 void clear_lookup_table(void) {
391 33456 std::memset(&H, 0, sizeof(Histogram));
392 33456 std::memset(luc, 0, 16 * sizeof(uint16_t));
393 33456 }
394
395 33456 void initialize_coarse(uint16_t* px, uint16x8_t& v_coarsel,
396 uint16x8_t& v_coarseh, size_t kMargin) {
397
4/4
✓ Branch 0 taken 8976 times.
✓ Branch 1 taken 224832 times.
✓ Branch 2 taken 24480 times.
✓ Branch 3 taken 635328 times.
893616 for (size_t i = 0; i < 2 * kMargin; ++i, px += 16) {
398 860160 v_coarsel = vaddq_u16(v_coarsel, vld1q_u16(px));
399 860160 v_coarseh = vaddq_u16(v_coarseh, vld1q_u16(px + 8));
400 860160 }
401 33456 }
402
403 663680 void increment_coarse(uint16_t* px, uint16x8_t& v_coarsel,
404 uint16x8_t& v_coarseh) {
405 663680 v_coarsel = vaddq_u16(v_coarsel, vld1q_u16(px));
406 663680 v_coarseh = vaddq_u16(v_coarseh, vld1q_u16(px + 8));
407 663680 vst1q_u16(H.coarse, v_coarsel);
408 663680 vst1q_u16(H.coarse + 8, v_coarseh);
409 663680 }
410
411 663680 void decrement_coarse(uint16_t* px, uint16x8_t& v_coarsel,
412 uint16x8_t& v_coarseh) {
413 663680 v_coarsel = vsubq_u16(v_coarsel, vld1q_u16(px));
414 663680 v_coarseh = vsubq_u16(v_coarseh, vld1q_u16(px + 8));
415 663680 }
416
417 60503 void initialize_fine(uint16_t* px, uint16x8_t& v_finel, uint16x8_t& v_fineh,
418 uint16_t& luc, size_t patch_offset, size_t kMargin,
419 size_t total_patch_span) {
420
4/4
✓ Branch 0 taken 414508 times.
✓ Branch 1 taken 16520 times.
✓ Branch 2 taken 1143243 times.
✓ Branch 3 taken 43983 times.
1618254 for (luc = static_cast<uint16_t>(patch_offset - kMargin);
421 3236508 luc < static_cast<uint16_t>(
422 1618254 std::min(patch_offset + kMargin + 1, total_patch_span));
423 1557751 ++luc, px += 16) {
424 1557751 v_finel = vaddq_u16(v_finel, vld1q_u16(px));
425 1557751 v_fineh = vaddq_u16(v_fineh, vld1q_u16(px + 8));
426 1557751 }
427 60503 }
428
429 603177 void update_fine(uint16_t* px, uint16x8_t& v_finel, uint16x8_t& v_fineh,
430 uint16_t& luc, size_t patch_offset, size_t kMargin,
431 size_t total_patch_span) {
432
4/4
✓ Branch 0 taken 190121 times.
✓ Branch 1 taken 157560 times.
✓ Branch 2 taken 535241 times.
✓ Branch 3 taken 445617 times.
1328539 for (; luc < static_cast<uint16_t>(patch_offset + kMargin + 1); ++luc) {
433 725362 constexpr ptrdiff_t stride = 16;
434 1450724 const ptrdiff_t patch_span_limit =
435 725362 static_cast<ptrdiff_t>(total_patch_span - 1);
436 725362 const ptrdiff_t safe_luc = static_cast<ptrdiff_t>(luc);
437
438 1450724 const ptrdiff_t base_offset =
439 725362 stride * std::min(safe_luc, patch_span_limit);
440 1450724 const ptrdiff_t old_offset =
441 1450724 stride * std::max(safe_luc - static_cast<ptrdiff_t>(2 * kMargin + 1),
442 725362 ptrdiff_t{0});
443 725362 const uint16x8_t new_vecl = vld1q_u16(px + base_offset);
444 725362 const uint16x8_t new_vech = vld1q_u16(px + base_offset + 8);
445 725362 const uint16x8_t old_vecl = vld1q_u16(px + old_offset);
446 725362 const uint16x8_t old_vech = vld1q_u16(px + old_offset + 8);
447 725362 v_finel = vsubq_u16(vaddq_u16(v_finel, new_vecl), old_vecl);
448 725362 v_fineh = vsubq_u16(vaddq_u16(v_fineh, new_vech), old_vech);
449 725362 }
450 603177 }
451
452 663680 size_t find_coarse_index(size_t& cdf, size_t median_index) {
453 663680 size_t coarse_index = 0;
454 5700229 while (true) {
455 5700229 cdf += H.coarse[coarse_index];
456
4/4
✓ Branch 0 taken 174080 times.
✓ Branch 1 taken 1320228 times.
✓ Branch 2 taken 489600 times.
✓ Branch 3 taken 3716321 times.
5700229 if (cdf > median_index) {
457 663680 cdf -= H.coarse[coarse_index];
458 663680 break;
459 }
460 5036549 coarse_index++;
461 }
462 1327360 return coarse_index;
463 663680 }
464
465 663680 uint8_t find_median(size_t& cdf, size_t median_index, size_t coarse_index) {
466 663680 uint16_t* segment = H.fine[coarse_index];
467 663680 size_t fine_index = 0;
468 5347359 while (true) {
469 5347359 cdf += segment[fine_index];
470
4/4
✓ Branch 0 taken 174080 times.
✓ Branch 1 taken 1233097 times.
✓ Branch 2 taken 489600 times.
✓ Branch 3 taken 3450582 times.
5347359 if (cdf > median_index) {
471 663680 fine_index = (16 * coarse_index + fine_index);
472 663680 break;
473 }
474 4683679 fine_index++;
475 }
476 1327360 return uint8_t(fine_index);
477 663680 }
478
479 33456 void compute_patch_median_from_histogram(size_t h, ptrdiff_t c,
480 size_t total_patch_span,
481 size_t kMargin, size_t ksize,
482 Rows<uint8_t> dst) {
483 33456 uint16x8_t v_coarsel = vld1q_u16(H.coarse);
484 33456 uint16x8_t v_coarseh = vld1q_u16(H.coarse + 8);
485
486 // Before starting the main patch loop to compute medians for each element,
487 // we initialize the coarse histogram buffer with the first (ksize - 1).
488 // This allows each subsequent iteration in the patch loop to perform only
489 // one addition and one subtraction.
490 33456 uint16_t* px = patched_coarse + 16 * total_patch_span * c;
491 33456 initialize_coarse(px, v_coarsel, v_coarseh, kMargin);
492
493
4/4
✓ Branch 0 taken 8976 times.
✓ Branch 1 taken 174080 times.
✓ Branch 2 taken 24480 times.
✓ Branch 3 taken 489600 times.
697136 for (size_t patch_offset = kMargin;
494 697136 patch_offset < total_patch_span - kMargin; patch_offset++) {
495 663680 size_t median_index = (ksize * ksize) / 2, cdf = 0;
496
497 1327360 px = patched_coarse +
498 1327360 16 * (total_patch_span * c +
499 663680 std::min(patch_offset + kMargin, total_patch_span - 1));
500 663680 increment_coarse(px, v_coarsel, v_coarseh);
501
502 // Find median at coarse level
503 663680 size_t coarse_index = find_coarse_index(cdf, median_index);
504
505 663680 uint16x8_t v_finel;
506 663680 uint16x8_t v_fineh;
507 // Check whether the fine histogram (H.fine[coarse_index]) for the
508 // current patch position needs to be freshly initialized or can be
509 // incrementally updated. This decision hinges on the `luc` (Last Used
510 // Coordinate) table, which records the last horizontal patch offset
511 // processed for each coarse bin.
512 //
513 // The condition is true in two scenarios:
514 //
515 // 1. **First-Time Initialization**:
516 // - This is the first time we are accessing this coarse bin
517 // (`coarse_index`) at the current patch position.
518 // - We compute the full fine histogram from scratch by summing
519 // over `ksize` rows centered at the patch position.
520 // - We accumulate the results into the `v_finel` and `v_fineh`
521 // vector registers.
522 // - These vectors are then stored into `H.fine[coarse_index]`.
523 // - The `luc` table is updated to reflect the last index
524 // processed.
525 //
526 // 2. **Window Movement Causes Loss of Overlap**:
527 // - The sliding window has moved enough that it no longer
528 // sufficiently overlaps the region
529 // used to compute the previously cached fine histogram (i.e.,
530 // `luc[coarse_index]` is too far behind).
531 // - We must reinitialize the fine histogram to ensure accuracy.
532 //
533 // Otherwise:
534 //
535 // - We reuse the previously computed fine histogram stored in
536 // `H.fine[coarse_index]`.
537 // - We only update it incrementally using the `update_fine()`
538 // function, which:
539 // - Adds the new values entering the window.
540 // - Subtracts the values leaving the window.
541 // - This avoids the need for a full re-scan, leveraging temporal
542 // locality between neighboring pixels.
543 //
544 // This lookup-based optimization significantly improves performance,
545 // as neighboring filter windows often overlap heavily—especially for
546 // small strides and moderate kernel sizes.
547 //
548 // After this step, the fine histogram is ready. The next phase scans
549 // `H.fine[coarse_index]` to identify the fine bin where the
550 // cumulative sum crosses the median threshold. This gives us the
551 // final median value for the output pixel.
552
4/4
✓ Branch 0 taken 16520 times.
✓ Branch 1 taken 157560 times.
✓ Branch 2 taken 43983 times.
✓ Branch 3 taken 445617 times.
663680 if (luc[coarse_index] <= patch_offset - kMargin) {
553 60503 v_finel = vdupq_n_u16(0);
554 60503 v_fineh = vdupq_n_u16(0);
555 121006 px = patched_fine + static_cast<ptrdiff_t>(16) *
556 121006 (static_cast<ptrdiff_t>(total_patch_span) *
557 121006 (16 * c + coarse_index) +
558 121006 patch_offset - kMargin);
559
560 121006 initialize_fine(px, v_finel, v_fineh, luc[coarse_index], patch_offset,
561 60503 kMargin, total_patch_span);
562
563 60503 } else {
564 603177 v_finel = vld1q_u16(H.fine[coarse_index]);
565 603177 v_fineh = vld1q_u16(H.fine[coarse_index] + 8);
566 603177 px = patched_fine + 16 * total_patch_span * (16 * c + coarse_index);
567 1206354 update_fine(px, v_finel, v_fineh, luc[coarse_index], patch_offset,
568 603177 kMargin, total_patch_span);
569 }
570
571 1327360 px = patched_coarse +
572 663680 static_cast<ptrdiff_t>(16) *
573 1327360 (total_patch_span * c +
574 663680 std::max(patch_offset - kMargin, static_cast<size_t>(0)));
575
576 663680 vst1q_u16(H.fine[coarse_index], v_finel);
577 663680 vst1q_u16(H.fine[coarse_index] + 8, v_fineh);
578
579 663680 decrement_coarse(px, v_coarsel, v_coarseh);
580
581 // Find median at fine level
582 2654720 dst.at(static_cast<ptrdiff_t>(h),
583 1991040 static_cast<ptrdiff_t>(patch_offset - kMargin))[c] =
584 663680 find_median(cdf, median_index, coarse_index);
585 663680 }
586 33456 }
587 };
588
589 template <bool is_single_channel>
590 140 void median_process(Rectangle image_dimensions, Rows<const uint8_t> src_rows,
591 Rows<uint8_t> dst_rows, size_t y_begin, size_t y_end,
592 size_t kernel_width, size_t kernel_height,
593 FixedBorderType border_type) {
594 140 size_t kMargin = (kernel_width - 1) / 2;
595 280 MedianBlurLargeHist<is_single_channel> median_filter{src_rows.channels(),
596 140 kMargin};
597
598 // Process left border
599 140 size_t starting_width = 0;
600 140 size_t processing_left_width = kMargin;
601 140 Point starting_left_coordinates{starting_width, y_begin};
602 140 Point ending_left_coordinates{starting_width + processing_left_width, y_end};
603 140 median_filter.process_pixels_with_horizontal_borders(
604 140 image_dimensions, starting_left_coordinates, ending_left_coordinates,
605 140 src_rows, dst_rows, kernel_height, border_type);
606
607 // Process center region
608 140 starting_width = processing_left_width;
609 140 size_t processing_center_width = image_dimensions.width() - 2 * kMargin;
610 140 Point starting_center_coordinates{starting_width, y_begin};
611 280 Point ending_center_coordinates{starting_width + processing_center_width,
612 140 y_end};
613 140 median_filter.process_pixels_without_horizontal_borders(
614 140 image_dimensions, starting_center_coordinates, ending_center_coordinates,
615 140 src_rows, dst_rows, kernel_height, border_type);
616
617 // Process right border
618 140 starting_width = processing_left_width + processing_center_width;
619 420 size_t processing_right_width = image_dimensions.width() -
620 280 processing_left_width -
621 140 processing_center_width;
622 140 Point starting_right_coordinates{starting_width, y_begin};
623 280 Point ending_right_coordinates{starting_width + processing_right_width,
624 140 y_end};
625 140 median_filter.process_pixels_with_horizontal_borders(
626 140 image_dimensions, starting_right_coordinates, ending_right_coordinates,
627 140 src_rows, dst_rows, kernel_height, border_type);
628 140 }
629
630 140 KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t median_blur_large_hist_stripe_u8(
631 const uint8_t* src, size_t src_stride, uint8_t* dst, size_t dst_stride,
632 size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
633 size_t kernel_width, size_t kernel_height, FixedBorderType border_type) {
634 140 Rectangle image_dimensions{width, height};
635 140 Rows<const uint8_t> src_rows{src, src_stride, channels};
636 140 Rows<uint8_t> dst_rows{dst, dst_stride, channels};
637
638
2/2
✓ Branch 0 taken 76 times.
✓ Branch 1 taken 64 times.
140 if (channels == 1) {
639 152 median_process<true>(image_dimensions, src_rows, dst_rows, y_begin, y_end,
640 76 kernel_width, kernel_height, border_type);
641 76 } else {
642 128 median_process<false>(image_dimensions, src_rows, dst_rows, y_begin, y_end,
643 64 kernel_width, kernel_height, border_type);
644 }
645
646 140 return KLEIDICV_OK;
647 140 }
648
649 } // namespace kleidicv::neon
650