KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/median_blur_small_hist_neon.cpp
Date: 2026-01-20 20:58:59
Exec Total Coverage
Lines: 225 225 100.0%
Functions: 14 14 100.0%
Branches: 40 40 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include "kleidicv/ctypes.h"
6 #include "kleidicv/filters/median_blur.h"
7 #include "kleidicv/kleidicv.h"
8 #include "kleidicv/neon.h"
9 #include "median_blur_border_handling.h"
10
11 namespace kleidicv::neon {
12
13 // B. Weiss, "Fast Median and Bilateral Filtering," in *ACM SIGGRAPH 2006
14 // Papers*, ACM, New York, NY, USA, pp. 519–526, 2006.
15 // The paper is currently available at:
16 // http://mesh.brown.edu/engn1610/refs/Weiss-siggraph2006.pdf
17 class MedianBlurSmallHist {
18 public:
19 136 MedianBlurSmallHist() : fine{}, coarse{} {}
20
21 272 void process_pixels_with_horizontal_borders(
22 Rectangle image_dimensions, Point starting_coordinates,
23 Point ending_coordinates, Rows<const uint8_t> src_rows,
24 Rows<uint8_t> dst_rows, size_t ksize, FixedBorderType border_type) {
25 272 const size_t kMargin = ksize / 2;
26
27
2/2
✓ Branch 0 taken 272 times.
✓ Branch 1 taken 1920 times.
2192 for (size_t w = starting_coordinates.x(); w < ending_coordinates.x(); w++) {
28
2/2
✓ Branch 0 taken 3648 times.
✓ Branch 1 taken 1920 times.
5568 for (ptrdiff_t ch = 0; ch < static_cast<ptrdiff_t>(src_rows.channels());
29 3648 ch++) {
30 3648 scalar_clear_histogram();
31
32 // We initialize with ksize rows to allow merging of
33 // histogram increment and decrement operations in the main loop.
34 // This extra initial load enables a single update phase and avoids
35 // splitting the logic into separate steps.
36
2/2
✓ Branch 0 taken 41472 times.
✓ Branch 1 taken 3648 times.
45120 for (size_t r = 0; r < ksize; r++) {
37
2/2
✓ Branch 0 taken 502848 times.
✓ Branch 1 taken 41472 times.
544320 for (size_t c = 0; c < ksize; c++) {
38 1005696 const ptrdiff_t valid_h =
39 1005696 get_physical_index(starting_coordinates.y() + r - kMargin,
40 502848 image_dimensions.height(), border_type);
41 1005696 const ptrdiff_t valid_w = get_physical_index(
42 502848 w + c - kMargin, image_dimensions.width(), border_type);
43
44 502848 uint8_t pixel = src_rows.at(valid_h, valid_w)[ch];
45
46 502848 scalar_initialize_histogram(pixel);
47 502848 }
48 41472 }
49
50 3648 const uint8_t median_value = scalar_find_median(ksize);
51
52 14592 dst_rows.at(static_cast<ptrdiff_t>(starting_coordinates.y()),
53 14592 static_cast<ptrdiff_t>(w))[ch] = median_value;
54
55
2/2
✓ Branch 0 taken 62656 times.
✓ Branch 1 taken 3648 times.
66304 for (size_t h = starting_coordinates.y() + 1;
56 66304 h < ending_coordinates.y(); h++) {
57 125312 const ptrdiff_t valid_new_h = get_physical_index(
58 62656 h + kMargin, image_dimensions.height(), border_type);
59
60 125312 const ptrdiff_t valid_old_h = get_physical_index(
61 62656 h - kMargin - 1, image_dimensions.height(), border_type);
62
63
2/2
✓ Branch 0 taken 728064 times.
✓ Branch 1 taken 62656 times.
790720 for (size_t c = 0; c < ksize; c++) {
64 1456128 const ptrdiff_t valid_w = get_physical_index(
65 728064 w + c - kMargin, image_dimensions.width(), border_type);
66
67 728064 uint8_t incoming_pixel = src_rows.at(valid_new_h, valid_w)[ch];
68
69 728064 uint8_t outgoing_pixel = src_rows.at(valid_old_h, valid_w)[ch];
70
71 728064 scalar_update_histogram(incoming_pixel, outgoing_pixel);
72 728064 }
73
74 62656 const uint8_t median_value = scalar_find_median(ksize);
75
76 250624 dst_rows.at(static_cast<ptrdiff_t>(h),
77 250624 static_cast<ptrdiff_t>(w))[ch] = median_value;
78 62656 }
79 3648 }
80 1920 }
81 272 }
82
83 136 void process_pixels_without_horizontal_borders(
84 Rectangle image_dimensions, Point starting_coordinates,
85 Point ending_coordinates, Rows<const uint8_t> src_rows,
86 Rows<uint8_t> dst_rows, size_t ksize, FixedBorderType border_type) {
87 136 const size_t step = sizeof(uint8x16_t);
88 136 const size_t KMargin_w = (ksize / 2) * src_rows.channels();
89 136 const size_t KMargin_h = (ksize / 2);
90
91
2/2
✓ Branch 0 taken 136 times.
✓ Branch 1 taken 368 times.
504 for (size_t w = starting_coordinates.x(); w < ending_coordinates.x();
92 368 w += step) {
93 368 vector_clear_histogram();
94
95 // We initialize with ksize rows to allow merging of
96 // histogram increment and decrement operations in the main loop.
97 // This extra initial load enables a single update phase and avoids
98 // splitting the logic into separate steps.
99
2/2
✓ Branch 0 taken 4272 times.
✓ Branch 1 taken 368 times.
4640 for (size_t r = 0; r < ksize; r++) {
100 8544 const ptrdiff_t vertical_index =
101 8544 get_physical_index(starting_coordinates.y() + r - KMargin_h,
102 4272 image_dimensions.height(), border_type);
103
104
2/2
✓ Branch 0 taken 52848 times.
✓ Branch 1 taken 4272 times.
57120 for (size_t c = 0; c < ksize; c++) {
105 105696 const size_t horizontal_index =
106 52848 w + c * src_rows.channels() - KMargin_w;
107
108 105696 uint8x16_t pixel = vld1q_u8(
109 52848 &src_rows[vertical_index * src_rows.stride() + horizontal_index]);
110
111 52848 vector_initialize_histogram(pixel);
112 52848 }
113 4272 }
114
115 368 const uint8x16_t median_value = vector_find_median(ksize);
116
117 736 vst1q_u8(&dst_rows[starting_coordinates.y() * dst_rows.stride() + w],
118 368 median_value);
119
120
2/2
✓ Branch 0 taken 6576 times.
✓ Branch 1 taken 368 times.
6944 for (size_t h = starting_coordinates.y() + 1; h < ending_coordinates.y();
121 6576 ++h) {
122 13152 const ptrdiff_t vertical_index_new = get_physical_index(
123 6576 h + KMargin_h, image_dimensions.height(), border_type);
124
125 13152 const ptrdiff_t vertical_index_old = get_physical_index(
126 6576 h - KMargin_h - 1, image_dimensions.height(), border_type);
127
128
2/2
✓ Branch 0 taken 77424 times.
✓ Branch 1 taken 6576 times.
84000 for (size_t c = 0; c < ksize; c++) {
129 77424 size_t horizontal_index = w + c * src_rows.channels() - KMargin_w;
130
131 154848 uint8x16_t incoming_pixels =
132 154848 vld1q_u8(&src_rows[vertical_index_new * src_rows.stride() +
133 77424 horizontal_index]);
134
135 154848 uint8x16_t outgoing_pixels =
136 154848 vld1q_u8(&src_rows[vertical_index_old * src_rows.stride() +
137 77424 horizontal_index]);
138
139 77424 vector_update_histogram(incoming_pixels, outgoing_pixels);
140 77424 }
141
142 6576 const uint8x16_t median_value = vector_find_median(ksize);
143
144 6576 vst1q_u8(&dst_rows[h * dst_rows.stride() + w], median_value);
145 6576 }
146 368 }
147 136 }
148
149 private:
150 // The 'fine' and 'coarse' histograms are shared between both scalar and
151 // vector operations. Their buffer sizes are allocated based on the
152 // vectorized case to ensure compatibility and avoid reallocation.
153 // In case of vectorized execution, 'fine' and 'coarse' are actually
154 // implemented as 16 interleaved histograms, one per vector lane.
155 uint8_t fine[4096];
156 uint8_t coarse[256];
157
158 // In scalar_clear_histogram, we only clear the relevant portions of the
159 // 'fine' and 'coarse' buffers that are actually used during computation. This
160 // avoids unnecessary memory operations.
161 3648 void scalar_clear_histogram() {
162 3648 memset(fine, 0, sizeof(fine[0]) * 256);
163 3648 memset(coarse, 0, sizeof(coarse[0]) * 16);
164 3648 }
165
166 368 void vector_clear_histogram() {
167 368 memset(fine, 0, sizeof(uint8_t) * 4096);
168 368 memset(coarse, 0, sizeof(uint8_t) * 256);
169 368 }
170
171 // Before the main vertical loop over 'height', the histogram must be
172 // initialized for each new 'width'. This is done using either
173 // scalar_initialize_histogram or vector_initialize_histogram depending on the
174 // processing mode. These functions preload the histogram using rows from the
175 // source image to enable efficient sliding window updates during vertical
176 // traversal.
177 502848 void scalar_initialize_histogram(uint8_t incoming_pixel) {
178 502848 fine[incoming_pixel]++;
179 502848 coarse[incoming_pixel >> 4]++;
180 502848 }
181
182 52848 void vector_initialize_histogram(uint8x16_t& incoming_pixels) {
183 KLEIDICV_FORCE_LOOP_UNROLL
184
2/2
✓ Branch 0 taken 845568 times.
✓ Branch 1 taken 52848 times.
898416 for (int i = 0; i < 16; i++) {
185 845568 fine[incoming_pixels[i] * 16 + i]++;
186 845568 }
187
188 52848 incoming_pixels = vshrq_n_u8(incoming_pixels, 4);
189
190 52848 uint8x16_t* vec_coarse = reinterpret_cast<uint8x16_t*>(coarse);
191 52848 vec_coarse[0] = vsubq(vec_coarse[0], vceqzq_u8(incoming_pixels));
192 KLEIDICV_FORCE_LOOP_UNROLL
193
2/2
✓ Branch 0 taken 52848 times.
✓ Branch 1 taken 792720 times.
845568 for (int i = 1; i < 16; i++) {
194 792720 uint8x16_t index = vdupq_n_u8(i);
195 792720 vec_coarse[i] = vsubq(vec_coarse[i], vceqq(incoming_pixels, index));
196 792720 }
197 52848 }
198
199 // During vertical traversal (the main 'height' loop), each sliding window
200 // iteration introduces a new incoming row and removes an outgoing one. The
201 // histogram must be updated accordingly by subtracting the contributions of
202 // the outgoing row and adding those of the incoming row.
203 // In many cases, incoming and outgoing pixels may be equal, so we perform a
204 // conditional check to avoid unnecessary updates.
205 // Both increment and decrement operations are handled inside the same
206 // function (scalar_update_histogram / vector_update_histogram) for
207 // efficiency.
208 728064 void scalar_update_histogram(uint8_t& incoming_pixel,
209 uint8_t& outgoing_pixel) {
210
2/2
✓ Branch 0 taken 3196 times.
✓ Branch 1 taken 724868 times.
728064 if (incoming_pixel != outgoing_pixel) {
211 724868 fine[incoming_pixel]++;
212 724868 coarse[incoming_pixel >> 4]++;
213 724868 fine[outgoing_pixel]--;
214 724868 coarse[outgoing_pixel >> 4]--;
215 724868 }
216 728064 }
217
218 77424 void vector_update_histogram(uint8x16_t& incoming_pixels,
219 uint8x16_t& outgoing_pixels) {
220 KLEIDICV_FORCE_LOOP_UNROLL
221
2/2
✓ Branch 0 taken 1238784 times.
✓ Branch 1 taken 77424 times.
1316208 for (int i = 0; i < 16; i++) {
222 1238784 fine[incoming_pixels[i] * 16 + i]++;
223 1238784 fine[outgoing_pixels[i] * 16 + i]--;
224 1238784 }
225
226 77424 uint8x16_t* vec_coarse = reinterpret_cast<uint8x16_t*>(coarse);
227 77424 incoming_pixels = vshrq_n<4>(incoming_pixels);
228 77424 outgoing_pixels = vshrq_n<4>(outgoing_pixels);
229
230 154848 uint8x16_t delta =
231 77424 vsubq(vceqzq_u8(outgoing_pixels), vceqzq_u8(incoming_pixels));
232 77424 vec_coarse[0] = vaddq(vec_coarse[0], delta);
233
234 KLEIDICV_FORCE_LOOP_UNROLL
235
2/2
✓ Branch 0 taken 77424 times.
✓ Branch 1 taken 1161360 times.
1238784 for (int i = 1; i < 16; i++) {
236 1161360 uint8x16_t index = vdupq_n_u8(i);
237 1161360 delta =
238 1161360 vsubq(vceqq(outgoing_pixels, index), vceqq(incoming_pixels, index));
239 1161360 vec_coarse[i] = vaddq(vec_coarse[i], delta);
240 1161360 }
241 77424 }
242
243 // To find the median efficiently, we first scan the coarse histogram to
244 // identify the segment (coarse bin) where the median value lies. This helps
245 // narrow down the search range in the fine histogram. Once the correct coarse
246 // bin is located, we scan the corresponding segment in the fine histogram
247 // until the cumulative distribution function (CDF) reaches the target CDF
248 66304 uint8_t scalar_find_median(size_t ksize) {
249 // The target median index in a sorted window
250 66304 const uint8_t target_cdf = (ksize * ksize) / 2;
251
252 // Variables for histogram scanning
253 66304 uint8_t cumulative_sum = 0;
254 66304 int fine_index = 0;
255 66304 int coarse_index = 0;
256
257 // Phase 1: Coarse histogram scan to find the correct bin range
258 567605 while (true) {
259
2/2
✓ Branch 0 taken 501301 times.
✓ Branch 1 taken 66304 times.
567605 if ((cumulative_sum + coarse[coarse_index]) > target_cdf) {
260 66304 fine_index = coarse_index * 16;
261 66304 break;
262 }
263 501301 cumulative_sum += coarse[coarse_index];
264 501301 coarse_index++;
265 }
266
267 // Phase 2: Fine histogram scan to locate the exact median value
268 546077 while (true) {
269 546077 cumulative_sum += fine[fine_index];
270
2/2
✓ Branch 0 taken 66304 times.
✓ Branch 1 taken 479773 times.
546077 if (cumulative_sum > target_cdf) {
271 66304 break;
272 }
273 479773 fine_index++;
274 }
275
276 132608 return fine_index;
277 66304 }
278
279 6944 uint8x16_t vector_find_median(size_t ksize) {
280 // Calculate the target median index based on kernel size
281 6944 const uint8x16_t target_cdf = vdupq_n_u8((ksize * ksize) / 2);
282
283 // Cumulative sum vector used for tracking the running histogram total
284 6944 uint8x16_t cumulative_sum = vdupq_n_u8(0);
285
286 // Coarse histogram pointer (used to narrow the search)
287 6944 uint8x16_t* coarse_histogram = reinterpret_cast<uint8x16_t*>(coarse);
288
289 // Coarse pass: Locate the coarse histogram bin range likely containing the
290 // median value This step identifies the starting fine histogram index for
291 // each lane, based on cumulative counts. It does not find the actual median
292 // yet.
293 6944 int coarse_index = 0;
294 6944 int fine_index = 0;
295
296 52108 while (true) {
297 104216 uint8x16_t cumulative_sum_next =
298 52108 vaddq(cumulative_sum, coarse_histogram[coarse_index]);
299 104216 uint8x16_t coarse_threshold_exceeded =
300 52108 vcgtq_u8(cumulative_sum_next, target_cdf);
301
302
2/2
✓ Branch 0 taken 6944 times.
✓ Branch 1 taken 45164 times.
52108 if (any_lane_set(coarse_threshold_exceeded)) {
303 6944 fine_index = coarse_index * 16;
304 6944 break;
305 }
306
307 45164 cumulative_sum = cumulative_sum_next;
308 45164 coarse_index++;
309 52108 }
310
311 // Fine pass: Scan the fine histogram to find the exact median per lane
312 6944 uint8x16_t median_result = vdupq_n_u8(0);
313 6944 uint8x16_t lane_found_mask = vdupq_n_u8(0);
314 6944 uint8x16_t* fine_histogram = reinterpret_cast<uint8x16_t*>(fine);
315
316 296349 while (true) {
317 296349 cumulative_sum = vaddq(cumulative_sum, fine_histogram[fine_index]);
318
319 296349 uint8x16_t still_searching_mask = vceqzq_u8(lane_found_mask);
320 296349 median_result =
321 296349 vbslq_u8(still_searching_mask, vdupq_n_u8(fine_index), median_result);
322 296349 lane_found_mask =
323 296349 vorrq_u8(lane_found_mask, vcgtq_u8(cumulative_sum, target_cdf));
324
325
2/2
✓ Branch 0 taken 6944 times.
✓ Branch 1 taken 289405 times.
296349 if (all_lane_set(lane_found_mask)) {
326 6944 break;
327 }
328 289405 fine_index++;
329 296349 }
330
331 13888 return median_result;
332 6944 }
333
334 296349 bool all_lane_set(uint8x16_t& v_u8) {
335 296349 uint32x4_t v_u32 = vreinterpretq_u32_u8(v_u8);
336 592698 return vminvq_u32(v_u32) == 0xffffffff;
337 296349 }
338
339 52108 bool any_lane_set(uint8x16_t v_u8) {
340 52108 uint32x4_t v_u32 = vreinterpretq_u32_u8(v_u8);
341 104216 return vmaxvq_u32(v_u32) != 0;
342 52108 }
343 };
344
345 136 KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t median_blur_small_hist_stripe_u8(
346 const uint8_t* src, size_t src_stride, uint8_t* dst, size_t dst_stride,
347 size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
348 size_t kernel_width, size_t kernel_height, FixedBorderType border_type) {
349 136 Rectangle image_dimensions{width, height};
350 136 Rows<const uint8_t> src_rows{src, src_stride, channels};
351 136 Rows<uint8_t> dst_rows{dst, dst_stride, channels};
352 136 MedianBlurSmallHist median_filter;
353 136 const size_t kMargin = kernel_width / 2;
354
355 // Process left border
356 136 size_t starting_width = 0;
357 136 const size_t processing_left_width = kMargin;
358 136 Point starting_left_coordinates{starting_width, y_begin};
359 136 Point ending_left_coordinates{starting_width + processing_left_width, y_end};
360
361 136 median_filter.process_pixels_with_horizontal_borders(
362 136 image_dimensions, starting_left_coordinates, ending_left_coordinates,
363 136 src_rows, dst_rows, kernel_height, border_type);
364
365 // Process center region
366 136 starting_width = processing_left_width;
367 // Compute the width of the center region that can be processed with NEON
368 // instructions. Subtract 2 * kMargin to exclude left and right borders, which
369 // are handled separately using scalar code due to varying border modes (e.g.,
370 // REPLICATE, REFLECT, WRAP, REVERSE). Align the remaining width down to the
371 // nearest multiple of 16 to match NEON's 128-bit register width (16 bytes for
372 // uint8x16_t).
373 136 const size_t processing_center_width = ((width - 2 * kMargin) / 16) * 16;
374 136 Point starting_center_coordinates{starting_width * channels, y_begin};
375 272 Point ending_center_coordinates{
376 136 (processing_center_width + starting_width) * channels, y_end};
377
378 136 median_filter.process_pixels_without_horizontal_borders(
379 136 image_dimensions, starting_center_coordinates, ending_center_coordinates,
380 136 src_rows, dst_rows, kernel_height, border_type);
381
382 // Process right border
383 136 starting_width = processing_left_width + processing_center_width;
384 272 const size_t processing_right_width =
385 136 width - processing_left_width - processing_center_width;
386 136 Point starting_right_coordinates{starting_width, y_begin};
387 272 Point ending_right_coordinates{starting_width + processing_right_width,
388 136 y_end};
389
390 136 median_filter.process_pixels_with_horizontal_borders(
391 136 image_dimensions, starting_right_coordinates, ending_right_coordinates,
392 136 src_rows, dst_rows, kernel_height, border_type);
393
394 136 return KLEIDICV_OK;
395 136 }
396 } // namespace kleidicv::neon
397