KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/median_blur_small_hist_neon.cpp
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 221 221 100.0%
Functions: 14 14 100.0%
Branches: 40 40 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include "kleidicv/ctypes.h"
6 #include "kleidicv/filters/median_blur.h"
7 #include "kleidicv/kleidicv.h"
8 #include "kleidicv/neon.h"
9 #include "median_blur_border_handling.h"
10
11 namespace kleidicv::neon {
12
13 // B. Weiss, "Fast Median and Bilateral Filtering," in *ACM SIGGRAPH 2006
14 // Papers*, ACM, New York, NY, USA, pp. 519–526, 2006.
15 // The paper is currently available at:
16 // http://mesh.brown.edu/engn1610/refs/Weiss-siggraph2006.pdf
17 class MedianBlurSmallHist {
18 public:
19 75 MedianBlurSmallHist() : fine{}, coarse{} {}
20
21 150 void process_pixels_with_horizontal_borders(
22 Rectangle image_dimensions, Point starting_coordinates,
23 Point ending_coordinates, Rows<const uint8_t> src_rows,
24 Rows<uint8_t> dst_rows, size_t ksize, FixedBorderType border_type) {
25 150 const size_t kMargin = ksize / 2;
26
27
2/2
✓ Branch 0 taken 150 times.
✓ Branch 1 taken 1152 times.
1302 for (size_t w = starting_coordinates.x(); w < ending_coordinates.x(); w++) {
28
2/2
✓ Branch 0 taken 2448 times.
✓ Branch 1 taken 1152 times.
3600 for (ptrdiff_t ch = 0; ch < static_cast<ptrdiff_t>(src_rows.channels());
29 2448 ch++) {
30 2448 scalar_clear_histogram();
31
32 // We initialize with ksize rows to allow merging of
33 // histogram increment and decrement operations in the main loop.
34 // This extra initial load enables a single update phase and avoids
35 // splitting the logic into separate steps.
36
2/2
✓ Branch 0 taken 28512 times.
✓ Branch 1 taken 2448 times.
30960 for (size_t r = 0; r < ksize; r++) {
37
2/2
✓ Branch 0 taken 353808 times.
✓ Branch 1 taken 28512 times.
382320 for (size_t c = 0; c < ksize; c++) {
38 707616 const ptrdiff_t valid_h =
39 707616 get_physical_index(starting_coordinates.y() + r - kMargin,
40 353808 image_dimensions.height(), border_type);
41 707616 const ptrdiff_t valid_w = get_physical_index(
42 353808 w + c - kMargin, image_dimensions.width(), border_type);
43
44 353808 uint8_t pixel = src_rows.at(valid_h, valid_w)[ch];
45
46 353808 scalar_initialize_histogram(pixel);
47 353808 }
48 28512 }
49
50 2448 const uint8_t median_value = scalar_find_median(ksize);
51
52 9792 dst_rows.at(static_cast<ptrdiff_t>(starting_coordinates.y()),
53 9792 static_cast<ptrdiff_t>(w))[ch] = median_value;
54
55
2/2
✓ Branch 0 taken 47280 times.
✓ Branch 1 taken 2448 times.
49728 for (size_t h = starting_coordinates.y() + 1;
56 49728 h < ending_coordinates.y(); h++) {
57 94560 const ptrdiff_t valid_new_h = get_physical_index(
58 47280 h + kMargin, image_dimensions.height(), border_type);
59
60 94560 const ptrdiff_t valid_old_h = get_physical_index(
61 47280 h - kMargin - 1, image_dimensions.height(), border_type);
62
63
2/2
✓ Branch 0 taken 548640 times.
✓ Branch 1 taken 47280 times.
595920 for (size_t c = 0; c < ksize; c++) {
64 1097280 const ptrdiff_t valid_w = get_physical_index(
65 548640 w + c - kMargin, image_dimensions.width(), border_type);
66
67 548640 uint8_t incoming_pixel = src_rows.at(valid_new_h, valid_w)[ch];
68
69 548640 uint8_t outgoing_pixel = src_rows.at(valid_old_h, valid_w)[ch];
70
71 548640 scalar_update_histogram(incoming_pixel, outgoing_pixel);
72 548640 }
73
74 47280 const uint8_t median_value = scalar_find_median(ksize);
75
76 189120 dst_rows.at(static_cast<ptrdiff_t>(h),
77 189120 static_cast<ptrdiff_t>(w))[ch] = median_value;
78 47280 }
79 2448 }
80 1152 }
81 150 }
82
83 75 void process_pixels_without_horizontal_borders(
84 Rectangle image_dimensions, Point starting_coordinates,
85 Point ending_coordinates, Rows<const uint8_t> src_rows,
86 Rows<uint8_t> dst_rows, size_t ksize, FixedBorderType border_type) {
87 75 const size_t step = sizeof(uint8x16_t);
88 75 const size_t KMargin_w = (ksize / 2) * src_rows.channels();
89 75 const size_t KMargin_h = (ksize / 2);
90
91
2/2
✓ Branch 0 taken 75 times.
✓ Branch 1 taken 258 times.
333 for (size_t w = starting_coordinates.x(); w < ending_coordinates.x();
92 258 w += step) {
93 258 vector_clear_histogram();
94
95 // We initialize with ksize rows to allow merging of
96 // histogram increment and decrement operations in the main loop.
97 // This extra initial load enables a single update phase and avoids
98 // splitting the logic into separate steps.
99
2/2
✓ Branch 0 taken 3042 times.
✓ Branch 1 taken 258 times.
3300 for (size_t r = 0; r < ksize; r++) {
100 6084 const ptrdiff_t vertical_index =
101 6084 get_physical_index(starting_coordinates.y() + r - KMargin_h,
102 3042 image_dimensions.height(), border_type);
103
104
2/2
✓ Branch 0 taken 38178 times.
✓ Branch 1 taken 3042 times.
41220 for (size_t c = 0; c < ksize; c++) {
105 76356 const size_t horizontal_index =
106 38178 w + c * src_rows.channels() - KMargin_w;
107
108 38178 uint8x16_t pixel = vld1q_u8(
109 &src_rows[vertical_index * src_rows.stride() + horizontal_index]);
110
111 38178 vector_initialize_histogram(pixel);
112 38178 }
113 3042 }
114
115 258 const uint8x16_t median_value = vector_find_median(ksize);
116
117 258 vst1q_u8(&dst_rows[starting_coordinates.y() * dst_rows.stride() + w],
118 median_value);
119
120
2/2
✓ Branch 0 taken 4950 times.
✓ Branch 1 taken 258 times.
5208 for (size_t h = starting_coordinates.y() + 1; h < ending_coordinates.y();
121 4950 ++h) {
122 9900 const ptrdiff_t vertical_index_new = get_physical_index(
123 4950 h + KMargin_h, image_dimensions.height(), border_type);
124
125 9900 const ptrdiff_t vertical_index_old = get_physical_index(
126 4950 h - KMargin_h - 1, image_dimensions.height(), border_type);
127
128
2/2
✓ Branch 0 taken 58230 times.
✓ Branch 1 taken 4950 times.
63180 for (size_t c = 0; c < ksize; c++) {
129 58230 size_t horizontal_index = w + c * src_rows.channels() - KMargin_w;
130
131 116460 uint8x16_t incoming_pixels =
132 58230 vld1q_u8(&src_rows[vertical_index_new * src_rows.stride() +
133 horizontal_index]);
134
135 116460 uint8x16_t outgoing_pixels =
136 58230 vld1q_u8(&src_rows[vertical_index_old * src_rows.stride() +
137 horizontal_index]);
138
139 58230 vector_update_histogram(incoming_pixels, outgoing_pixels);
140 58230 }
141
142 4950 const uint8x16_t median_value = vector_find_median(ksize);
143
144 4950 vst1q_u8(&dst_rows[h * dst_rows.stride() + w], median_value);
145 4950 }
146 258 }
147 75 }
148
149 private:
150 // The 'fine' and 'coarse' histograms are shared between both scalar and
151 // vector operations. Their buffer sizes are allocated based on the
152 // vectorized case to ensure compatibility and avoid reallocation.
153 // In case of vectorized execution, 'fine' and 'coarse' are actually
154 // implemented as 16 interleaved histograms, one per vector lane.
155 uint8_t fine[4096];
156 uint8_t coarse[256];
157
158 // In scalar_clear_histogram, we only clear the relevant portions of the
159 // 'fine' and 'coarse' buffers that are actually used during computation. This
160 // avoids unnecessary memory operations.
161 2448 void scalar_clear_histogram() {
162 2448 memset(fine, 0, sizeof(fine[0]) * 256);
163 2448 memset(coarse, 0, sizeof(coarse[0]) * 16);
164 2448 }
165
166 258 void vector_clear_histogram() {
167 258 memset(fine, 0, sizeof(uint8_t) * 4096);
168 258 memset(coarse, 0, sizeof(uint8_t) * 256);
169 258 }
170
171 // Before the main vertical loop over 'height', the histogram must be
172 // initialized for each new 'width'. This is done using either
173 // scalar_initialize_histogram or vector_initialize_histogram depending on the
174 // processing mode. These functions preload the histogram using rows from the
175 // source image to enable efficient sliding window updates during vertical
176 // traversal.
177 353808 void scalar_initialize_histogram(uint8_t incoming_pixel) {
178 353808 fine[incoming_pixel]++;
179 353808 coarse[incoming_pixel >> 4]++;
180 353808 }
181
182 38178 void vector_initialize_histogram(uint8x16_t& incoming_pixels) {
183 KLEIDICV_FORCE_LOOP_UNROLL
184
2/2
✓ Branch 0 taken 610848 times.
✓ Branch 1 taken 38178 times.
649026 for (int i = 0; i < 16; i++) {
185 610848 fine[incoming_pixels[i] * 16 + i]++;
186 610848 }
187
188 38178 incoming_pixels = vshrq_n_u8(incoming_pixels, 4);
189
190 38178 uint8x16_t* vec_coarse = reinterpret_cast<uint8x16_t*>(coarse);
191 38178 vec_coarse[0] = vsubq(vec_coarse[0], vceqzq_u8(incoming_pixels));
192 KLEIDICV_FORCE_LOOP_UNROLL
193
2/2
✓ Branch 0 taken 38178 times.
✓ Branch 1 taken 572670 times.
610848 for (int i = 1; i < 16; i++) {
194 572670 uint8x16_t index = vdupq_n_u8(i);
195 572670 vec_coarse[i] = vsubq(vec_coarse[i], vceqq(incoming_pixels, index));
196 572670 }
197 38178 }
198
199 // During vertical traversal (the main 'height' loop), each sliding window
200 // iteration introduces a new incoming row and removes an outgoing one. The
201 // histogram must be updated accordingly by subtracting the contributions of
202 // the outgoing row and adding those of the incoming row.
203 // In many cases, incoming and outgoing pixels may be equal, so we perform a
204 // conditional check to avoid unnecessary updates.
205 // Both increment and decrement operations are handled inside the same
206 // function (scalar_update_histogram / vector_update_histogram) for
207 // efficiency.
208 548640 void scalar_update_histogram(uint8_t& incoming_pixel,
209 uint8_t& outgoing_pixel) {
210
2/2
✓ Branch 0 taken 2170 times.
✓ Branch 1 taken 546470 times.
548640 if (incoming_pixel != outgoing_pixel) {
211 546470 fine[incoming_pixel]++;
212 546470 coarse[incoming_pixel >> 4]++;
213 546470 fine[outgoing_pixel]--;
214 546470 coarse[outgoing_pixel >> 4]--;
215 546470 }
216 548640 }
217
218 58230 void vector_update_histogram(uint8x16_t& incoming_pixels,
219 uint8x16_t& outgoing_pixels) {
220 KLEIDICV_FORCE_LOOP_UNROLL
221
2/2
✓ Branch 0 taken 931680 times.
✓ Branch 1 taken 58230 times.
989910 for (int i = 0; i < 16; i++) {
222 931680 fine[incoming_pixels[i] * 16 + i]++;
223 931680 fine[outgoing_pixels[i] * 16 + i]--;
224 931680 }
225
226 58230 uint8x16_t* vec_coarse = reinterpret_cast<uint8x16_t*>(coarse);
227 58230 incoming_pixels = vshrq_n<4>(incoming_pixels);
228 58230 outgoing_pixels = vshrq_n<4>(outgoing_pixels);
229
230 116460 uint8x16_t delta =
231 58230 vsubq(vceqzq_u8(outgoing_pixels), vceqzq_u8(incoming_pixels));
232 58230 vec_coarse[0] = vaddq(vec_coarse[0], delta);
233
234 KLEIDICV_FORCE_LOOP_UNROLL
235
2/2
✓ Branch 0 taken 58230 times.
✓ Branch 1 taken 873450 times.
931680 for (int i = 1; i < 16; i++) {
236 873450 uint8x16_t index = vdupq_n_u8(i);
237 873450 delta =
238 873450 vsubq(vceqq(outgoing_pixels, index), vceqq(incoming_pixels, index));
239 873450 vec_coarse[i] = vaddq(vec_coarse[i], delta);
240 873450 }
241 58230 }
242
243 // To find the median efficiently, we first scan the coarse histogram to
244 // identify the segment (coarse bin) where the median value lies. This helps
245 // narrow down the search range in the fine histogram. Once the correct coarse
246 // bin is located, we scan the corresponding segment in the fine histogram
247 // until the cumulative distribution function (CDF) reaches the target CDF
248 49728 uint8_t scalar_find_median(size_t ksize) {
249 // The target median index in a sorted window
250 49728 const uint8_t target_cdf = (ksize * ksize) / 2;
251
252 // Variables for histogram scanning
253 49728 uint8_t cumulative_sum = 0;
254 49728 int fine_index = 0;
255 49728 int coarse_index = 0;
256
257 // Phase 1: Coarse histogram scan to find the correct bin range
258 419703 while (true) {
259
2/2
✓ Branch 0 taken 369975 times.
✓ Branch 1 taken 49728 times.
419703 if ((cumulative_sum + coarse[coarse_index]) > target_cdf) {
260 49728 fine_index = coarse_index * 16;
261 49728 break;
262 }
263 369975 cumulative_sum += coarse[coarse_index];
264 369975 coarse_index++;
265 }
266
267 // Phase 2: Fine histogram scan to locate the exact median value
268 433065 while (true) {
269 433065 cumulative_sum += fine[fine_index];
270
2/2
✓ Branch 0 taken 49728 times.
✓ Branch 1 taken 383337 times.
433065 if (cumulative_sum > target_cdf) {
271 49728 break;
272 }
273 383337 fine_index++;
274 }
275
276 99456 return fine_index;
277 49728 }
278
279 5208 uint8x16_t vector_find_median(size_t ksize) {
280 // Calculate the target median index based on kernel size
281 5208 const uint8x16_t target_cdf = vdupq_n_u8((ksize * ksize) / 2);
282
283 // Cumulative sum vector used for tracking the running histogram total
284 5208 uint8x16_t cumulative_sum = vdupq_n_u8(0);
285
286 // Coarse histogram pointer (used to narrow the search)
287 5208 uint8x16_t* coarse_histogram = reinterpret_cast<uint8x16_t*>(coarse);
288
289 // Coarse pass: Locate the coarse histogram bin range likely containing the
290 // median value This step identifies the starting fine histogram index for
291 // each lane, based on cumulative counts. It does not find the actual median
292 // yet.
293 5208 int coarse_index = 0;
294 5208 int fine_index = 0;
295
296 39290 while (true) {
297 78580 uint8x16_t cumulative_sum_next =
298 39290 vaddq(cumulative_sum, coarse_histogram[coarse_index]);
299 78580 uint8x16_t coarse_threshold_exceeded =
300 39290 vcgtq_u8(cumulative_sum_next, target_cdf);
301
302
2/2
✓ Branch 0 taken 5208 times.
✓ Branch 1 taken 34082 times.
39290 if (any_lane_set(coarse_threshold_exceeded)) {
303 5208 fine_index = coarse_index * 16;
304 5208 break;
305 }
306
307 34082 cumulative_sum = cumulative_sum_next;
308 34082 coarse_index++;
309 39290 }
310
311 // Fine pass: Scan the fine histogram to find the exact median per lane
312 5208 uint8x16_t median_result = vdupq_n_u8(0);
313 5208 uint8x16_t lane_found_mask = vdupq_n_u8(0);
314 5208 uint8x16_t* fine_histogram = reinterpret_cast<uint8x16_t*>(fine);
315
316 214049 while (true) {
317 214049 cumulative_sum = vaddq(cumulative_sum, fine_histogram[fine_index]);
318
319 214049 uint8x16_t still_searching_mask = vceqzq_u8(lane_found_mask);
320 214049 median_result =
321 214049 vbslq_u8(still_searching_mask, vdupq_n_u8(fine_index), median_result);
322 214049 lane_found_mask =
323 214049 vorrq_u8(lane_found_mask, vcgtq_u8(cumulative_sum, target_cdf));
324
325
2/2
✓ Branch 0 taken 5208 times.
✓ Branch 1 taken 208841 times.
214049 if (all_lane_set(lane_found_mask)) {
326 5208 break;
327 }
328 208841 fine_index++;
329 214049 }
330
331 10416 return median_result;
332 5208 }
333
334 214049 bool all_lane_set(uint8x16_t& v_u8) {
335 214049 uint32x4_t v_u32 = vreinterpretq_u32_u8(v_u8);
336 428098 return vminvq_u32(v_u32) == 0xffffffff;
337 214049 }
338
339 39290 bool any_lane_set(uint8x16_t v_u8) {
340 39290 uint32x4_t v_u32 = vreinterpretq_u32_u8(v_u8);
341 78580 return vmaxvq_u32(v_u32) != 0;
342 39290 }
343 };
344
345 75 KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t median_blur_small_hist_stripe_u8(
346 const uint8_t* src, size_t src_stride, uint8_t* dst, size_t dst_stride,
347 size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
348 size_t kernel_width, size_t kernel_height, FixedBorderType border_type) {
349 75 Rectangle image_dimensions{width, height};
350 75 Rows<const uint8_t> src_rows{src, src_stride, channels};
351 75 Rows<uint8_t> dst_rows{dst, dst_stride, channels};
352 75 MedianBlurSmallHist median_filter;
353 75 const size_t kMargin = kernel_width / 2;
354
355 // Process left border
356 75 size_t starting_width = 0;
357 75 const size_t processing_left_width = kMargin;
358 75 Point starting_left_coordinates{starting_width, y_begin};
359 75 Point ending_left_coordinates{starting_width + processing_left_width, y_end};
360
361 75 median_filter.process_pixels_with_horizontal_borders(
362 75 image_dimensions, starting_left_coordinates, ending_left_coordinates,
363 75 src_rows, dst_rows, kernel_height, border_type);
364
365 // Process center region
366 75 starting_width = processing_left_width;
367 // Compute the width of the center region that can be processed with NEON
368 // instructions. Subtract 2 * kMargin to exclude left and right borders, which
369 // are handled separately using scalar code due to varying border modes (e.g.,
370 // REPLICATE, REFLECT, WRAP, REVERSE). Align the remaining width down to the
371 // nearest multiple of 16 to match NEON's 128-bit register width (16 bytes for
372 // uint8x16_t).
373 75 const size_t processing_center_width = ((width - 2 * kMargin) / 16) * 16;
374 75 Point starting_center_coordinates{starting_width * channels, y_begin};
375 150 Point ending_center_coordinates{
376 75 (processing_center_width + starting_width) * channels, y_end};
377
378 75 median_filter.process_pixels_without_horizontal_borders(
379 75 image_dimensions, starting_center_coordinates, ending_center_coordinates,
380 75 src_rows, dst_rows, kernel_height, border_type);
381
382 // Process right border
383 75 starting_width = processing_left_width + processing_center_width;
384 150 const size_t processing_right_width =
385 75 width - processing_left_width - processing_center_width;
386 75 Point starting_right_coordinates{starting_width, y_begin};
387 150 Point ending_right_coordinates{starting_width + processing_right_width,
388 75 y_end};
389
390 75 median_filter.process_pixels_with_horizontal_borders(
391 75 image_dimensions, starting_right_coordinates, ending_right_coordinates,
392 75 src_rows, dst_rows, kernel_height, border_type);
393
394 75 return KLEIDICV_OK;
395 75 }
396 } // namespace kleidicv::neon
397