KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/median_blur_small_hist_neon.cpp
Date: 2025-11-25 17:23:32
Exec Total Coverage
Lines: 221 221 100.0%
Functions: 14 14 100.0%
Branches: 40 40 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include "kleidicv/ctypes.h"
6 #include "kleidicv/filters/median_blur.h"
7 #include "kleidicv/kleidicv.h"
8 #include "kleidicv/neon.h"
9 #include "median_blur_border_handling.h"
10
11 namespace kleidicv::neon {
12
13 // B. Weiss, "Fast Median and Bilateral Filtering," in *ACM SIGGRAPH 2006
14 // Papers*, ACM, New York, NY, USA, pp. 519–526, 2006.
15 // The paper is currently available at:
16 // http://mesh.brown.edu/engn1610/refs/Weiss-siggraph2006.pdf
17 class MedianBlurSmallHist {
18 public:
19 100 MedianBlurSmallHist() : fine{}, coarse{} {}
20
21 200 void process_pixels_with_horizontal_borders(
22 Rectangle image_dimensions, Point starting_coordinates,
23 Point ending_coordinates, Rows<const uint8_t> src_rows,
24 Rows<uint8_t> dst_rows, size_t ksize, FixedBorderType border_type) {
25 200 const size_t kMargin = ksize / 2;
26
27
2/2
✓ Branch 0 taken 200 times.
✓ Branch 1 taken 1536 times.
1736 for (size_t w = starting_coordinates.x(); w < ending_coordinates.x(); w++) {
28
2/2
✓ Branch 0 taken 3264 times.
✓ Branch 1 taken 1536 times.
4800 for (ptrdiff_t ch = 0; ch < static_cast<ptrdiff_t>(src_rows.channels());
29 3264 ch++) {
30 3264 scalar_clear_histogram();
31
32 // We initialize with ksize rows to allow merging of
33 // histogram increment and decrement operations in the main loop.
34 // This extra initial load enables a single update phase and avoids
35 // splitting the logic into separate steps.
36
2/2
✓ Branch 0 taken 38016 times.
✓ Branch 1 taken 3264 times.
41280 for (size_t r = 0; r < ksize; r++) {
37
2/2
✓ Branch 0 taken 471744 times.
✓ Branch 1 taken 38016 times.
509760 for (size_t c = 0; c < ksize; c++) {
38 943488 const ptrdiff_t valid_h =
39 943488 get_physical_index(starting_coordinates.y() + r - kMargin,
40 471744 image_dimensions.height(), border_type);
41 943488 const ptrdiff_t valid_w = get_physical_index(
42 471744 w + c - kMargin, image_dimensions.width(), border_type);
43
44 471744 uint8_t pixel = src_rows.at(valid_h, valid_w)[ch];
45
46 471744 scalar_initialize_histogram(pixel);
47 471744 }
48 38016 }
49
50 3264 const uint8_t median_value = scalar_find_median(ksize);
51
52 13056 dst_rows.at(static_cast<ptrdiff_t>(starting_coordinates.y()),
53 13056 static_cast<ptrdiff_t>(w))[ch] = median_value;
54
55
2/2
✓ Branch 0 taken 63040 times.
✓ Branch 1 taken 3264 times.
66304 for (size_t h = starting_coordinates.y() + 1;
56 66304 h < ending_coordinates.y(); h++) {
57 126080 const ptrdiff_t valid_new_h = get_physical_index(
58 63040 h + kMargin, image_dimensions.height(), border_type);
59
60 126080 const ptrdiff_t valid_old_h = get_physical_index(
61 63040 h - kMargin - 1, image_dimensions.height(), border_type);
62
63
2/2
✓ Branch 0 taken 731520 times.
✓ Branch 1 taken 63040 times.
794560 for (size_t c = 0; c < ksize; c++) {
64 1463040 const ptrdiff_t valid_w = get_physical_index(
65 731520 w + c - kMargin, image_dimensions.width(), border_type);
66
67 731520 uint8_t incoming_pixel = src_rows.at(valid_new_h, valid_w)[ch];
68
69 731520 uint8_t outgoing_pixel = src_rows.at(valid_old_h, valid_w)[ch];
70
71 731520 scalar_update_histogram(incoming_pixel, outgoing_pixel);
72 731520 }
73
74 63040 const uint8_t median_value = scalar_find_median(ksize);
75
76 252160 dst_rows.at(static_cast<ptrdiff_t>(h),
77 252160 static_cast<ptrdiff_t>(w))[ch] = median_value;
78 63040 }
79 3264 }
80 1536 }
81 200 }
82
83 100 void process_pixels_without_horizontal_borders(
84 Rectangle image_dimensions, Point starting_coordinates,
85 Point ending_coordinates, Rows<const uint8_t> src_rows,
86 Rows<uint8_t> dst_rows, size_t ksize, FixedBorderType border_type) {
87 100 const size_t step = sizeof(uint8x16_t);
88 100 const size_t KMargin_w = (ksize / 2) * src_rows.channels();
89 100 const size_t KMargin_h = (ksize / 2);
90
91
2/2
✓ Branch 0 taken 100 times.
✓ Branch 1 taken 344 times.
444 for (size_t w = starting_coordinates.x(); w < ending_coordinates.x();
92 344 w += step) {
93 344 vector_clear_histogram();
94
95 // We initialize with ksize rows to allow merging of
96 // histogram increment and decrement operations in the main loop.
97 // This extra initial load enables a single update phase and avoids
98 // splitting the logic into separate steps.
99
2/2
✓ Branch 0 taken 4056 times.
✓ Branch 1 taken 344 times.
4400 for (size_t r = 0; r < ksize; r++) {
100 8112 const ptrdiff_t vertical_index =
101 8112 get_physical_index(starting_coordinates.y() + r - KMargin_h,
102 4056 image_dimensions.height(), border_type);
103
104
2/2
✓ Branch 0 taken 50904 times.
✓ Branch 1 taken 4056 times.
54960 for (size_t c = 0; c < ksize; c++) {
105 101808 const size_t horizontal_index =
106 50904 w + c * src_rows.channels() - KMargin_w;
107
108 50904 uint8x16_t pixel = vld1q_u8(
109 &src_rows[vertical_index * src_rows.stride() + horizontal_index]);
110
111 50904 vector_initialize_histogram(pixel);
112 50904 }
113 4056 }
114
115 344 const uint8x16_t median_value = vector_find_median(ksize);
116
117 344 vst1q_u8(&dst_rows[starting_coordinates.y() * dst_rows.stride() + w],
118 median_value);
119
120
2/2
✓ Branch 0 taken 6600 times.
✓ Branch 1 taken 344 times.
6944 for (size_t h = starting_coordinates.y() + 1; h < ending_coordinates.y();
121 6600 ++h) {
122 13200 const ptrdiff_t vertical_index_new = get_physical_index(
123 6600 h + KMargin_h, image_dimensions.height(), border_type);
124
125 13200 const ptrdiff_t vertical_index_old = get_physical_index(
126 6600 h - KMargin_h - 1, image_dimensions.height(), border_type);
127
128
2/2
✓ Branch 0 taken 77640 times.
✓ Branch 1 taken 6600 times.
84240 for (size_t c = 0; c < ksize; c++) {
129 77640 size_t horizontal_index = w + c * src_rows.channels() - KMargin_w;
130
131 155280 uint8x16_t incoming_pixels =
132 77640 vld1q_u8(&src_rows[vertical_index_new * src_rows.stride() +
133 horizontal_index]);
134
135 155280 uint8x16_t outgoing_pixels =
136 77640 vld1q_u8(&src_rows[vertical_index_old * src_rows.stride() +
137 horizontal_index]);
138
139 77640 vector_update_histogram(incoming_pixels, outgoing_pixels);
140 77640 }
141
142 6600 const uint8x16_t median_value = vector_find_median(ksize);
143
144 6600 vst1q_u8(&dst_rows[h * dst_rows.stride() + w], median_value);
145 6600 }
146 344 }
147 100 }
148
149 private:
150 // The 'fine' and 'coarse' histograms are shared between both scalar and
151 // vector operations. Their buffer sizes are allocated based on the
152 // vectorized case to ensure compatibility and avoid reallocation.
153 // In case of vectorized execution, 'fine' and 'coarse' are actually
154 // implemented as 16 interleaved histograms, one per vector lane.
155 uint8_t fine[4096];
156 uint8_t coarse[256];
157
158 // In scalar_clear_histogram, we only clear the relevant portions of the
159 // 'fine' and 'coarse' buffers that are actually used during computation. This
160 // avoids unnecessary memory operations.
161 3264 void scalar_clear_histogram() {
162 3264 memset(fine, 0, sizeof(fine[0]) * 256);
163 3264 memset(coarse, 0, sizeof(coarse[0]) * 16);
164 3264 }
165
166 344 void vector_clear_histogram() {
167 344 memset(fine, 0, sizeof(uint8_t) * 4096);
168 344 memset(coarse, 0, sizeof(uint8_t) * 256);
169 344 }
170
171 // Before the main vertical loop over 'height', the histogram must be
172 // initialized for each new 'width'. This is done using either
173 // scalar_initialize_histogram or vector_initialize_histogram depending on the
174 // processing mode. These functions preload the histogram using rows from the
175 // source image to enable efficient sliding window updates during vertical
176 // traversal.
177 471744 void scalar_initialize_histogram(uint8_t incoming_pixel) {
178 471744 fine[incoming_pixel]++;
179 471744 coarse[incoming_pixel >> 4]++;
180 471744 }
181
182 50904 void vector_initialize_histogram(uint8x16_t& incoming_pixels) {
183 KLEIDICV_FORCE_LOOP_UNROLL
184
2/2
✓ Branch 0 taken 814464 times.
✓ Branch 1 taken 50904 times.
865368 for (int i = 0; i < 16; i++) {
185 814464 fine[incoming_pixels[i] * 16 + i]++;
186 814464 }
187
188 50904 incoming_pixels = vshrq_n_u8(incoming_pixels, 4);
189
190 50904 uint8x16_t* vec_coarse = reinterpret_cast<uint8x16_t*>(coarse);
191 50904 vec_coarse[0] = vsubq(vec_coarse[0], vceqzq_u8(incoming_pixels));
192 KLEIDICV_FORCE_LOOP_UNROLL
193
2/2
✓ Branch 0 taken 50904 times.
✓ Branch 1 taken 763560 times.
814464 for (int i = 1; i < 16; i++) {
194 763560 uint8x16_t index = vdupq_n_u8(i);
195 763560 vec_coarse[i] = vsubq(vec_coarse[i], vceqq(incoming_pixels, index));
196 763560 }
197 50904 }
198
199 // During vertical traversal (the main 'height' loop), each sliding window
200 // iteration introduces a new incoming row and removes an outgoing one. The
201 // histogram must be updated accordingly by subtracting the contributions of
202 // the outgoing row and adding those of the incoming row.
203 // In many cases, incoming and outgoing pixels may be equal, so we perform a
204 // conditional check to avoid unnecessary updates.
205 // Both increment and decrement operations are handled inside the same
206 // function (scalar_update_histogram / vector_update_histogram) for
207 // efficiency.
208 731520 void scalar_update_histogram(uint8_t& incoming_pixel,
209 uint8_t& outgoing_pixel) {
210
2/2
✓ Branch 0 taken 2649 times.
✓ Branch 1 taken 728871 times.
731520 if (incoming_pixel != outgoing_pixel) {
211 728871 fine[incoming_pixel]++;
212 728871 coarse[incoming_pixel >> 4]++;
213 728871 fine[outgoing_pixel]--;
214 728871 coarse[outgoing_pixel >> 4]--;
215 728871 }
216 731520 }
217
218 77640 void vector_update_histogram(uint8x16_t& incoming_pixels,
219 uint8x16_t& outgoing_pixels) {
220 KLEIDICV_FORCE_LOOP_UNROLL
221
2/2
✓ Branch 0 taken 1242240 times.
✓ Branch 1 taken 77640 times.
1319880 for (int i = 0; i < 16; i++) {
222 1242240 fine[incoming_pixels[i] * 16 + i]++;
223 1242240 fine[outgoing_pixels[i] * 16 + i]--;
224 1242240 }
225
226 77640 uint8x16_t* vec_coarse = reinterpret_cast<uint8x16_t*>(coarse);
227 77640 incoming_pixels = vshrq_n<4>(incoming_pixels);
228 77640 outgoing_pixels = vshrq_n<4>(outgoing_pixels);
229
230 155280 uint8x16_t delta =
231 77640 vsubq(vceqzq_u8(outgoing_pixels), vceqzq_u8(incoming_pixels));
232 77640 vec_coarse[0] = vaddq(vec_coarse[0], delta);
233
234 KLEIDICV_FORCE_LOOP_UNROLL
235
2/2
✓ Branch 0 taken 77640 times.
✓ Branch 1 taken 1164600 times.
1242240 for (int i = 1; i < 16; i++) {
236 1164600 uint8x16_t index = vdupq_n_u8(i);
237 1164600 delta =
238 1164600 vsubq(vceqq(outgoing_pixels, index), vceqq(incoming_pixels, index));
239 1164600 vec_coarse[i] = vaddq(vec_coarse[i], delta);
240 1164600 }
241 77640 }
242
243 // To find the median efficiently, we first scan the coarse histogram to
244 // identify the segment (coarse bin) where the median value lies. This helps
245 // narrow down the search range in the fine histogram. Once the correct coarse
246 // bin is located, we scan the corresponding segment in the fine histogram
247 // until the cumulative distribution function (CDF) reaches the target CDF
248 66304 uint8_t scalar_find_median(size_t ksize) {
249 // The target median index in a sorted window
250 66304 const uint8_t target_cdf = (ksize * ksize) / 2;
251
252 // Variables for histogram scanning
253 66304 uint8_t cumulative_sum = 0;
254 66304 int fine_index = 0;
255 66304 int coarse_index = 0;
256
257 // Phase 1: Coarse histogram scan to find the correct bin range
258 564626 while (true) {
259
2/2
✓ Branch 0 taken 498322 times.
✓ Branch 1 taken 66304 times.
564626 if ((cumulative_sum + coarse[coarse_index]) > target_cdf) {
260 66304 fine_index = coarse_index * 16;
261 66304 break;
262 }
263 498322 cumulative_sum += coarse[coarse_index];
264 498322 coarse_index++;
265 }
266
267 // Phase 2: Fine histogram scan to locate the exact median value
268 575117 while (true) {
269 575117 cumulative_sum += fine[fine_index];
270
2/2
✓ Branch 0 taken 66304 times.
✓ Branch 1 taken 508813 times.
575117 if (cumulative_sum > target_cdf) {
271 66304 break;
272 }
273 508813 fine_index++;
274 }
275
276 132608 return fine_index;
277 66304 }
278
279 6944 uint8x16_t vector_find_median(size_t ksize) {
280 // Calculate the target median index based on kernel size
281 6944 const uint8x16_t target_cdf = vdupq_n_u8((ksize * ksize) / 2);
282
283 // Cumulative sum vector used for tracking the running histogram total
284 6944 uint8x16_t cumulative_sum = vdupq_n_u8(0);
285
286 // Coarse histogram pointer (used to narrow the search)
287 6944 uint8x16_t* coarse_histogram = reinterpret_cast<uint8x16_t*>(coarse);
288
289 // Coarse pass: Locate the coarse histogram bin range likely containing the
290 // median value This step identifies the starting fine histogram index for
291 // each lane, based on cumulative counts. It does not find the actual median
292 // yet.
293 6944 int coarse_index = 0;
294 6944 int fine_index = 0;
295
296 51757 while (true) {
297 103514 uint8x16_t cumulative_sum_next =
298 51757 vaddq(cumulative_sum, coarse_histogram[coarse_index]);
299 103514 uint8x16_t coarse_threshold_exceeded =
300 51757 vcgtq_u8(cumulative_sum_next, target_cdf);
301
302
2/2
✓ Branch 0 taken 6944 times.
✓ Branch 1 taken 44813 times.
51757 if (any_lane_set(coarse_threshold_exceeded)) {
303 6944 fine_index = coarse_index * 16;
304 6944 break;
305 }
306
307 44813 cumulative_sum = cumulative_sum_next;
308 44813 coarse_index++;
309 51757 }
310
311 // Fine pass: Scan the fine histogram to find the exact median per lane
312 6944 uint8x16_t median_result = vdupq_n_u8(0);
313 6944 uint8x16_t lane_found_mask = vdupq_n_u8(0);
314 6944 uint8x16_t* fine_histogram = reinterpret_cast<uint8x16_t*>(fine);
315
316 291999 while (true) {
317 291999 cumulative_sum = vaddq(cumulative_sum, fine_histogram[fine_index]);
318
319 291999 uint8x16_t still_searching_mask = vceqzq_u8(lane_found_mask);
320 291999 median_result =
321 291999 vbslq_u8(still_searching_mask, vdupq_n_u8(fine_index), median_result);
322 291999 lane_found_mask =
323 291999 vorrq_u8(lane_found_mask, vcgtq_u8(cumulative_sum, target_cdf));
324
325
2/2
✓ Branch 0 taken 6944 times.
✓ Branch 1 taken 285055 times.
291999 if (all_lane_set(lane_found_mask)) {
326 6944 break;
327 }
328 285055 fine_index++;
329 291999 }
330
331 13888 return median_result;
332 6944 }
333
334 291999 bool all_lane_set(uint8x16_t& v_u8) {
335 291999 uint32x4_t v_u32 = vreinterpretq_u32_u8(v_u8);
336 583998 return vminvq_u32(v_u32) == 0xffffffff;
337 291999 }
338
339 51757 bool any_lane_set(uint8x16_t v_u8) {
340 51757 uint32x4_t v_u32 = vreinterpretq_u32_u8(v_u8);
341 103514 return vmaxvq_u32(v_u32) != 0;
342 51757 }
343 };
344
345 100 KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t median_blur_small_hist_stripe_u8(
346 const uint8_t* src, size_t src_stride, uint8_t* dst, size_t dst_stride,
347 size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
348 size_t kernel_width, size_t kernel_height, FixedBorderType border_type) {
349 100 Rectangle image_dimensions{width, height};
350 100 Rows<const uint8_t> src_rows{src, src_stride, channels};
351 100 Rows<uint8_t> dst_rows{dst, dst_stride, channels};
352 100 MedianBlurSmallHist median_filter;
353 100 const size_t kMargin = kernel_width / 2;
354
355 // Process left border
356 100 size_t starting_width = 0;
357 100 const size_t processing_left_width = kMargin;
358 100 Point starting_left_coordinates{starting_width, y_begin};
359 100 Point ending_left_coordinates{starting_width + processing_left_width, y_end};
360
361 100 median_filter.process_pixels_with_horizontal_borders(
362 100 image_dimensions, starting_left_coordinates, ending_left_coordinates,
363 100 src_rows, dst_rows, kernel_height, border_type);
364
365 // Process center region
366 100 starting_width = processing_left_width;
367 // Compute the width of the center region that can be processed with NEON
368 // instructions. Subtract 2 * kMargin to exclude left and right borders, which
369 // are handled separately using scalar code due to varying border modes (e.g.,
370 // REPLICATE, REFLECT, WRAP, REVERSE). Align the remaining width down to the
371 // nearest multiple of 16 to match NEON's 128-bit register width (16 bytes for
372 // uint8x16_t).
373 100 const size_t processing_center_width = ((width - 2 * kMargin) / 16) * 16;
374 100 Point starting_center_coordinates{starting_width * channels, y_begin};
375 200 Point ending_center_coordinates{
376 100 (processing_center_width + starting_width) * channels, y_end};
377
378 100 median_filter.process_pixels_without_horizontal_borders(
379 100 image_dimensions, starting_center_coordinates, ending_center_coordinates,
380 100 src_rows, dst_rows, kernel_height, border_type);
381
382 // Process right border
383 100 starting_width = processing_left_width + processing_center_width;
384 200 const size_t processing_right_width =
385 100 width - processing_left_width - processing_center_width;
386 100 Point starting_right_coordinates{starting_width, y_begin};
387 200 Point ending_right_coordinates{starting_width + processing_right_width,
388 100 y_end};
389
390 100 median_filter.process_pixels_with_horizontal_borders(
391 100 image_dimensions, starting_right_coordinates, ending_right_coordinates,
392 100 src_rows, dst_rows, kernel_height, border_type);
393
394 100 return KLEIDICV_OK;
395 100 }
396 } // namespace kleidicv::neon
397