KleidiCV Coverage Report


Directory: ./
File: kleidicv_thread/src/kleidicv_thread.cpp
Date: 2026-03-05 15:57:40
Exec Total Coverage
Lines: 534 534 100.0%
Functions: 303 303 100.0%
Branches: 330 338 97.6%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2026 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include "kleidicv_thread/kleidicv_thread.h"
6
7 #include <algorithm>
8 #include <cstddef>
9 #include <cstdint>
10 #include <functional>
11 #include <limits>
12 #include <vector>
13
14 #include "kleidicv/arithmetics/rotate.h"
15 #include "kleidicv/arithmetics/scale.h"
16 #include "kleidicv/conversions/rgb_to_yuv.h"
17 #include "kleidicv/conversions/yuv_to_rgb.h"
18 #include "kleidicv/ctypes.h"
19 #include "kleidicv/filters/blur_and_downsample.h"
20 #include "kleidicv/filters/gaussian_blur.h"
21 #include "kleidicv/filters/median_blur.h"
22 #include "kleidicv/filters/scharr.h"
23 #include "kleidicv/filters/separable_filter_2d.h"
24 #include "kleidicv/filters/sobel.h"
25 #include "kleidicv/kleidicv.h"
26 #include "kleidicv/resize/resize_linear.h"
27 #include "kleidicv/transform/remap.h"
28 #include "kleidicv/transform/warp_perspective.h"
29
30 typedef std::function<kleidicv_error_t(unsigned, unsigned)> FunctionCallback;
31
32 32366 static kleidicv_error_t kleidicv_thread_std_function_callback(
33 unsigned task_begin, unsigned task_end, void *data) {
34 32366 auto *callback = reinterpret_cast<FunctionCallback *>(data);
35 64732 return (*callback)(task_begin, task_end);
36 32366 }
37
38 // Operations in the Neon backend have both a vector path and a scalar path.
39 // The vector path is used to process most data and the scalar path is used to
40 // process the parts of the data that don't fit into the vector width.
41 // For floating point operations in particular, the results may be very slightly
42 // different between vector and scalar paths.
43 //
44 // When using multithreading, images are divided into parts to be processed by
45 // each thread, and this could change which parts of the data end up being
46 // processed by the vector and scalar paths.
47 //
48 // If an implementation is sensitive to these very slight differences, set
49 // min_batch_size to the Neon vector length (16 bytes). That makes every batch
50 // handed to a thread a multiple of the vector width; only the final batch may
51 // be longer to reach the end of the data. No batch can be shorter than vector
52 // length because that could change behaviour for operations that try to avoid
53 // the tail loop (see the TryToAvoidTailLoop class).
54 // This technique only works if the data is longer than vector length.
55 //
56 // On the other hand, measurements showed that increasing the batch size can
57 // cause degradation of the multithreaded performance.
58 template <typename Callback>
59 15012 inline kleidicv_error_t parallel_batches(Callback callback,
60 kleidicv_thread_multithreading mt,
61 unsigned count,
62 unsigned min_batch_size = 1) {
63 15012 const unsigned task_count = std::max(1U, (count) / min_batch_size);
64 47378 FunctionCallback f = [=](unsigned task_begin, unsigned task_end) {
65 32366 unsigned begin = task_begin * min_batch_size,
66 32366 end = task_end * min_batch_size;
67
124/124
✓ Branch 0 taken 1160 times.
✓ Branch 1 taken 1000 times.
✓ Branch 2 taken 328 times.
✓ Branch 3 taken 120 times.
✓ Branch 4 taken 116 times.
✓ Branch 5 taken 100 times.
✓ Branch 6 taken 116 times.
✓ Branch 7 taken 100 times.
✓ Branch 8 taken 116 times.
✓ Branch 9 taken 100 times.
✓ Branch 10 taken 116 times.
✓ Branch 11 taken 100 times.
✓ Branch 12 taken 232 times.
✓ Branch 13 taken 200 times.
✓ Branch 14 taken 116 times.
✓ Branch 15 taken 100 times.
✓ Branch 16 taken 128 times.
✓ Branch 17 taken 112 times.
✓ Branch 18 taken 328 times.
✓ Branch 19 taken 120 times.
✓ Branch 20 taken 116 times.
✓ Branch 21 taken 100 times.
✓ Branch 22 taken 348 times.
✓ Branch 23 taken 300 times.
✓ Branch 24 taken 696 times.
✓ Branch 25 taken 600 times.
✓ Branch 26 taken 348 times.
✓ Branch 27 taken 300 times.
✓ Branch 28 taken 348 times.
✓ Branch 29 taken 300 times.
✓ Branch 30 taken 348 times.
✓ Branch 31 taken 300 times.
✓ Branch 32 taken 232 times.
✓ Branch 33 taken 200 times.
✓ Branch 34 taken 232 times.
✓ Branch 35 taken 200 times.
✓ Branch 36 taken 232 times.
✓ Branch 37 taken 200 times.
✓ Branch 38 taken 116 times.
✓ Branch 39 taken 100 times.
✓ Branch 40 taken 116 times.
✓ Branch 41 taken 100 times.
✓ Branch 42 taken 116 times.
✓ Branch 43 taken 100 times.
✓ Branch 44 taken 116 times.
✓ Branch 45 taken 100 times.
✓ Branch 46 taken 116 times.
✓ Branch 47 taken 100 times.
✓ Branch 48 taken 116 times.
✓ Branch 49 taken 100 times.
✓ Branch 50 taken 96 times.
✓ Branch 51 taken 160 times.
✓ Branch 52 taken 2325 times.
✓ Branch 53 taken 1895 times.
✓ Branch 54 taken 820 times.
✓ Branch 55 taken 865 times.
✓ Branch 56 taken 720 times.
✓ Branch 57 taken 800 times.
✓ Branch 58 taken 740 times.
✓ Branch 59 taken 820 times.
✓ Branch 60 taken 520 times.
✓ Branch 61 taken 720 times.
✓ Branch 62 taken 60 times.
✓ Branch 63 taken 60 times.
✓ Branch 64 taken 60 times.
✓ Branch 65 taken 60 times.
✓ Branch 66 taken 60 times.
✓ Branch 67 taken 60 times.
✓ Branch 68 taken 60 times.
✓ Branch 69 taken 60 times.
✓ Branch 70 taken 60 times.
✓ Branch 71 taken 60 times.
✓ Branch 72 taken 76 times.
✓ Branch 73 taken 68 times.
✓ Branch 74 taken 60 times.
✓ Branch 75 taken 60 times.
✓ Branch 76 taken 104 times.
✓ Branch 77 taken 40 times.
✓ Branch 78 taken 16 times.
✓ Branch 79 taken 4 times.
✓ Branch 80 taken 56 times.
✓ Branch 81 taken 28 times.
✓ Branch 82 taken 56 times.
✓ Branch 83 taken 28 times.
✓ Branch 84 taken 224 times.
✓ Branch 85 taken 112 times.
✓ Branch 86 taken 152 times.
✓ Branch 87 taken 112 times.
✓ Branch 88 taken 180 times.
✓ Branch 89 taken 100 times.
✓ Branch 90 taken 48 times.
✓ Branch 91 taken 12 times.
✓ Branch 92 taken 16 times.
✓ Branch 93 taken 4 times.
✓ Branch 94 taken 180 times.
✓ Branch 95 taken 100 times.
✓ Branch 96 taken 180 times.
✓ Branch 97 taken 100 times.
✓ Branch 98 taken 180 times.
✓ Branch 99 taken 100 times.
✓ Branch 100 taken 152 times.
✓ Branch 101 taken 112 times.
✓ Branch 102 taken 464 times.
✓ Branch 103 taken 400 times.
✓ Branch 104 taken 170 times.
✓ Branch 105 taken 160 times.
✓ Branch 106 taken 360 times.
✓ Branch 107 taken 320 times.
✓ Branch 108 taken 255 times.
✓ Branch 109 taken 240 times.
✓ Branch 110 taken 232 times.
✓ Branch 111 taken 200 times.
✓ Branch 112 taken 232 times.
✓ Branch 113 taken 200 times.
✓ Branch 114 taken 464 times.
✓ Branch 115 taken 400 times.
✓ Branch 116 taken 464 times.
✓ Branch 117 taken 400 times.
✓ Branch 118 taken 464 times.
✓ Branch 119 taken 400 times.
✓ Branch 120 taken 464 times.
✓ Branch 121 taken 400 times.
✓ Branch 122 taken 232 times.
✓ Branch 123 taken 200 times.
32366 if (task_end == task_count) {
68 15012 end = count;
69 15012 }
70 64732 return callback(begin, end);
71 32366 };
72 30024 return mt.parallel(kleidicv_thread_std_function_callback, &f,
73 15012 mt.parallel_data, task_count);
74 15012 }
75
76 template <typename SrcT, typename DstT, typename F, typename... Args>
77 4047 inline kleidicv_error_t kleidicv_thread_unary_op_impl(
78 F f, kleidicv_thread_multithreading mt, const SrcT *src, size_t src_stride,
79 DstT *dst, size_t dst_stride, size_t width, size_t height, Args... args) {
80 13291 auto callback = [=](unsigned begin, unsigned end) {
81 18488 return f(src + static_cast<ptrdiff_t>(begin * src_stride / sizeof(SrcT)),
82 9244 src_stride,
83 9244 dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(DstT)),
84 9244 dst_stride, width, end - begin, args...);
85 };
86 8094 return parallel_batches(callback, mt, height);
87 4047 }
88
89 template <typename SrcT, typename DstT, typename F, typename... Args>
90 3000 inline kleidicv_error_t kleidicv_thread_binary_op_impl(
91 F f, kleidicv_thread_multithreading mt, const SrcT *src_a,
92 size_t src_a_stride, const SrcT *src_b, size_t src_b_stride, DstT *dst,
93 size_t dst_stride, size_t width, size_t height, Args... args) {
94 9480 auto callback = [=](unsigned begin, unsigned end) {
95 12960 return f(
96 6480 src_a + static_cast<ptrdiff_t>(begin * src_a_stride / sizeof(SrcT)),
97 6480 src_a_stride,
98 6480 src_b + static_cast<ptrdiff_t>(begin * src_b_stride / sizeof(SrcT)),
99 6480 src_b_stride,
100 6480 dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(DstT)),
101 6480 dst_stride, width, end - begin, args...);
102 };
103 6000 return parallel_batches(callback, mt, height);
104 3000 }
105
106 #define KLEIDICV_THREAD_UNARY_OP_IMPL(suffix, src_type, dst_type) \
107 kleidicv_error_t kleidicv_thread_##suffix( \
108 const src_type *src, size_t src_stride, dst_type *dst, \
109 size_t dst_stride, size_t width, size_t height, \
110 kleidicv_thread_multithreading mt) { \
111 return kleidicv_thread_unary_op_impl(kleidicv_##suffix, mt, src, \
112 src_stride, dst, dst_stride, width, \
113 height); \
114 }
115
116 100 KLEIDICV_THREAD_UNARY_OP_IMPL(gray_to_rgb_u8, uint8_t, uint8_t);
117 100 KLEIDICV_THREAD_UNARY_OP_IMPL(gray_to_rgba_u8, uint8_t, uint8_t);
118 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_bgr_u8, uint8_t, uint8_t);
119 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_rgb_u8, uint8_t, uint8_t);
120 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_bgra_u8, uint8_t, uint8_t);
121 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_rgba_u8, uint8_t, uint8_t);
122 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_bgra_u8, uint8_t, uint8_t);
123 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_rgba_u8, uint8_t, uint8_t);
124 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_bgr_u8, uint8_t, uint8_t);
125 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_rgb_u8, uint8_t, uint8_t);
126 120 KLEIDICV_THREAD_UNARY_OP_IMPL(exp_f32, float, float);
127 100 KLEIDICV_THREAD_UNARY_OP_IMPL(f32_to_s8, float, int8_t);
128 100 KLEIDICV_THREAD_UNARY_OP_IMPL(f32_to_u8, float, uint8_t);
129 100 KLEIDICV_THREAD_UNARY_OP_IMPL(s8_to_f32, int8_t, float);
130 100 KLEIDICV_THREAD_UNARY_OP_IMPL(u8_to_f32, uint8_t, float);
131
132 #define KLEIDICV_THREAD_INRANGE_OP_IMPL(suffix, src_type, dst_type) \
133 kleidicv_error_t kleidicv_thread_##suffix( \
134 const src_type *src, size_t src_stride, dst_type *dst, \
135 size_t dst_stride, size_t width, size_t height, src_type lower_bound, \
136 src_type upper_bound, kleidicv_thread_multithreading mt) { \
137 return kleidicv_thread_unary_op_impl(kleidicv_##suffix, mt, src, \
138 src_stride, dst, dst_stride, width, \
139 height, lower_bound, upper_bound); \
140 }
141
142 100 KLEIDICV_THREAD_INRANGE_OP_IMPL(in_range_u8, uint8_t, uint8_t);
143 100 KLEIDICV_THREAD_INRANGE_OP_IMPL(in_range_f32, float, uint8_t);
144
145 100 kleidicv_error_t kleidicv_thread_threshold_binary_u8(
146 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
147 size_t width, size_t height, uint8_t threshold, uint8_t value,
148 kleidicv_thread_multithreading mt) {
149 200 return kleidicv_thread_unary_op_impl(kleidicv_threshold_binary_u8, mt, src,
150 100 src_stride, dst, dst_stride, width,
151 100 height, threshold, value);
152 }
153
154 116 kleidicv_error_t kleidicv_thread_scale_u8(const uint8_t *src, size_t src_stride,
155 uint8_t *dst, size_t dst_stride,
156 size_t width, size_t height,
157 double scale, double shift,
158 kleidicv_thread_multithreading mt) {
159
2/4
✗ Branch 0 not taken.
✓ Branch 1 taken 116 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 116 times.
116 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
160
2/4
✗ Branch 0 not taken.
✓ Branch 1 taken 116 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 116 times.
116 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
161
5/6
✗ Branch 0 not taken.
✓ Branch 1 taken 116 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 112 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 112 times.
116 CHECK_IMAGE_SIZE(width, height);
162
163 112 const std::array<uint8_t, 256> precalculated_table =
164 112 kleidicv::neon::precalculate_scale_table_u8(scale, shift);
165 112 return kleidicv_thread_unary_op_impl(
166 112 kleidicv::neon::scale_with_precalculated_table_u8, mt, src, src_stride,
167 112 dst, dst_stride, width, height, scale, shift, precalculated_table);
168 116 }
169
170 120 kleidicv_error_t kleidicv_thread_scale_f32(const float *src, size_t src_stride,
171 float *dst, size_t dst_stride,
172 size_t width, size_t height,
173 double scale, double shift,
174 kleidicv_thread_multithreading mt) {
175 240 return kleidicv_thread_unary_op_impl(kleidicv_scale_f32, mt, src, src_stride,
176 120 dst, dst_stride, width, height, scale,
177 120 shift);
178 }
179
180 100 kleidicv_error_t kleidicv_thread_scale_u8_f16(
181 const uint8_t *src, size_t src_stride, float16_t *dst, size_t dst_stride,
182 size_t width, size_t height, double scale, double shift,
183 kleidicv_thread_multithreading mt) {
184 200 return kleidicv_thread_unary_op_impl(kleidicv_scale_u8_f16, mt, src,
185 100 src_stride, dst, dst_stride, width,
186 100 height, scale, shift);
187 }
188
189 #define KLEIDICV_THREAD_BINARY_OP_IMPL(suffix, type) \
190 kleidicv_error_t kleidicv_thread_##suffix( \
191 const type *src_a, size_t src_a_stride, const type *src_b, \
192 size_t src_b_stride, type *dst, size_t dst_stride, size_t width, \
193 size_t height, kleidicv_thread_multithreading mt) { \
194 return kleidicv_thread_binary_op_impl(kleidicv_##suffix, mt, src_a, \
195 src_a_stride, src_b, src_b_stride, \
196 dst, dst_stride, width, height); \
197 }
198
199 #define KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(suffix, type, scaletype) \
200 kleidicv_error_t kleidicv_thread_##suffix( \
201 const type *src_a, size_t src_a_stride, const type *src_b, \
202 size_t src_b_stride, type *dst, size_t dst_stride, size_t width, \
203 size_t height, scaletype scale, kleidicv_thread_multithreading mt) { \
204 return kleidicv_thread_binary_op_impl( \
205 kleidicv_##suffix, mt, src_a, src_a_stride, src_b, src_b_stride, dst, \
206 dst_stride, width, height, scale); \
207 }
208
209 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s8, int8_t);
210 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u8, uint8_t);
211 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s16, int16_t);
212 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u16, uint16_t);
213 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s32, int32_t);
214 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u32, uint32_t);
215 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s64, int64_t);
216 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u64, uint64_t);
217 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s8, int8_t);
218 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u8, uint8_t);
219 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s16, int16_t);
220 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u16, uint16_t);
221 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s32, int32_t);
222 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u32, uint32_t);
223 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s64, int64_t);
224 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u64, uint64_t);
225 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_u8, uint8_t);
226 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s8, int8_t);
227 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_u16, uint16_t);
228 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s16, int16_t);
229 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s32, int32_t);
230 100 KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_u8, uint8_t, double);
231 100 KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s8, int8_t, double);
232 100 KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_u16, uint16_t, double);
233 100 KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s16, int16_t, double);
234 100 KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s32, int32_t, double);
235 100 KLEIDICV_THREAD_BINARY_OP_IMPL(bitwise_and, uint8_t);
236 100 KLEIDICV_THREAD_BINARY_OP_IMPL(compare_equal_u8, uint8_t);
237 100 KLEIDICV_THREAD_BINARY_OP_IMPL(compare_greater_u8, uint8_t);
238
239 100 kleidicv_error_t kleidicv_thread_saturating_add_abs_with_threshold_s16(
240 const int16_t *src_a, size_t src_a_stride, const int16_t *src_b,
241 size_t src_b_stride, int16_t *dst, size_t dst_stride, size_t width,
242 size_t height, int16_t threshold, kleidicv_thread_multithreading mt) {
243 100 return kleidicv_thread_binary_op_impl(
244 100 kleidicv_saturating_add_abs_with_threshold_s16, mt, src_a, src_a_stride,
245 100 src_b, src_b_stride, dst, dst_stride, width, height, threshold);
246 }
247
248 172 kleidicv_error_t kleidicv_thread_rotate(const void *src, size_t src_stride,
249 size_t width, size_t height, void *dst,
250 size_t dst_stride, int angle,
251 size_t element_size,
252 kleidicv_thread_multithreading mt) {
253
2/2
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 160 times.
172 if (!kleidicv::rotate_is_implemented(src, dst, angle, element_size)) {
254 12 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
255 }
256 // reading in columns and writing out rows tends to perform better
257 416 auto callback = [=](unsigned begin, unsigned end) {
258 512 return kleidicv_rotate(
259 256 static_cast<const uint8_t *>(src) + begin * element_size, src_stride,
260 256 end - begin, height, static_cast<uint8_t *>(dst) + begin * dst_stride,
261 256 dst_stride, angle, element_size);
262 };
263 160 return parallel_batches(callback, mt, width, 64);
264 172 }
265
266 2045 kleidicv_error_t kleidicv_thread_yuv_to_rgb_u8(
267 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
268 size_t width, size_t height, kleidicv_color_conversion_t color_format,
269 kleidicv_thread_multithreading mt) {
270 // Extract the base format
271 4090 const size_t base_format = static_cast<size_t>(
272 2045 color_format & KLEIDICV_COLOR_CONVERSION_YUV_FMT_MASK);
273
2/2
✓ Branch 0 taken 400 times.
✓ Branch 1 taken 1645 times.
2045 if (base_format == KLEIDICV_COLOR_CONVERSION_FMT_YUV444) {
274 800 return kleidicv_thread_unary_op_impl(kleidicv_yuv444_to_rgb_u8, mt, src,
275 400 src_stride, dst, dst_stride, width,
276 400 height, color_format);
277 }
278
279
2/2
✓ Branch 0 taken 780 times.
✓ Branch 1 taken 865 times.
1645 if (base_format == KLEIDICV_COLOR_CONVERSION_FMT_YUV422) {
280 1560 return kleidicv_thread_unary_op_impl(kleidicv_yuv422_to_rgb_u8, mt, src,
281 780 src_stride, dst, dst_stride, width,
282 780 height, color_format);
283 }
284
285 2550 auto callback = [=](unsigned begin, unsigned end) {
286 3370 return kleidicv_yuv420p_to_rgb_stripe_u8(
287 1685 src, src_stride, dst, dst_stride, width, height, color_format,
288 1685 static_cast<size_t>(begin), static_cast<size_t>(end));
289 };
290 865 return parallel_batches(callback, mt, (height + 1) / 2);
291 2045 }
292
293 800 kleidicv_error_t kleidicv_thread_rgb_to_yuv_semiplanar_u8(
294 const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride,
295 uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height,
296 kleidicv_color_conversion_t color_format,
297 kleidicv_thread_multithreading mt) {
298 2320 auto callback = [=](unsigned begin, unsigned end) {
299 3040 return kleidicv_rgb_to_yuv420sp_stripe_u8(
300 1520 src, src_stride, y_dst, y_stride, uv_dst, uv_stride, width, height,
301 1520 color_format, static_cast<size_t>(begin), static_cast<size_t>(end));
302 };
303 1600 return parallel_batches(callback, mt, (height + 1) / 2);
304 800 }
305
306 1535 kleidicv_error_t kleidicv_thread_rgb_to_yuv_u8(
307 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
308 size_t width, size_t height, kleidicv_color_conversion_t color_format,
309 kleidicv_thread_multithreading mt) {
310 // Extract the base format
311 3070 const size_t base_format = static_cast<size_t>(
312 1535 color_format & KLEIDICV_COLOR_CONVERSION_YUV_FMT_MASK);
313
2/2
✓ Branch 0 taken 400 times.
✓ Branch 1 taken 1135 times.
1535 if (base_format == KLEIDICV_COLOR_CONVERSION_FMT_YUV444) {
314 800 return kleidicv_thread_unary_op_impl(kleidicv_rgb_to_yuv444_u8, mt, src,
315 400 src_stride, dst, dst_stride, width,
316 400 height, color_format);
317 }
318
319
2/2
✓ Branch 0 taken 315 times.
✓ Branch 1 taken 820 times.
1135 if (base_format == KLEIDICV_COLOR_CONVERSION_FMT_YUV422) {
320 630 return kleidicv_thread_unary_op_impl(kleidicv_rgb_to_yuv422_u8, mt, src,
321 315 src_stride, dst, dst_stride, width,
322 315 height, color_format);
323 }
324
325 2380 auto callback = [=](unsigned begin, unsigned end) {
326 3120 return kleidicv_rgb_to_yuv420p_stripe_u8(
327 1560 src, src_stride, dst, dst_stride, width, height, color_format,
328 1560 static_cast<size_t>(begin), static_cast<size_t>(end));
329 };
330 820 return parallel_batches(callback, mt, (height + 1) / 2);
331 1535 }
332
333 720 kleidicv_error_t kleidicv_thread_yuv_semiplanar_to_rgb_u8(
334 const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv,
335 size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width,
336 size_t height, kleidicv_color_conversion_t color_format,
337 kleidicv_thread_multithreading mt) {
338 1960 auto callback = [=](unsigned begin, unsigned end) {
339 1240 size_t row_begin = size_t{begin} * 2;
340 1240 size_t row_end = std::min<size_t>(height, size_t{end} * 2);
341 1240 size_t row_uv = begin;
342 3720 return kleidicv_yuv_semiplanar_to_rgb_u8(
343 1240 src_y + row_begin * src_y_stride, src_y_stride,
344 1240 src_uv + row_uv * src_uv_stride, src_uv_stride,
345 1240 dst + row_begin * dst_stride, dst_stride, width, row_end - row_begin,
346 1240 color_format);
347 1240 };
348 1440 return parallel_batches(callback, mt, (height + 1) / 2);
349 720 }
350
351 template <typename ScalarType, typename FunctionType>
352 368 kleidicv_error_t parallel_min_max(FunctionType min_max_func,
353 const ScalarType *src, size_t src_stride,
354 size_t width, size_t height,
355 ScalarType *p_min_value,
356 ScalarType *p_max_value,
357 kleidicv_thread_multithreading mt) {
358 736 std::vector<ScalarType> min_values(height,
359 368 std::numeric_limits<ScalarType>::max());
360 736 std::vector<ScalarType> max_values(height,
361 368 std::numeric_limits<ScalarType>::lowest());
362
363 1112 auto callback = [&](unsigned begin, unsigned end) {
364 1488 return min_max_func(src + begin * (src_stride / sizeof(ScalarType)),
365 744 src_stride, width, end - begin,
366
12/12
✓ Branch 0 taken 104 times.
✓ Branch 1 taken 16 times.
✓ Branch 2 taken 104 times.
✓ Branch 3 taken 16 times.
✓ Branch 4 taken 104 times.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 104 times.
✓ Branch 7 taken 16 times.
✓ Branch 8 taken 104 times.
✓ Branch 9 taken 16 times.
✓ Branch 10 taken 128 times.
✓ Branch 11 taken 16 times.
744 p_min_value ? min_values.data() + begin : nullptr,
367
12/12
✓ Branch 0 taken 104 times.
✓ Branch 1 taken 16 times.
✓ Branch 2 taken 104 times.
✓ Branch 3 taken 16 times.
✓ Branch 4 taken 104 times.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 104 times.
✓ Branch 7 taken 16 times.
✓ Branch 8 taken 104 times.
✓ Branch 9 taken 16 times.
✓ Branch 10 taken 128 times.
✓ Branch 11 taken 16 times.
744 p_max_value ? max_values.data() + begin : nullptr);
368 };
369
370 368 auto return_val = parallel_batches(callback, mt, height);
371
372
12/12
✓ Branch 0 taken 52 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 52 times.
✓ Branch 3 taken 8 times.
✓ Branch 4 taken 52 times.
✓ Branch 5 taken 8 times.
✓ Branch 6 taken 52 times.
✓ Branch 7 taken 8 times.
✓ Branch 8 taken 52 times.
✓ Branch 9 taken 8 times.
✓ Branch 10 taken 60 times.
✓ Branch 11 taken 8 times.
368 if (p_min_value) {
373 320 *p_min_value = std::numeric_limits<ScalarType>::max();
374
12/12
✓ Branch 0 taken 288 times.
✓ Branch 1 taken 52 times.
✓ Branch 2 taken 288 times.
✓ Branch 3 taken 52 times.
✓ Branch 4 taken 288 times.
✓ Branch 5 taken 52 times.
✓ Branch 6 taken 288 times.
✓ Branch 7 taken 52 times.
✓ Branch 8 taken 288 times.
✓ Branch 9 taken 52 times.
✓ Branch 10 taken 368 times.
✓ Branch 11 taken 60 times.
2128 for (ScalarType m : min_values) {
375
12/12
✓ Branch 0 taken 218 times.
✓ Branch 1 taken 70 times.
✓ Branch 2 taken 218 times.
✓ Branch 3 taken 70 times.
✓ Branch 4 taken 216 times.
✓ Branch 5 taken 72 times.
✓ Branch 6 taken 216 times.
✓ Branch 7 taken 72 times.
✓ Branch 8 taken 216 times.
✓ Branch 9 taken 72 times.
✓ Branch 10 taken 289 times.
✓ Branch 11 taken 79 times.
1808 if (m < *p_min_value) {
376 435 *p_min_value = m;
377 435 }
378 1808 }
379 320 }
380
12/12
✓ Branch 0 taken 52 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 52 times.
✓ Branch 3 taken 8 times.
✓ Branch 4 taken 52 times.
✓ Branch 5 taken 8 times.
✓ Branch 6 taken 52 times.
✓ Branch 7 taken 8 times.
✓ Branch 8 taken 52 times.
✓ Branch 9 taken 8 times.
✓ Branch 10 taken 60 times.
✓ Branch 11 taken 8 times.
368 if (p_max_value) {
381 320 *p_max_value = std::numeric_limits<ScalarType>::lowest();
382
12/12
✓ Branch 0 taken 288 times.
✓ Branch 1 taken 52 times.
✓ Branch 2 taken 288 times.
✓ Branch 3 taken 52 times.
✓ Branch 4 taken 288 times.
✓ Branch 5 taken 52 times.
✓ Branch 6 taken 288 times.
✓ Branch 7 taken 52 times.
✓ Branch 8 taken 288 times.
✓ Branch 9 taken 52 times.
✓ Branch 10 taken 368 times.
✓ Branch 11 taken 60 times.
2128 for (ScalarType m : max_values) {
383
12/12
✓ Branch 0 taken 211 times.
✓ Branch 1 taken 77 times.
✓ Branch 2 taken 211 times.
✓ Branch 3 taken 77 times.
✓ Branch 4 taken 207 times.
✓ Branch 5 taken 81 times.
✓ Branch 6 taken 207 times.
✓ Branch 7 taken 81 times.
✓ Branch 8 taken 207 times.
✓ Branch 9 taken 81 times.
✓ Branch 10 taken 281 times.
✓ Branch 11 taken 87 times.
1808 if (m > *p_max_value) {
384 484 *p_max_value = m;
385 484 }
386 1808 }
387 320 }
388 368 return return_val;
389 368 }
390
391 #define DEFINE_KLEIDICV_THREAD_MIN_MAX(suffix, type) \
392 kleidicv_error_t kleidicv_thread_min_max_##suffix( \
393 const type *src, size_t src_stride, size_t width, size_t height, \
394 type *p_min_value, type *p_max_value, \
395 kleidicv_thread_multithreading mt) { \
396 return parallel_min_max(kleidicv_min_max_##suffix, src, src_stride, width, \
397 height, p_min_value, p_max_value, mt); \
398 }
399
400 60 DEFINE_KLEIDICV_THREAD_MIN_MAX(u8, uint8_t);
401 60 DEFINE_KLEIDICV_THREAD_MIN_MAX(s8, int8_t);
402 60 DEFINE_KLEIDICV_THREAD_MIN_MAX(u16, uint16_t);
403 60 DEFINE_KLEIDICV_THREAD_MIN_MAX(s16, int16_t);
404 60 DEFINE_KLEIDICV_THREAD_MIN_MAX(s32, int32_t);
405 68 DEFINE_KLEIDICV_THREAD_MIN_MAX(f32, float);
406
407 template <typename ScalarType, typename FunctionType>
408 60 kleidicv_error_t parallel_min_max_loc(FunctionType min_max_loc_func,
409 const ScalarType *src, size_t src_stride,
410 size_t width, size_t height,
411 size_t *p_min_offset,
412 size_t *p_max_offset,
413 kleidicv_thread_multithreading mt) {
414 60 std::vector<size_t> min_offsets(height, 0);
415 60 std::vector<size_t> max_offsets(height, 0);
416
417 180 auto callback = [&](unsigned begin, unsigned end) {
418 240 return min_max_loc_func(
419 120 src + begin * (src_stride / sizeof(ScalarType)), src_stride, width,
420
2/2
✓ Branch 0 taken 104 times.
✓ Branch 1 taken 16 times.
120 end - begin, p_min_offset ? min_offsets.data() + begin : nullptr,
421
2/2
✓ Branch 0 taken 104 times.
✓ Branch 1 taken 16 times.
120 p_max_offset ? max_offsets.data() + begin : nullptr);
422 };
423 60 auto return_val = parallel_batches(callback, mt, height);
424
425
2/2
✓ Branch 0 taken 52 times.
✓ Branch 1 taken 8 times.
60 if (p_min_offset) {
426 52 *p_min_offset = 0;
427
2/2
✓ Branch 0 taken 288 times.
✓ Branch 1 taken 52 times.
340 for (size_t i = 0; i < min_offsets.size(); ++i) {
428 288 size_t offs = min_offsets[i] + i * src_stride;
429
4/4
✓ Branch 0 taken 256 times.
✓ Branch 1 taken 32 times.
✓ Branch 2 taken 256 times.
✓ Branch 3 taken 32 times.
576 if (src[offs / sizeof(ScalarType)] <
430 288 src[*p_min_offset / sizeof(ScalarType)]) {
431 32 *p_min_offset = offs;
432 32 }
433 288 }
434 52 }
435
2/2
✓ Branch 0 taken 52 times.
✓ Branch 1 taken 8 times.
60 if (p_max_offset) {
436 52 *p_max_offset = 0;
437
2/2
✓ Branch 0 taken 288 times.
✓ Branch 1 taken 52 times.
340 for (size_t i = 0; i < max_offsets.size(); ++i) {
438 288 size_t offs = max_offsets[i] + i * src_stride;
439
4/4
✓ Branch 0 taken 230 times.
✓ Branch 1 taken 58 times.
✓ Branch 2 taken 230 times.
✓ Branch 3 taken 58 times.
576 if (src[offs / sizeof(ScalarType)] >
440 288 src[*p_max_offset / sizeof(ScalarType)]) {
441 58 *p_max_offset = offs;
442 58 }
443 288 }
444 52 }
445 60 return return_val;
446 60 }
447
448 #define DEFINE_KLEIDICV_THREAD_MIN_MAX_LOC(suffix, type) \
449 kleidicv_error_t kleidicv_thread_min_max_loc_##suffix( \
450 const type *src, size_t src_stride, size_t width, size_t height, \
451 size_t *p_min_offset, size_t *p_max_offset, \
452 kleidicv_thread_multithreading mt) { \
453 return parallel_min_max_loc(kleidicv_min_max_loc_##suffix, src, \
454 src_stride, width, height, p_min_offset, \
455 p_max_offset, mt); \
456 }
457
458 60 DEFINE_KLEIDICV_THREAD_MIN_MAX_LOC(u8, uint8_t);
459
460 308 kleidicv_error_t kleidicv_thread_gaussian_blur_u8(
461 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
462 size_t width, size_t height, size_t channels, size_t kernel_width,
463 size_t kernel_height, float sigma_x, float sigma_y,
464 kleidicv_border_type_t border_type, kleidicv_thread_multithreading mt) {
465 308 auto fixed_border_type = kleidicv::get_fixed_border_type(border_type);
466
4/4
✓ Branch 0 taken 304 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 260 times.
✓ Branch 3 taken 44 times.
308 if (!fixed_border_type ||
467 608 !kleidicv::gaussian_blur_is_implemented(width, height, kernel_width,
468 304 kernel_height, sigma_x, sigma_y,
469 304 channels, *fixed_border_type)) {
470 264 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
471 }
472
473
4/6
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 40 times.
✓ Branch 2 taken 4 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 4 times.
44 if (kernel_width <= 9 || kernel_width == 15 || kernel_width == 21) {
474 184 auto callback = [=](size_t y_begin, size_t y_end) {
475 288 return kleidicv_gaussian_blur_fixed_stripe_u8(
476 144 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
477 144 channels, kernel_width, kernel_height, sigma_x, sigma_y,
478 144 *fixed_border_type);
479 };
480 40 return parallel_batches(callback, mt, height);
481 40 }
482 24 auto callback = [=](size_t y_begin, size_t y_end) {
483 40 return kleidicv_gaussian_blur_arbitrary_stripe_u8(
484 20 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
485 20 channels, kernel_width, kernel_height, sigma_x, sigma_y,
486 20 *fixed_border_type);
487 };
488 4 return parallel_batches(callback, mt, height);
489 308 }
490
491 108 kleidicv_error_t kleidicv_thread_separable_filter_2d_u8(
492 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
493 size_t width, size_t height, size_t channels, const uint8_t *kernel_x,
494 size_t kernel_width, const uint8_t *kernel_y, size_t kernel_height,
495 kleidicv_border_type_t border_type, kleidicv_thread_multithreading mt) {
496
4/4
✓ Branch 0 taken 76 times.
✓ Branch 1 taken 32 times.
✓ Branch 2 taken 76 times.
✓ Branch 3 taken 32 times.
216 if (!kleidicv::separable_filter_2d_is_implemented(width, height, kernel_width,
497 108 kernel_height)) {
498 76 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
499 }
500
501 32 auto fixed_border_type = kleidicv::get_fixed_border_type(border_type);
502
2/2
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 4 times.
32 if (!fixed_border_type) {
503 4 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
504 }
505
506 112 auto callback = [=](size_t y_begin, size_t y_end) {
507 168 return kleidicv_separable_filter_2d_stripe_u8(
508 84 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
509 84 channels, kernel_x, kernel_width, kernel_y, kernel_height,
510 84 *fixed_border_type);
511 };
512 28 return parallel_batches(callback, mt, height);
513 108 }
514
515 108 kleidicv_error_t kleidicv_thread_separable_filter_2d_u16(
516 const uint16_t *src, size_t src_stride, uint16_t *dst, size_t dst_stride,
517 size_t width, size_t height, size_t channels, const uint16_t *kernel_x,
518 size_t kernel_width, const uint16_t *kernel_y, size_t kernel_height,
519 kleidicv_border_type_t border_type, kleidicv_thread_multithreading mt) {
520
4/4
✓ Branch 0 taken 76 times.
✓ Branch 1 taken 32 times.
✓ Branch 2 taken 76 times.
✓ Branch 3 taken 32 times.
216 if (!kleidicv::separable_filter_2d_is_implemented(width, height, kernel_width,
521 108 kernel_height)) {
522 76 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
523 }
524
525 32 auto fixed_border_type = kleidicv::get_fixed_border_type(border_type);
526
2/2
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 4 times.
32 if (!fixed_border_type) {
527 4 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
528 }
529
530 112 auto callback = [=](size_t y_begin, size_t y_end) {
531 168 return kleidicv_separable_filter_2d_stripe_u16(
532 84 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
533 84 channels, kernel_x, kernel_width, kernel_y, kernel_height,
534 84 *fixed_border_type);
535 };
536 28 return parallel_batches(callback, mt, height);
537 108 }
538
539 408 kleidicv_error_t kleidicv_thread_blur_and_downsample_u8(
540 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
541 uint8_t *dst, size_t dst_stride, size_t channels,
542 kleidicv_border_type_t border_type, kleidicv_thread_multithreading mt) {
543
4/4
✓ Branch 0 taken 292 times.
✓ Branch 1 taken 116 times.
✓ Branch 2 taken 292 times.
✓ Branch 3 taken 116 times.
816 if (!kleidicv::blur_and_downsample_is_implemented(src_width, src_height,
544 408 channels)) {
545 292 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
546 }
547
548 116 auto fixed_border_type = kleidicv::get_fixed_border_type(border_type);
549
2/2
✓ Branch 0 taken 112 times.
✓ Branch 1 taken 4 times.
116 if (!fixed_border_type) {
550 4 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
551 }
552
553 448 auto callback = [=](unsigned y_begin, unsigned y_end) {
554 672 return kleidicv_blur_and_downsample_stripe_u8(
555 336 src, src_stride, src_width, src_height, dst, dst_stride, y_begin, y_end,
556 336 channels, *fixed_border_type);
557 };
558 112 return parallel_batches(callback, mt, src_height);
559 408 }
560
561 204 kleidicv_error_t kleidicv_thread_sobel_3x3_horizontal_s16_u8(
562 const uint8_t *src, size_t src_stride, int16_t *dst, size_t dst_stride,
563 size_t width, size_t height, size_t channels,
564 kleidicv_thread_multithreading mt) {
565
2/2
✓ Branch 0 taken 92 times.
✓ Branch 1 taken 112 times.
204 if (!kleidicv::sobel_is_implemented(width, height, 3)) {
566 92 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
567 }
568
569 376 auto callback = [=](unsigned y_begin, unsigned y_end) {
570 528 return kleidicv_sobel_3x3_horizontal_stripe_s16_u8(
571 264 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
572 264 channels);
573 };
574 112 return parallel_batches(callback, mt, height);
575 204 }
576
577 532 kleidicv_error_t kleidicv_thread_median_blur_u8(
578 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
579 size_t width, size_t height, size_t channels, size_t kernel_width,
580 size_t kernel_height, kleidicv_border_type_t border_type,
581 kleidicv_thread_multithreading mt) {
582 1064 auto result_pair = kleidicv::median_blur_is_implemented(
583 532 src, src_stride, dst, dst_stride, width, height, channels, kernel_width,
584 532 kernel_height, border_type);
585
586 532 auto checks_result = result_pair.first;
587 532 auto fixed_border_type = result_pair.second;
588
2/2
✓ Branch 0 taken 416 times.
✓ Branch 1 taken 116 times.
532 if (checks_result != KLEIDICV_OK) {
589 416 return checks_result;
590 }
591
592
2/2
✓ Branch 0 taken 100 times.
✓ Branch 1 taken 16 times.
116 if (kernel_width <= 7) {
593 380 auto callback = [=](unsigned y_begin, unsigned y_end) {
594 560 return kleidicv_median_blur_sorting_network_stripe_u8(
595 280 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
596 280 channels, kernel_width, kernel_height, fixed_border_type);
597 };
598 100 return parallel_batches(callback, mt, height);
599 100 }
600
601
3/4
✓ Branch 0 taken 16 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 12 times.
16 if (kernel_width > 7 && kernel_width <= 15) {
602 72 auto callback = [=](unsigned y_begin, unsigned y_end) {
603 120 return kleidicv_median_blur_small_hist_stripe_u8(
604 60 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
605 60 channels, kernel_width, kernel_height, fixed_border_type);
606 };
607 12 return parallel_batches(callback, mt, height);
608 12 }
609
610 24 auto callback = [=](unsigned y_begin, unsigned y_end) {
611 40 return kleidicv_median_blur_large_hist_stripe_u8(
612 20 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
613 20 channels, kernel_width, kernel_height, fixed_border_type);
614 };
615 4 return parallel_batches(callback, mt, height);
616 532 }
617
618 532 kleidicv_error_t kleidicv_thread_median_blur_s16(
619 const int16_t *src, size_t src_stride, int16_t *dst, size_t dst_stride,
620 size_t width, size_t height, size_t channels, size_t kernel_width,
621 size_t kernel_height, kleidicv_border_type_t border_type,
622 kleidicv_thread_multithreading mt) {
623 1064 auto result_pair = kleidicv::median_blur_is_implemented(
624 532 src, src_stride, dst, dst_stride, width, height, channels, kernel_width,
625 532 kernel_height, border_type);
626
627 532 auto checks_result = result_pair.first;
628 532 auto fixed_border_type = result_pair.second;
629
2/2
✓ Branch 0 taken 432 times.
✓ Branch 1 taken 100 times.
532 if (checks_result != KLEIDICV_OK) {
630 432 return checks_result;
631 }
632
633 380 auto callback = [=](unsigned y_begin, unsigned y_end) {
634 560 return kleidicv_median_blur_sorting_network_stripe_s16(
635 280 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
636 280 channels, kernel_width, kernel_height, fixed_border_type);
637 };
638 100 return parallel_batches(callback, mt, height);
639 532 }
640
641 532 kleidicv_error_t kleidicv_thread_median_blur_u16(
642 const uint16_t *src, size_t src_stride, uint16_t *dst, size_t dst_stride,
643 size_t width, size_t height, size_t channels, size_t kernel_width,
644 size_t kernel_height, kleidicv_border_type_t border_type,
645 kleidicv_thread_multithreading mt) {
646 1064 auto result_pair = kleidicv::median_blur_is_implemented(
647 532 src, src_stride, dst, dst_stride, width, height, channels, kernel_width,
648 532 kernel_height, border_type);
649
650 532 auto checks_result = result_pair.first;
651 532 auto fixed_border_type = result_pair.second;
652
2/2
✓ Branch 0 taken 432 times.
✓ Branch 1 taken 100 times.
532 if (checks_result != KLEIDICV_OK) {
653 432 return checks_result;
654 }
655
656 380 auto callback = [=](unsigned y_begin, unsigned y_end) {
657 560 return kleidicv_median_blur_sorting_network_stripe_u16(
658 280 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
659 280 channels, kernel_width, kernel_height, fixed_border_type);
660 };
661 100 return parallel_batches(callback, mt, height);
662 532 }
663
664 532 kleidicv_error_t kleidicv_thread_median_blur_f32(
665 const float *src, size_t src_stride, float *dst, size_t dst_stride,
666 size_t width, size_t height, size_t channels, size_t kernel_width,
667 size_t kernel_height, kleidicv_border_type_t border_type,
668 kleidicv_thread_multithreading mt) {
669 1064 auto result_pair = kleidicv::median_blur_is_implemented(
670 532 src, src_stride, dst, dst_stride, width, height, channels, kernel_width,
671 532 kernel_height, border_type);
672
673 532 auto checks_result = result_pair.first;
674 532 auto fixed_border_type = result_pair.second;
675
2/2
✓ Branch 0 taken 432 times.
✓ Branch 1 taken 100 times.
532 if (checks_result != KLEIDICV_OK) {
676 432 return checks_result;
677 }
678
679 380 auto callback = [=](unsigned y_begin, unsigned y_end) {
680 560 return kleidicv_median_blur_sorting_network_stripe_f32(
681 280 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
682 280 channels, kernel_width, kernel_height, fixed_border_type);
683 };
684 100 return parallel_batches(callback, mt, height);
685 532 }
686
687 204 kleidicv_error_t kleidicv_thread_sobel_3x3_vertical_s16_u8(
688 const uint8_t *src, size_t src_stride, int16_t *dst, size_t dst_stride,
689 size_t width, size_t height, size_t channels,
690 kleidicv_thread_multithreading mt) {
691
2/2
✓ Branch 0 taken 92 times.
✓ Branch 1 taken 112 times.
204 if (!kleidicv::sobel_is_implemented(width, height, 3)) {
692 92 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
693 }
694
695 376 auto callback = [=](unsigned y_begin, unsigned y_end) {
696 528 return kleidicv_sobel_3x3_vertical_stripe_s16_u8(src, src_stride, dst,
697 264 dst_stride, width, height,
698 264 y_begin, y_end, channels);
699 };
700 112 return parallel_batches(callback, mt, height);
701 204 }
702
703 404 kleidicv_error_t kleidicv_thread_scharr_interleaved_s16_u8(
704 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
705 size_t src_channels, int16_t *dst, size_t dst_stride,
706 kleidicv_thread_multithreading mt) {
707
4/4
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 400 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 400 times.
808 if (!kleidicv::scharr_interleaved_is_implemented(src_width, src_height,
708 404 src_channels)) {
709 4 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
710 }
711
712 1264 auto callback = [=](unsigned y_begin, unsigned y_end) {
713 1728 return kleidicv_scharr_interleaved_stripe_s16_u8(
714 864 src, src_stride, src_width, src_height, src_channels, dst, dst_stride,
715 864 y_begin, y_end);
716 };
717
718 // height is decremented by 2 as the result has less rows.
719 400 return parallel_batches(callback, mt, src_height - 2);
720 404 }
721
722 505 kleidicv_error_t kleidicv_thread_resize_linear_u8(
723 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
724 uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
725 size_t channels, kleidicv_thread_multithreading mt) {
726
2/2
✓ Branch 0 taken 25 times.
✓ Branch 1 taken 480 times.
505 if (!kleidicv::resize_linear_u8_is_implemented(
727 505 src_width, src_height, dst_width, dst_height, channels)) {
728 25 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
729 }
730
731
2/2
✓ Branch 0 taken 160 times.
✓ Branch 1 taken 320 times.
480 if (dst_height > src_height) {
732 490 auto callback = [=](unsigned y_begin, unsigned y_end) {
733 330 return kleidicv_resize_linear_stripe_u8(
734 330 src, src_stride, src_width, src_height, y_begin,
735 330 std::min<size_t>(src_height, y_end + 1), dst, dst_stride, dst_width,
736 330 dst_height, channels);
737 };
738 160 return parallel_batches(callback, mt, std::max<size_t>(1, src_height - 1));
739 160 }
740 1000 auto callback = [=](unsigned y_begin, unsigned y_end) {
741 680 return kleidicv_resize_linear_stripe_u8(
742 680 src, src_stride, src_width, src_height, y_begin, y_end, dst, dst_stride,
743 680 dst_width, dst_height, channels);
744 };
745 320 return parallel_batches(callback, mt, dst_height);
746 505 }
747
748 250 kleidicv_error_t kleidicv_thread_resize_linear_f32(
749 const float *src, size_t src_stride, size_t src_width, size_t src_height,
750 float *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
751 size_t channels, kleidicv_thread_multithreading mt) {
752
2/2
✓ Branch 0 taken 10 times.
✓ Branch 1 taken 240 times.
250 if (!kleidicv::resize_linear_f32_is_implemented(
753 250 src_width, src_height, dst_width, dst_height, channels)) {
754 10 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
755 }
756 735 auto callback = [=](unsigned y_begin, unsigned y_end) {
757 990 return kleidicv_resize_linear_stripe_f32(
758 495 src, src_stride, src_width, src_height, y_begin,
759 495 std::min<size_t>(src_height, y_end + 1), dst, dst_stride, dst_width,
760 495 dst_height);
761 };
762 240 return parallel_batches(callback, mt, std::max<size_t>(1, src_height - 1));
763 250 }
764
765 208 kleidicv_error_t kleidicv_thread_remap_s16_u8(
766 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
767 uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
768 size_t channels, const int16_t *mapxy, size_t mapxy_stride,
769 kleidicv_border_type_t border_type, const uint8_t *border_value,
770 kleidicv_thread_multithreading mt) {
771
4/4
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 200 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 200 times.
416 if (!kleidicv::remap_s16_is_implemented<uint8_t>(src_stride, src_width,
772 208 src_height, dst_width,
773 208 border_type, channels)) {
774 8 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
775 }
776 632 auto callback = [=](unsigned begin, unsigned end) {
777 864 return kleidicv_remap_s16_u8(
778 432 src, src_stride, src_width, src_height,
779 432 dst + begin * dst_stride / sizeof(uint8_t), dst_stride, dst_width,
780 432 end - begin, channels,
781 432 mapxy + static_cast<ptrdiff_t>(begin * mapxy_stride / sizeof(int16_t)),
782 432 mapxy_stride, border_type, border_value);
783 };
784 200 return parallel_batches(callback, mt, dst_height);
785 208 }
786
787 208 kleidicv_error_t kleidicv_thread_remap_s16_u16(
788 const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height,
789 uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
790 size_t channels, const int16_t *mapxy, size_t mapxy_stride,
791 kleidicv_border_type_t border_type, const uint16_t *border_value,
792 kleidicv_thread_multithreading mt) {
793
4/4
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 200 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 200 times.
416 if (!kleidicv::remap_s16_is_implemented<uint16_t>(src_stride, src_width,
794 208 src_height, dst_width,
795 208 border_type, channels)) {
796 8 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
797 }
798 632 auto callback = [=](unsigned begin, unsigned end) {
799 864 return kleidicv_remap_s16_u16(
800 432 src, src_stride, src_width, src_height,
801 432 dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint16_t)),
802 432 dst_stride, dst_width, end - begin, channels,
803 432 mapxy + static_cast<ptrdiff_t>(begin * mapxy_stride / sizeof(int16_t)),
804 432 mapxy_stride, border_type, border_value);
805 };
806 200 return parallel_batches(callback, mt, dst_height);
807 208 }
808
809 408 kleidicv_error_t kleidicv_thread_remap_s16point5_u8(
810 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
811 uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
812 size_t channels, const int16_t *mapxy, size_t mapxy_stride,
813 const uint16_t *mapfrac, size_t mapfrac_stride,
814 kleidicv_border_type_t border_type, const uint8_t *border_value,
815 kleidicv_thread_multithreading mt) {
816
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 400 times.
408 if (!kleidicv::remap_s16point5_is_implemented<uint8_t>(
817 408 src_stride, src_width, src_height, dst_width, border_type,
818 408 channels)) {
819 8 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
820 }
821 1264 auto callback = [=](unsigned begin, unsigned end) {
822 1728 return kleidicv_remap_s16point5_u8(
823 864 src, src_stride, src_width, src_height,
824 864 dst + begin * dst_stride / sizeof(uint8_t), dst_stride, dst_width,
825 864 end - begin, channels,
826 864 mapxy + static_cast<ptrdiff_t>(begin * mapxy_stride / sizeof(int16_t)),
827 864 mapxy_stride,
828 1728 mapfrac +
829 864 static_cast<ptrdiff_t>(begin * mapfrac_stride / sizeof(uint16_t)),
830 864 mapfrac_stride, border_type, border_value);
831 };
832 400 return parallel_batches(callback, mt, dst_height);
833 408 }
834
835 408 kleidicv_error_t kleidicv_thread_remap_s16point5_u16(
836 const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height,
837 uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
838 size_t channels, const int16_t *mapxy, size_t mapxy_stride,
839 const uint16_t *mapfrac, size_t mapfrac_stride,
840 kleidicv_border_type_t border_type, const uint16_t *border_value,
841 kleidicv_thread_multithreading mt) {
842
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 400 times.
408 if (!kleidicv::remap_s16point5_is_implemented<uint16_t>(
843 408 src_stride, src_width, src_height, dst_width, border_type,
844 408 channels)) {
845 8 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
846 }
847 1264 auto callback = [=](unsigned begin, unsigned end) {
848 1728 return kleidicv_remap_s16point5_u16(
849 864 src, src_stride, src_width, src_height,
850 864 dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint16_t)),
851 864 dst_stride, dst_width, end - begin, channels,
852 864 mapxy + static_cast<ptrdiff_t>(begin * mapxy_stride / sizeof(int16_t)),
853 864 mapxy_stride,
854 1728 mapfrac +
855 864 static_cast<ptrdiff_t>(begin * mapfrac_stride / sizeof(uint16_t)),
856 864 mapfrac_stride, border_type, border_value);
857 };
858 400 return parallel_batches(callback, mt, dst_height);
859 408 }
860
861 408 kleidicv_error_t kleidicv_thread_remap_f32_u8(
862 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
863 uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
864 size_t channels, const float *mapx, size_t mapx_stride, const float *mapy,
865 size_t mapy_stride, kleidicv_interpolation_type_t interpolation,
866 kleidicv_border_type_t border_type, const uint8_t *border_value,
867 kleidicv_thread_multithreading mt) {
868
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 400 times.
408 if (!kleidicv::remap_f32_is_implemented<uint8_t>(
869 408 src_stride, src_width, src_height, dst_width, dst_height, border_type,
870 408 channels, interpolation)) {
871 8 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
872 }
873 1264 auto callback = [=](unsigned begin, unsigned end) {
874 1728 return kleidicv_remap_f32_u8(
875 864 src, src_stride, src_width, src_height,
876 864 dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint8_t)),
877 864 dst_stride, dst_width, end - begin, channels,
878 864 mapx + static_cast<ptrdiff_t>(begin * mapx_stride / sizeof(float)),
879 864 mapx_stride,
880 864 mapy + static_cast<ptrdiff_t>(begin * mapy_stride / sizeof(float)),
881 864 mapy_stride, interpolation, border_type, border_value);
882 };
883 400 return parallel_batches(callback, mt, dst_height);
884 408 }
885
886 408 kleidicv_error_t kleidicv_thread_remap_f32_u16(
887 const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height,
888 uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
889 size_t channels, const float *mapx, size_t mapx_stride, const float *mapy,
890 size_t mapy_stride, kleidicv_interpolation_type_t interpolation,
891 kleidicv_border_type_t border_type, const uint16_t *border_value,
892 kleidicv_thread_multithreading mt) {
893
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 400 times.
408 if (!kleidicv::remap_f32_is_implemented<uint16_t>(
894 408 src_stride, src_width, src_height, dst_width, dst_height, border_type,
895 408 channels, interpolation)) {
896 8 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
897 }
898 1264 auto callback = [=](unsigned begin, unsigned end) {
899 1728 return kleidicv_remap_f32_u16(
900 864 src, src_stride, src_width, src_height,
901 864 dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint16_t)),
902 864 dst_stride, dst_width, end - begin, channels,
903 864 mapx + static_cast<ptrdiff_t>(begin * mapx_stride / sizeof(float)),
904 864 mapx_stride,
905 864 mapy + static_cast<ptrdiff_t>(begin * mapy_stride / sizeof(float)),
906 864 mapy_stride, interpolation, border_type, border_value);
907 };
908 400 return parallel_batches(callback, mt, dst_height);
909 408 }
910
911 216 kleidicv_error_t kleidicv_thread_warp_perspective_u8(
912 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
913 uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
914 const float transformation[9], size_t channels,
915 kleidicv_interpolation_type_t interpolation,
916 kleidicv_border_type_t border_type, const uint8_t *border_value,
917 kleidicv_thread_multithreading mt) {
918
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 200 times.
216 if (!kleidicv::warp_perspective_is_implemented<uint8_t>(
919 216 dst_width, channels, interpolation, border_type)) {
920 16 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
921 }
922
923 632 auto callback = [=](unsigned y_begin, unsigned y_end) {
924 864 return kleidicv_warp_perspective_stripe_u8(
925 432 src, src_stride, src_width, src_height, dst, dst_stride, dst_width,
926 432 dst_height, y_begin, std::min<size_t>(dst_height, y_end + 1),
927 432 transformation, channels, interpolation, border_type, border_value);
928 };
929 200 return parallel_batches(callback, mt, dst_height);
930 216 }
931