KleidiCV Coverage Report


Directory: ./
File: kleidicv_thread/src/kleidicv_thread.cpp
Date: 2026-01-20 20:58:59
Exec Total Coverage
Lines: 581 581 100.0%
Functions: 316 316 100.0%
Branches: 362 374 96.8%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include "kleidicv_thread/kleidicv_thread.h"
6
7 #include <algorithm>
8 #include <cstddef>
9 #include <cstdint>
10 #include <functional>
11 #include <limits>
12 #include <vector>
13
14 #include "kleidicv/arithmetics/rotate.h"
15 #include "kleidicv/arithmetics/scale.h"
16 #include "kleidicv/conversions/rgb_to_yuv.h"
17 #include "kleidicv/conversions/yuv_to_rgb.h"
18 #include "kleidicv/ctypes.h"
19 #include "kleidicv/filters/blur_and_downsample.h"
20 #include "kleidicv/filters/gaussian_blur.h"
21 #include "kleidicv/filters/median_blur.h"
22 #include "kleidicv/filters/scharr.h"
23 #include "kleidicv/filters/separable_filter_2d.h"
24 #include "kleidicv/filters/sobel.h"
25 #include "kleidicv/kleidicv.h"
26 #include "kleidicv/resize/resize_linear.h"
27 #include "kleidicv/transform/remap.h"
28 #include "kleidicv/transform/warp_perspective.h"
29
30 typedef std::function<kleidicv_error_t(unsigned, unsigned)> FunctionCallback;
31
32 30790 static kleidicv_error_t kleidicv_thread_std_function_callback(
33 unsigned task_begin, unsigned task_end, void *data) {
34 30790 auto *callback = reinterpret_cast<FunctionCallback *>(data);
35 61580 return (*callback)(task_begin, task_end);
36 30790 }
37
38 // Operations in the Neon backend have both a vector path and a scalar path.
39 // The vector path is used to process most data and the scalar path is used to
40 // process the parts of the data that don't fit into the vector width.
41 // For floating point operations in particular, the results may be very slightly
42 // different between vector and scalar paths.
43 //
44 // When using multithreading, images are divided into parts to be processed by
45 // each thread, and this could change which parts of the data end up being
46 // processed by the vector and scalar paths.
47 //
48 // If an implementation is sensitive to these very slight differences, set
49 // min_batch_size to the Neon vector length (16 bytes). That makes every batch
50 // handed to a thread a multiple of the vector width; only the final batch may
51 // be longer to reach the end of the data. No batch can be shorter than vector
52 // length because that could change behaviour for operations that try to avoid
53 // the tail loop (see the TryToAvoidTailLoop class).
54 // This technique only works if the data is longer than vector length.
55 //
56 // On the other hand, measurements showed that increasing the batch size can
57 // cause degradation of the multithreaded performance.
58 template <typename Callback>
59 14344 inline kleidicv_error_t parallel_batches(Callback callback,
60 kleidicv_thread_multithreading mt,
61 unsigned count,
62 unsigned min_batch_size = 1) {
63 14344 const unsigned task_count = std::max(1U, (count) / min_batch_size);
64 45134 FunctionCallback f = [=](unsigned task_begin, unsigned task_end) {
65 30790 unsigned begin = task_begin * min_batch_size,
66 30790 end = task_end * min_batch_size;
67
126/126
✓ Branch 0 taken 1160 times.
✓ Branch 1 taken 1000 times.
✓ Branch 2 taken 328 times.
✓ Branch 3 taken 120 times.
✓ Branch 4 taken 116 times.
✓ Branch 5 taken 100 times.
✓ Branch 6 taken 116 times.
✓ Branch 7 taken 100 times.
✓ Branch 8 taken 116 times.
✓ Branch 9 taken 100 times.
✓ Branch 10 taken 116 times.
✓ Branch 11 taken 100 times.
✓ Branch 12 taken 232 times.
✓ Branch 13 taken 200 times.
✓ Branch 14 taken 116 times.
✓ Branch 15 taken 100 times.
✓ Branch 16 taken 128 times.
✓ Branch 17 taken 112 times.
✓ Branch 18 taken 328 times.
✓ Branch 19 taken 120 times.
✓ Branch 20 taken 116 times.
✓ Branch 21 taken 100 times.
✓ Branch 22 taken 348 times.
✓ Branch 23 taken 300 times.
✓ Branch 24 taken 696 times.
✓ Branch 25 taken 600 times.
✓ Branch 26 taken 348 times.
✓ Branch 27 taken 300 times.
✓ Branch 28 taken 348 times.
✓ Branch 29 taken 300 times.
✓ Branch 30 taken 348 times.
✓ Branch 31 taken 300 times.
✓ Branch 32 taken 232 times.
✓ Branch 33 taken 200 times.
✓ Branch 34 taken 232 times.
✓ Branch 35 taken 200 times.
✓ Branch 36 taken 232 times.
✓ Branch 37 taken 200 times.
✓ Branch 38 taken 116 times.
✓ Branch 39 taken 100 times.
✓ Branch 40 taken 116 times.
✓ Branch 41 taken 100 times.
✓ Branch 42 taken 116 times.
✓ Branch 43 taken 100 times.
✓ Branch 44 taken 116 times.
✓ Branch 45 taken 100 times.
✓ Branch 46 taken 116 times.
✓ Branch 47 taken 100 times.
✓ Branch 48 taken 116 times.
✓ Branch 49 taken 100 times.
✓ Branch 50 taken 96 times.
✓ Branch 51 taken 160 times.
✓ Branch 52 taken 2325 times.
✓ Branch 53 taken 1895 times.
✓ Branch 54 taken 820 times.
✓ Branch 55 taken 865 times.
✓ Branch 56 taken 720 times.
✓ Branch 57 taken 800 times.
✓ Branch 58 taken 740 times.
✓ Branch 59 taken 820 times.
✓ Branch 60 taken 520 times.
✓ Branch 61 taken 720 times.
✓ Branch 62 taken 60 times.
✓ Branch 63 taken 60 times.
✓ Branch 64 taken 60 times.
✓ Branch 65 taken 60 times.
✓ Branch 66 taken 60 times.
✓ Branch 67 taken 60 times.
✓ Branch 68 taken 60 times.
✓ Branch 69 taken 60 times.
✓ Branch 70 taken 60 times.
✓ Branch 71 taken 60 times.
✓ Branch 72 taken 76 times.
✓ Branch 73 taken 68 times.
✓ Branch 74 taken 60 times.
✓ Branch 75 taken 60 times.
✓ Branch 76 taken 56 times.
✓ Branch 77 taken 28 times.
✓ Branch 78 taken 16 times.
✓ Branch 79 taken 4 times.
✓ Branch 80 taken 56 times.
✓ Branch 81 taken 28 times.
✓ Branch 82 taken 56 times.
✓ Branch 83 taken 28 times.
✓ Branch 84 taken 56 times.
✓ Branch 85 taken 28 times.
✓ Branch 86 taken 56 times.
✓ Branch 87 taken 28 times.
✓ Branch 88 taken 152 times.
✓ Branch 89 taken 112 times.
✓ Branch 90 taken 180 times.
✓ Branch 91 taken 100 times.
✓ Branch 92 taken 48 times.
✓ Branch 93 taken 12 times.
✓ Branch 94 taken 16 times.
✓ Branch 95 taken 4 times.
✓ Branch 96 taken 180 times.
✓ Branch 97 taken 100 times.
✓ Branch 98 taken 180 times.
✓ Branch 99 taken 100 times.
✓ Branch 100 taken 180 times.
✓ Branch 101 taken 100 times.
✓ Branch 102 taken 152 times.
✓ Branch 103 taken 112 times.
✓ Branch 104 taken 116 times.
✓ Branch 105 taken 100 times.
✓ Branch 106 taken 110 times.
✓ Branch 107 taken 120 times.
✓ Branch 108 taken 110 times.
✓ Branch 109 taken 120 times.
✓ Branch 110 taken 165 times.
✓ Branch 111 taken 180 times.
✓ Branch 112 taken 232 times.
✓ Branch 113 taken 200 times.
✓ Branch 114 taken 232 times.
✓ Branch 115 taken 200 times.
✓ Branch 116 taken 464 times.
✓ Branch 117 taken 400 times.
✓ Branch 118 taken 464 times.
✓ Branch 119 taken 400 times.
✓ Branch 120 taken 464 times.
✓ Branch 121 taken 400 times.
✓ Branch 122 taken 464 times.
✓ Branch 123 taken 400 times.
✓ Branch 124 taken 232 times.
✓ Branch 125 taken 200 times.
30790 if (task_end == task_count) {
68 14344 end = count;
69 14344 }
70 61580 return callback(begin, end);
71 30790 };
72 28688 return mt.parallel(kleidicv_thread_std_function_callback, &f,
73 14344 mt.parallel_data, task_count);
74 14344 }
75
76 template <typename SrcT, typename DstT, typename F, typename... Args>
77 4047 inline kleidicv_error_t kleidicv_thread_unary_op_impl(
78 F f, kleidicv_thread_multithreading mt, const SrcT *src, size_t src_stride,
79 DstT *dst, size_t dst_stride, size_t width, size_t height, Args... args) {
80 13291 auto callback = [=](unsigned begin, unsigned end) {
81 18488 return f(src + static_cast<ptrdiff_t>(begin * src_stride / sizeof(SrcT)),
82 9244 src_stride,
83 9244 dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(DstT)),
84 9244 dst_stride, width, end - begin, args...);
85 };
86 8094 return parallel_batches(callback, mt, height);
87 4047 }
88
89 template <typename SrcT, typename DstT, typename F, typename... Args>
90 3000 inline kleidicv_error_t kleidicv_thread_binary_op_impl(
91 F f, kleidicv_thread_multithreading mt, const SrcT *src_a,
92 size_t src_a_stride, const SrcT *src_b, size_t src_b_stride, DstT *dst,
93 size_t dst_stride, size_t width, size_t height, Args... args) {
94 9480 auto callback = [=](unsigned begin, unsigned end) {
95 12960 return f(
96 6480 src_a + static_cast<ptrdiff_t>(begin * src_a_stride / sizeof(SrcT)),
97 6480 src_a_stride,
98 6480 src_b + static_cast<ptrdiff_t>(begin * src_b_stride / sizeof(SrcT)),
99 6480 src_b_stride,
100 6480 dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(DstT)),
101 6480 dst_stride, width, end - begin, args...);
102 };
103 6000 return parallel_batches(callback, mt, height);
104 3000 }
105
106 #define KLEIDICV_THREAD_UNARY_OP_IMPL(suffix, src_type, dst_type) \
107 kleidicv_error_t kleidicv_thread_##suffix( \
108 const src_type *src, size_t src_stride, dst_type *dst, \
109 size_t dst_stride, size_t width, size_t height, \
110 kleidicv_thread_multithreading mt) { \
111 return kleidicv_thread_unary_op_impl(kleidicv_##suffix, mt, src, \
112 src_stride, dst, dst_stride, width, \
113 height); \
114 }
115
116 100 KLEIDICV_THREAD_UNARY_OP_IMPL(gray_to_rgb_u8, uint8_t, uint8_t);
117 100 KLEIDICV_THREAD_UNARY_OP_IMPL(gray_to_rgba_u8, uint8_t, uint8_t);
118 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_bgr_u8, uint8_t, uint8_t);
119 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_rgb_u8, uint8_t, uint8_t);
120 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_bgra_u8, uint8_t, uint8_t);
121 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_rgba_u8, uint8_t, uint8_t);
122 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_bgra_u8, uint8_t, uint8_t);
123 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_rgba_u8, uint8_t, uint8_t);
124 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_bgr_u8, uint8_t, uint8_t);
125 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_rgb_u8, uint8_t, uint8_t);
126 120 KLEIDICV_THREAD_UNARY_OP_IMPL(exp_f32, float, float);
127 100 KLEIDICV_THREAD_UNARY_OP_IMPL(f32_to_s8, float, int8_t);
128 100 KLEIDICV_THREAD_UNARY_OP_IMPL(f32_to_u8, float, uint8_t);
129 100 KLEIDICV_THREAD_UNARY_OP_IMPL(s8_to_f32, int8_t, float);
130 100 KLEIDICV_THREAD_UNARY_OP_IMPL(u8_to_f32, uint8_t, float);
131
132 #define KLEIDICV_THREAD_INRANGE_OP_IMPL(suffix, src_type, dst_type) \
133 kleidicv_error_t kleidicv_thread_##suffix( \
134 const src_type *src, size_t src_stride, dst_type *dst, \
135 size_t dst_stride, size_t width, size_t height, src_type lower_bound, \
136 src_type upper_bound, kleidicv_thread_multithreading mt) { \
137 return kleidicv_thread_unary_op_impl(kleidicv_##suffix, mt, src, \
138 src_stride, dst, dst_stride, width, \
139 height, lower_bound, upper_bound); \
140 }
141
142 100 KLEIDICV_THREAD_INRANGE_OP_IMPL(in_range_u8, uint8_t, uint8_t);
143 100 KLEIDICV_THREAD_INRANGE_OP_IMPL(in_range_f32, float, uint8_t);
144
145 100 kleidicv_error_t kleidicv_thread_threshold_binary_u8(
146 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
147 size_t width, size_t height, uint8_t threshold, uint8_t value,
148 kleidicv_thread_multithreading mt) {
149 200 return kleidicv_thread_unary_op_impl(kleidicv_threshold_binary_u8, mt, src,
150 100 src_stride, dst, dst_stride, width,
151 100 height, threshold, value);
152 }
153
154 116 kleidicv_error_t kleidicv_thread_scale_u8(const uint8_t *src, size_t src_stride,
155 uint8_t *dst, size_t dst_stride,
156 size_t width, size_t height,
157 double scale, double shift,
158 kleidicv_thread_multithreading mt) {
159
2/4
✗ Branch 0 not taken.
✓ Branch 1 taken 116 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 116 times.
116 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
160
2/4
✗ Branch 0 not taken.
✓ Branch 1 taken 116 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 116 times.
116 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
161
5/6
✗ Branch 0 not taken.
✓ Branch 1 taken 116 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 112 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 112 times.
116 CHECK_IMAGE_SIZE(width, height);
162
163 112 const std::array<uint8_t, 256> precalculated_table =
164 112 kleidicv::neon::precalculate_scale_table_u8(scale, shift);
165 112 return kleidicv_thread_unary_op_impl(
166 112 kleidicv::neon::scale_with_precalculated_table_u8, mt, src, src_stride,
167 112 dst, dst_stride, width, height, scale, shift, precalculated_table);
168 116 }
169
170 120 kleidicv_error_t kleidicv_thread_scale_f32(const float *src, size_t src_stride,
171 float *dst, size_t dst_stride,
172 size_t width, size_t height,
173 double scale, double shift,
174 kleidicv_thread_multithreading mt) {
175 240 return kleidicv_thread_unary_op_impl(kleidicv_scale_f32, mt, src, src_stride,
176 120 dst, dst_stride, width, height, scale,
177 120 shift);
178 }
179
180 100 kleidicv_error_t kleidicv_thread_scale_u8_f16(
181 const uint8_t *src, size_t src_stride, float16_t *dst, size_t dst_stride,
182 size_t width, size_t height, double scale, double shift,
183 kleidicv_thread_multithreading mt) {
184 200 return kleidicv_thread_unary_op_impl(kleidicv_scale_u8_f16, mt, src,
185 100 src_stride, dst, dst_stride, width,
186 100 height, scale, shift);
187 }
188
189 #define KLEIDICV_THREAD_BINARY_OP_IMPL(suffix, type) \
190 kleidicv_error_t kleidicv_thread_##suffix( \
191 const type *src_a, size_t src_a_stride, const type *src_b, \
192 size_t src_b_stride, type *dst, size_t dst_stride, size_t width, \
193 size_t height, kleidicv_thread_multithreading mt) { \
194 return kleidicv_thread_binary_op_impl(kleidicv_##suffix, mt, src_a, \
195 src_a_stride, src_b, src_b_stride, \
196 dst, dst_stride, width, height); \
197 }
198
199 #define KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(suffix, type, scaletype) \
200 kleidicv_error_t kleidicv_thread_##suffix( \
201 const type *src_a, size_t src_a_stride, const type *src_b, \
202 size_t src_b_stride, type *dst, size_t dst_stride, size_t width, \
203 size_t height, scaletype scale, kleidicv_thread_multithreading mt) { \
204 return kleidicv_thread_binary_op_impl( \
205 kleidicv_##suffix, mt, src_a, src_a_stride, src_b, src_b_stride, dst, \
206 dst_stride, width, height, scale); \
207 }
208
209 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s8, int8_t);
210 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u8, uint8_t);
211 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s16, int16_t);
212 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u16, uint16_t);
213 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s32, int32_t);
214 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u32, uint32_t);
215 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s64, int64_t);
216 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u64, uint64_t);
217 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s8, int8_t);
218 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u8, uint8_t);
219 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s16, int16_t);
220 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u16, uint16_t);
221 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s32, int32_t);
222 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u32, uint32_t);
223 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s64, int64_t);
224 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u64, uint64_t);
225 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_u8, uint8_t);
226 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s8, int8_t);
227 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_u16, uint16_t);
228 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s16, int16_t);
229 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s32, int32_t);
230 100 KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_u8, uint8_t, double);
231 100 KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s8, int8_t, double);
232 100 KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_u16, uint16_t, double);
233 100 KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s16, int16_t, double);
234 100 KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s32, int32_t, double);
235 100 KLEIDICV_THREAD_BINARY_OP_IMPL(bitwise_and, uint8_t);
236 100 KLEIDICV_THREAD_BINARY_OP_IMPL(compare_equal_u8, uint8_t);
237 100 KLEIDICV_THREAD_BINARY_OP_IMPL(compare_greater_u8, uint8_t);
238
239 100 kleidicv_error_t kleidicv_thread_saturating_add_abs_with_threshold_s16(
240 const int16_t *src_a, size_t src_a_stride, const int16_t *src_b,
241 size_t src_b_stride, int16_t *dst, size_t dst_stride, size_t width,
242 size_t height, int16_t threshold, kleidicv_thread_multithreading mt) {
243 100 return kleidicv_thread_binary_op_impl(
244 100 kleidicv_saturating_add_abs_with_threshold_s16, mt, src_a, src_a_stride,
245 100 src_b, src_b_stride, dst, dst_stride, width, height, threshold);
246 }
247
248 172 kleidicv_error_t kleidicv_thread_rotate(const void *src, size_t src_stride,
249 size_t width, size_t height, void *dst,
250 size_t dst_stride, int angle,
251 size_t element_size,
252 kleidicv_thread_multithreading mt) {
253
2/2
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 160 times.
172 if (!kleidicv::rotate_is_implemented(src, dst, angle, element_size)) {
254 12 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
255 }
256 // reading in columns and writing out rows tends to perform better
257 416 auto callback = [=](unsigned begin, unsigned end) {
258 512 return kleidicv_rotate(
259 256 static_cast<const uint8_t *>(src) + begin * element_size, src_stride,
260 256 end - begin, height, static_cast<uint8_t *>(dst) + begin * dst_stride,
261 256 dst_stride, angle, element_size);
262 };
263 160 return parallel_batches(callback, mt, width, 64);
264 172 }
265
266 2045 kleidicv_error_t kleidicv_thread_yuv_to_rgb_u8(
267 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
268 size_t width, size_t height, kleidicv_color_conversion_t color_format,
269 kleidicv_thread_multithreading mt) {
270 // Extract the base format
271 4090 const size_t base_format = static_cast<size_t>(
272 2045 color_format & KLEIDICV_COLOR_CONVERSION_YUV_FMT_MASK);
273
2/2
✓ Branch 0 taken 400 times.
✓ Branch 1 taken 1645 times.
2045 if (base_format == KLEIDICV_COLOR_CONVERSION_FMT_YUV444) {
274 800 return kleidicv_thread_unary_op_impl(kleidicv_yuv444_to_rgb_u8, mt, src,
275 400 src_stride, dst, dst_stride, width,
276 400 height, color_format);
277 }
278
279
2/2
✓ Branch 0 taken 780 times.
✓ Branch 1 taken 865 times.
1645 if (base_format == KLEIDICV_COLOR_CONVERSION_FMT_YUV422) {
280 1560 return kleidicv_thread_unary_op_impl(kleidicv_yuv422_to_rgb_u8, mt, src,
281 780 src_stride, dst, dst_stride, width,
282 780 height, color_format);
283 }
284
285 2550 auto callback = [=](unsigned begin, unsigned end) {
286 3370 return kleidicv_yuv420p_to_rgb_stripe_u8(
287 1685 src, src_stride, dst, dst_stride, width, height, color_format,
288 1685 static_cast<size_t>(begin), static_cast<size_t>(end));
289 };
290 865 return parallel_batches(callback, mt, (height + 1) / 2);
291 2045 }
292
293 800 kleidicv_error_t kleidicv_thread_rgb_to_yuv_semiplanar_u8(
294 const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride,
295 uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height,
296 kleidicv_color_conversion_t color_format,
297 kleidicv_thread_multithreading mt) {
298 2320 auto callback = [=](unsigned begin, unsigned end) {
299 3040 return kleidicv_rgb_to_yuv420sp_stripe_u8(
300 1520 src, src_stride, y_dst, y_stride, uv_dst, uv_stride, width, height,
301 1520 color_format, static_cast<size_t>(begin), static_cast<size_t>(end));
302 };
303 1600 return parallel_batches(callback, mt, (height + 1) / 2);
304 800 }
305
306 1535 kleidicv_error_t kleidicv_thread_rgb_to_yuv_u8(
307 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
308 size_t width, size_t height, kleidicv_color_conversion_t color_format,
309 kleidicv_thread_multithreading mt) {
310 // Extract the base format
311 3070 const size_t base_format = static_cast<size_t>(
312 1535 color_format & KLEIDICV_COLOR_CONVERSION_YUV_FMT_MASK);
313
2/2
✓ Branch 0 taken 400 times.
✓ Branch 1 taken 1135 times.
1535 if (base_format == KLEIDICV_COLOR_CONVERSION_FMT_YUV444) {
314 800 return kleidicv_thread_unary_op_impl(kleidicv_rgb_to_yuv444_u8, mt, src,
315 400 src_stride, dst, dst_stride, width,
316 400 height, color_format);
317 }
318
319
2/2
✓ Branch 0 taken 315 times.
✓ Branch 1 taken 820 times.
1135 if (base_format == KLEIDICV_COLOR_CONVERSION_FMT_YUV422) {
320 630 return kleidicv_thread_unary_op_impl(kleidicv_rgb_to_yuv422_u8, mt, src,
321 315 src_stride, dst, dst_stride, width,
322 315 height, color_format);
323 }
324
325 2380 auto callback = [=](unsigned begin, unsigned end) {
326 3120 return kleidicv_rgb_to_yuv420p_stripe_u8(
327 1560 src, src_stride, dst, dst_stride, width, height, color_format,
328 1560 static_cast<size_t>(begin), static_cast<size_t>(end));
329 };
330 820 return parallel_batches(callback, mt, (height + 1) / 2);
331 1535 }
332
333 720 kleidicv_error_t kleidicv_thread_yuv_semiplanar_to_rgb_u8(
334 const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv,
335 size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width,
336 size_t height, kleidicv_color_conversion_t color_format,
337 kleidicv_thread_multithreading mt) {
338 1960 auto callback = [=](unsigned begin, unsigned end) {
339 1240 size_t row_begin = size_t{begin} * 2;
340 1240 size_t row_end = std::min<size_t>(height, size_t{end} * 2);
341 1240 size_t row_uv = begin;
342 3720 return kleidicv_yuv_semiplanar_to_rgb_u8(
343 1240 src_y + row_begin * src_y_stride, src_y_stride,
344 1240 src_uv + row_uv * src_uv_stride, src_uv_stride,
345 1240 dst + row_begin * dst_stride, dst_stride, width, row_end - row_begin,
346 1240 color_format);
347 1240 };
348 1440 return parallel_batches(callback, mt, (height + 1) / 2);
349 720 }
350
351 template <typename ScalarType, typename FunctionType>
352 368 kleidicv_error_t parallel_min_max(FunctionType min_max_func,
353 const ScalarType *src, size_t src_stride,
354 size_t width, size_t height,
355 ScalarType *p_min_value,
356 ScalarType *p_max_value,
357 kleidicv_thread_multithreading mt) {
358 736 std::vector<ScalarType> min_values(height,
359 368 std::numeric_limits<ScalarType>::max());
360 736 std::vector<ScalarType> max_values(height,
361 368 std::numeric_limits<ScalarType>::lowest());
362
363 1112 auto callback = [&](unsigned begin, unsigned end) {
364 1488 return min_max_func(src + begin * (src_stride / sizeof(ScalarType)),
365 744 src_stride, width, end - begin,
366
12/12
✓ Branch 0 taken 104 times.
✓ Branch 1 taken 16 times.
✓ Branch 2 taken 104 times.
✓ Branch 3 taken 16 times.
✓ Branch 4 taken 104 times.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 104 times.
✓ Branch 7 taken 16 times.
✓ Branch 8 taken 104 times.
✓ Branch 9 taken 16 times.
✓ Branch 10 taken 128 times.
✓ Branch 11 taken 16 times.
744 p_min_value ? min_values.data() + begin : nullptr,
367
12/12
✓ Branch 0 taken 104 times.
✓ Branch 1 taken 16 times.
✓ Branch 2 taken 104 times.
✓ Branch 3 taken 16 times.
✓ Branch 4 taken 104 times.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 104 times.
✓ Branch 7 taken 16 times.
✓ Branch 8 taken 104 times.
✓ Branch 9 taken 16 times.
✓ Branch 10 taken 128 times.
✓ Branch 11 taken 16 times.
744 p_max_value ? max_values.data() + begin : nullptr);
368 };
369
370 368 auto return_val = parallel_batches(callback, mt, height);
371
372
12/12
✓ Branch 0 taken 52 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 52 times.
✓ Branch 3 taken 8 times.
✓ Branch 4 taken 52 times.
✓ Branch 5 taken 8 times.
✓ Branch 6 taken 52 times.
✓ Branch 7 taken 8 times.
✓ Branch 8 taken 52 times.
✓ Branch 9 taken 8 times.
✓ Branch 10 taken 60 times.
✓ Branch 11 taken 8 times.
368 if (p_min_value) {
373 320 *p_min_value = std::numeric_limits<ScalarType>::max();
374
12/12
✓ Branch 0 taken 288 times.
✓ Branch 1 taken 52 times.
✓ Branch 2 taken 288 times.
✓ Branch 3 taken 52 times.
✓ Branch 4 taken 288 times.
✓ Branch 5 taken 52 times.
✓ Branch 6 taken 288 times.
✓ Branch 7 taken 52 times.
✓ Branch 8 taken 288 times.
✓ Branch 9 taken 52 times.
✓ Branch 10 taken 368 times.
✓ Branch 11 taken 60 times.
2128 for (ScalarType m : min_values) {
375
12/12
✓ Branch 0 taken 214 times.
✓ Branch 1 taken 74 times.
✓ Branch 2 taken 214 times.
✓ Branch 3 taken 74 times.
✓ Branch 4 taken 214 times.
✓ Branch 5 taken 74 times.
✓ Branch 6 taken 214 times.
✓ Branch 7 taken 74 times.
✓ Branch 8 taken 214 times.
✓ Branch 9 taken 74 times.
✓ Branch 10 taken 284 times.
✓ Branch 11 taken 84 times.
1808 if (m < *p_min_value) {
376 454 *p_min_value = m;
377 454 }
378 1808 }
379 320 }
380
12/12
✓ Branch 0 taken 52 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 52 times.
✓ Branch 3 taken 8 times.
✓ Branch 4 taken 52 times.
✓ Branch 5 taken 8 times.
✓ Branch 6 taken 52 times.
✓ Branch 7 taken 8 times.
✓ Branch 8 taken 52 times.
✓ Branch 9 taken 8 times.
✓ Branch 10 taken 60 times.
✓ Branch 11 taken 8 times.
368 if (p_max_value) {
381 320 *p_max_value = std::numeric_limits<ScalarType>::lowest();
382
12/12
✓ Branch 0 taken 288 times.
✓ Branch 1 taken 52 times.
✓ Branch 2 taken 288 times.
✓ Branch 3 taken 52 times.
✓ Branch 4 taken 288 times.
✓ Branch 5 taken 52 times.
✓ Branch 6 taken 288 times.
✓ Branch 7 taken 52 times.
✓ Branch 8 taken 288 times.
✓ Branch 9 taken 52 times.
✓ Branch 10 taken 368 times.
✓ Branch 11 taken 60 times.
2128 for (ScalarType m : max_values) {
383
12/12
✓ Branch 0 taken 215 times.
✓ Branch 1 taken 73 times.
✓ Branch 2 taken 215 times.
✓ Branch 3 taken 73 times.
✓ Branch 4 taken 214 times.
✓ Branch 5 taken 74 times.
✓ Branch 6 taken 214 times.
✓ Branch 7 taken 74 times.
✓ Branch 8 taken 214 times.
✓ Branch 9 taken 74 times.
✓ Branch 10 taken 283 times.
✓ Branch 11 taken 85 times.
1808 if (m > *p_max_value) {
384 453 *p_max_value = m;
385 453 }
386 1808 }
387 320 }
388 368 return return_val;
389 368 }
390
391 #define DEFINE_KLEIDICV_THREAD_MIN_MAX(suffix, type) \
392 kleidicv_error_t kleidicv_thread_min_max_##suffix( \
393 const type *src, size_t src_stride, size_t width, size_t height, \
394 type *p_min_value, type *p_max_value, \
395 kleidicv_thread_multithreading mt) { \
396 return parallel_min_max(kleidicv_min_max_##suffix, src, src_stride, width, \
397 height, p_min_value, p_max_value, mt); \
398 }
399
400 60 DEFINE_KLEIDICV_THREAD_MIN_MAX(u8, uint8_t);
401 60 DEFINE_KLEIDICV_THREAD_MIN_MAX(s8, int8_t);
402 60 DEFINE_KLEIDICV_THREAD_MIN_MAX(u16, uint16_t);
403 60 DEFINE_KLEIDICV_THREAD_MIN_MAX(s16, int16_t);
404 60 DEFINE_KLEIDICV_THREAD_MIN_MAX(s32, int32_t);
405 68 DEFINE_KLEIDICV_THREAD_MIN_MAX(f32, float);
406
407 template <typename ScalarType, typename FunctionType>
408 60 kleidicv_error_t parallel_min_max_loc(FunctionType min_max_loc_func,
409 const ScalarType *src, size_t src_stride,
410 size_t width, size_t height,
411 size_t *p_min_offset,
412 size_t *p_max_offset,
413 kleidicv_thread_multithreading mt) {
414 60 std::vector<size_t> min_offsets(height, 0);
415 60 std::vector<size_t> max_offsets(height, 0);
416
417 180 auto callback = [&](unsigned begin, unsigned end) {
418 240 return min_max_loc_func(
419 120 src + begin * (src_stride / sizeof(ScalarType)), src_stride, width,
420
2/2
✓ Branch 0 taken 104 times.
✓ Branch 1 taken 16 times.
120 end - begin, p_min_offset ? min_offsets.data() + begin : nullptr,
421
2/2
✓ Branch 0 taken 104 times.
✓ Branch 1 taken 16 times.
120 p_max_offset ? max_offsets.data() + begin : nullptr);
422 };
423 60 auto return_val = parallel_batches(callback, mt, height);
424
425
2/2
✓ Branch 0 taken 52 times.
✓ Branch 1 taken 8 times.
60 if (p_min_offset) {
426 52 *p_min_offset = 0;
427
2/2
✓ Branch 0 taken 288 times.
✓ Branch 1 taken 52 times.
340 for (size_t i = 0; i < min_offsets.size(); ++i) {
428 288 size_t offs = min_offsets[i] + i * src_stride;
429
4/4
✓ Branch 0 taken 239 times.
✓ Branch 1 taken 49 times.
✓ Branch 2 taken 239 times.
✓ Branch 3 taken 49 times.
576 if (src[offs / sizeof(ScalarType)] <
430 288 src[*p_min_offset / sizeof(ScalarType)]) {
431 49 *p_min_offset = offs;
432 49 }
433 288 }
434 52 }
435
2/2
✓ Branch 0 taken 52 times.
✓ Branch 1 taken 8 times.
60 if (p_max_offset) {
436 52 *p_max_offset = 0;
437
2/2
✓ Branch 0 taken 288 times.
✓ Branch 1 taken 52 times.
340 for (size_t i = 0; i < max_offsets.size(); ++i) {
438 288 size_t offs = max_offsets[i] + i * src_stride;
439
4/4
✓ Branch 0 taken 239 times.
✓ Branch 1 taken 49 times.
✓ Branch 2 taken 239 times.
✓ Branch 3 taken 49 times.
576 if (src[offs / sizeof(ScalarType)] >
440 288 src[*p_max_offset / sizeof(ScalarType)]) {
441 49 *p_max_offset = offs;
442 49 }
443 288 }
444 52 }
445 60 return return_val;
446 60 }
447
448 #define DEFINE_KLEIDICV_THREAD_MIN_MAX_LOC(suffix, type) \
449 kleidicv_error_t kleidicv_thread_min_max_loc_##suffix( \
450 const type *src, size_t src_stride, size_t width, size_t height, \
451 size_t *p_min_offset, size_t *p_max_offset, \
452 kleidicv_thread_multithreading mt) { \
453 return parallel_min_max_loc(kleidicv_min_max_loc_##suffix, src, \
454 src_stride, width, height, p_min_offset, \
455 p_max_offset, mt); \
456 }
457
458 60 DEFINE_KLEIDICV_THREAD_MIN_MAX_LOC(u8, uint8_t);
459
460 template <typename F>
461 112 kleidicv_error_t kleidicv_thread_filter(F filter, size_t width, size_t height,
462 size_t channels, size_t kernel_width,
463 size_t kernel_height,
464 kleidicv_filter_context_t *context,
465 kleidicv_thread_multithreading mt) {
466 448 auto callback = [=](unsigned y_begin, unsigned y_end) {
467 // The context contains a buffer that can only fit a single row, so can't be
468 // shared between threads. Since we don't know how many threads there are,
469 // create and destroy a context every time this callback is called. Only use
470 // the context argument for the first thread.
471 336 bool create_context = 0 != y_begin;
472 336 kleidicv_filter_context_t *thread_context = context;
473
8/8
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 56 times.
✓ Branch 2 taken 28 times.
✓ Branch 3 taken 56 times.
✓ Branch 4 taken 28 times.
✓ Branch 5 taken 56 times.
✓ Branch 6 taken 28 times.
✓ Branch 7 taken 56 times.
336 if (create_context) {
474 448 kleidicv_error_t context_create_result = kleidicv_filter_context_create(
475 224 &thread_context, channels, kernel_width, kernel_height, width,
476 224 height);
477 // Excluded from coverage because it's impractical to test this.
478 // MockMallocToFail can't be used because malloc is used in thread setup.
479 // GCOVR_EXCL_START
480 if (KLEIDICV_OK != context_create_result) {
481 return context_create_result;
482 }
483 // GCOVR_EXCL_STOP
484 224 }
485
486 336 kleidicv_error_t result = filter(y_begin, y_end, thread_context);
487
488
8/8
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 56 times.
✓ Branch 2 taken 28 times.
✓ Branch 3 taken 56 times.
✓ Branch 4 taken 28 times.
✓ Branch 5 taken 56 times.
✓ Branch 6 taken 28 times.
✓ Branch 7 taken 56 times.
336 if (create_context) {
489 448 kleidicv_error_t context_release_result =
490 224 kleidicv_filter_context_release(thread_context);
491
4/8
✗ Branch 0 not taken.
✓ Branch 1 taken 56 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 56 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 56 times.
✗ Branch 6 not taken.
✓ Branch 7 taken 56 times.
224 if (KLEIDICV_OK == result) {
492 224 result = context_release_result;
493 224 }
494 224 }
495 336 return result;
496 336 };
497 224 return parallel_batches(callback, mt, height);
498 112 }
499
500 208 kleidicv_error_t kleidicv_thread_gaussian_blur_u8(
501 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
502 size_t width, size_t height, size_t channels, size_t kernel_width,
503 size_t kernel_height, float sigma_x, float sigma_y,
504 kleidicv_border_type_t border_type, kleidicv_thread_multithreading mt) {
505 208 auto fixed_border_type = kleidicv::get_fixed_border_type(border_type);
506
4/4
✓ Branch 0 taken 204 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 172 times.
✓ Branch 3 taken 32 times.
208 if (!fixed_border_type ||
507 408 !kleidicv::gaussian_blur_is_implemented(width, height, kernel_width,
508 204 kernel_height, sigma_x, sigma_y,
509 204 channels, *fixed_border_type)) {
510 176 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
511 }
512
513
4/6
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 28 times.
✓ Branch 2 taken 4 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 4 times.
32 if (kernel_width <= 7 || kernel_width == 15 || kernel_width == 21) {
514 112 auto callback = [=](size_t y_begin, size_t y_end) {
515 168 return kleidicv_gaussian_blur_fixed_stripe_u8(
516 84 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
517 84 channels, kernel_width, kernel_height, sigma_x, sigma_y,
518 84 *fixed_border_type);
519 };
520 28 return parallel_batches(callback, mt, height);
521 28 }
522 24 auto callback = [=](size_t y_begin, size_t y_end) {
523 40 return kleidicv_gaussian_blur_arbitrary_stripe_u8(
524 20 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
525 20 channels, kernel_width, kernel_height, sigma_x, sigma_y,
526 20 *fixed_border_type);
527 };
528 4 return parallel_batches(callback, mt, height);
529 208 }
530
531 108 kleidicv_error_t kleidicv_thread_separable_filter_2d_u8(
532 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
533 size_t width, size_t height, size_t channels, const uint8_t *kernel_x,
534 size_t kernel_width, const uint8_t *kernel_y, size_t kernel_height,
535 kleidicv_border_type_t border_type, kleidicv_filter_context_t *context,
536 kleidicv_thread_multithreading mt) {
537
4/4
✓ Branch 0 taken 76 times.
✓ Branch 1 taken 32 times.
✓ Branch 2 taken 76 times.
✓ Branch 3 taken 32 times.
216 if (!kleidicv::separable_filter_2d_is_implemented(width, height, kernel_width,
538 108 kernel_height)) {
539 76 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
540 }
541
542 32 auto fixed_border_type = kleidicv::get_fixed_border_type(border_type);
543
2/2
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 4 times.
32 if (!fixed_border_type) {
544 4 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
545 }
546
547 112 auto callback = [=](size_t y_begin, size_t y_end,
548 kleidicv_filter_context_t *thread_context) {
549 168 return kleidicv_separable_filter_2d_stripe_u8(
550 84 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
551 84 channels, kernel_x, kernel_width, kernel_y, kernel_height,
552 84 *fixed_border_type, thread_context);
553 };
554 56 return kleidicv_thread_filter(callback, width, height, channels, kernel_width,
555 28 kernel_height, context, mt);
556 108 }
557
558 108 kleidicv_error_t kleidicv_thread_separable_filter_2d_u16(
559 const uint16_t *src, size_t src_stride, uint16_t *dst, size_t dst_stride,
560 size_t width, size_t height, size_t channels, const uint16_t *kernel_x,
561 size_t kernel_width, const uint16_t *kernel_y, size_t kernel_height,
562 kleidicv_border_type_t border_type, kleidicv_filter_context_t *context,
563 kleidicv_thread_multithreading mt) {
564
4/4
✓ Branch 0 taken 76 times.
✓ Branch 1 taken 32 times.
✓ Branch 2 taken 76 times.
✓ Branch 3 taken 32 times.
216 if (!kleidicv::separable_filter_2d_is_implemented(width, height, kernel_width,
565 108 kernel_height)) {
566 76 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
567 }
568
569 32 auto fixed_border_type = kleidicv::get_fixed_border_type(border_type);
570
2/2
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 4 times.
32 if (!fixed_border_type) {
571 4 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
572 }
573
574 112 auto callback = [=](size_t y_begin, size_t y_end,
575 kleidicv_filter_context_t *thread_context) {
576 168 return kleidicv_separable_filter_2d_stripe_u16(
577 84 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
578 84 channels, kernel_x, kernel_width, kernel_y, kernel_height,
579 84 *fixed_border_type, thread_context);
580 };
581 56 return kleidicv_thread_filter(callback, width, height, channels, kernel_width,
582 28 kernel_height, context, mt);
583 108 }
584
585 108 kleidicv_error_t kleidicv_thread_separable_filter_2d_s16(
586 const int16_t *src, size_t src_stride, int16_t *dst, size_t dst_stride,
587 size_t width, size_t height, size_t channels, const int16_t *kernel_x,
588 size_t kernel_width, const int16_t *kernel_y, size_t kernel_height,
589 kleidicv_border_type_t border_type, kleidicv_filter_context_t *context,
590 kleidicv_thread_multithreading mt) {
591
4/4
✓ Branch 0 taken 76 times.
✓ Branch 1 taken 32 times.
✓ Branch 2 taken 76 times.
✓ Branch 3 taken 32 times.
216 if (!kleidicv::separable_filter_2d_is_implemented(width, height, kernel_width,
592 108 kernel_height)) {
593 76 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
594 }
595
596 32 auto fixed_border_type = kleidicv::get_fixed_border_type(border_type);
597
2/2
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 4 times.
32 if (!fixed_border_type) {
598 4 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
599 }
600
601 112 auto callback = [=](size_t y_begin, size_t y_end,
602 kleidicv_filter_context_t *thread_context) {
603 168 return kleidicv_separable_filter_2d_stripe_s16(
604 84 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
605 84 channels, kernel_x, kernel_width, kernel_y, kernel_height,
606 84 *fixed_border_type, thread_context);
607 };
608 56 return kleidicv_thread_filter(callback, width, height, channels, kernel_width,
609 28 kernel_height, context, mt);
610 108 }
611
612 108 kleidicv_error_t kleidicv_thread_blur_and_downsample_u8(
613 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
614 uint8_t *dst, size_t dst_stride, size_t channels,
615 kleidicv_border_type_t border_type, kleidicv_filter_context_t *context,
616 kleidicv_thread_multithreading mt) {
617
4/4
✓ Branch 0 taken 76 times.
✓ Branch 1 taken 32 times.
✓ Branch 2 taken 76 times.
✓ Branch 3 taken 32 times.
216 if (!kleidicv::blur_and_downsample_is_implemented(src_width, src_height,
618 108 channels)) {
619 76 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
620 }
621
622 32 auto fixed_border_type = kleidicv::get_fixed_border_type(border_type);
623
2/2
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 4 times.
32 if (!fixed_border_type) {
624 4 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
625 }
626
627 112 auto callback = [=](unsigned y_begin, unsigned y_end,
628 kleidicv_filter_context_t *thread_context) {
629 168 return kleidicv_blur_and_downsample_stripe_u8(
630 84 src, src_stride, src_width, src_height, dst, dst_stride, y_begin, y_end,
631 84 channels, *fixed_border_type, thread_context);
632 };
633 56 return kleidicv_thread_filter(callback, src_width, src_height, channels, 5, 5,
634 28 context, mt);
635 108 }
636
637 204 kleidicv_error_t kleidicv_thread_sobel_3x3_horizontal_s16_u8(
638 const uint8_t *src, size_t src_stride, int16_t *dst, size_t dst_stride,
639 size_t width, size_t height, size_t channels,
640 kleidicv_thread_multithreading mt) {
641
2/2
✓ Branch 0 taken 92 times.
✓ Branch 1 taken 112 times.
204 if (!kleidicv::sobel_is_implemented(width, height, 3)) {
642 92 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
643 }
644
645 376 auto callback = [=](unsigned y_begin, unsigned y_end) {
646 528 return kleidicv_sobel_3x3_horizontal_stripe_s16_u8(
647 264 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
648 264 channels);
649 };
650 112 return parallel_batches(callback, mt, height);
651 204 }
652
653 532 kleidicv_error_t kleidicv_thread_median_blur_u8(
654 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
655 size_t width, size_t height, size_t channels, size_t kernel_width,
656 size_t kernel_height, kleidicv_border_type_t border_type,
657 kleidicv_thread_multithreading mt) {
658 1064 auto result_pair = kleidicv::median_blur_is_implemented(
659 532 src, src_stride, dst, dst_stride, width, height, channels, kernel_width,
660 532 kernel_height, border_type);
661
662 532 auto checks_result = result_pair.first;
663 532 auto fixed_border_type = result_pair.second;
664
2/2
✓ Branch 0 taken 416 times.
✓ Branch 1 taken 116 times.
532 if (checks_result != KLEIDICV_OK) {
665 416 return checks_result;
666 }
667
668
2/2
✓ Branch 0 taken 100 times.
✓ Branch 1 taken 16 times.
116 if (kernel_width <= 7) {
669 380 auto callback = [=](unsigned y_begin, unsigned y_end) {
670 560 return kleidicv_median_blur_sorting_network_stripe_u8(
671 280 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
672 280 channels, kernel_width, kernel_height, fixed_border_type);
673 };
674 100 return parallel_batches(callback, mt, height);
675 100 }
676
677
3/4
✓ Branch 0 taken 16 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 12 times.
16 if (kernel_width > 7 && kernel_width <= 15) {
678 72 auto callback = [=](unsigned y_begin, unsigned y_end) {
679 120 return kleidicv_median_blur_small_hist_stripe_u8(
680 60 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
681 60 channels, kernel_width, kernel_height, fixed_border_type);
682 };
683 12 return parallel_batches(callback, mt, height);
684 12 }
685
686 24 auto callback = [=](unsigned y_begin, unsigned y_end) {
687 40 return kleidicv_median_blur_large_hist_stripe_u8(
688 20 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
689 20 channels, kernel_width, kernel_height, fixed_border_type);
690 };
691 4 return parallel_batches(callback, mt, height);
692 532 }
693
694 532 kleidicv_error_t kleidicv_thread_median_blur_s16(
695 const int16_t *src, size_t src_stride, int16_t *dst, size_t dst_stride,
696 size_t width, size_t height, size_t channels, size_t kernel_width,
697 size_t kernel_height, kleidicv_border_type_t border_type,
698 kleidicv_thread_multithreading mt) {
699 1064 auto result_pair = kleidicv::median_blur_is_implemented(
700 532 src, src_stride, dst, dst_stride, width, height, channels, kernel_width,
701 532 kernel_height, border_type);
702
703 532 auto checks_result = result_pair.first;
704 532 auto fixed_border_type = result_pair.second;
705
2/2
✓ Branch 0 taken 432 times.
✓ Branch 1 taken 100 times.
532 if (checks_result != KLEIDICV_OK) {
706 432 return checks_result;
707 }
708
709 380 auto callback = [=](unsigned y_begin, unsigned y_end) {
710 560 return kleidicv_median_blur_sorting_network_stripe_s16(
711 280 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
712 280 channels, kernel_width, kernel_height, fixed_border_type);
713 };
714 100 return parallel_batches(callback, mt, height);
715 532 }
716
717 532 kleidicv_error_t kleidicv_thread_median_blur_u16(
718 const uint16_t *src, size_t src_stride, uint16_t *dst, size_t dst_stride,
719 size_t width, size_t height, size_t channels, size_t kernel_width,
720 size_t kernel_height, kleidicv_border_type_t border_type,
721 kleidicv_thread_multithreading mt) {
722 1064 auto result_pair = kleidicv::median_blur_is_implemented(
723 532 src, src_stride, dst, dst_stride, width, height, channels, kernel_width,
724 532 kernel_height, border_type);
725
726 532 auto checks_result = result_pair.first;
727 532 auto fixed_border_type = result_pair.second;
728
2/2
✓ Branch 0 taken 432 times.
✓ Branch 1 taken 100 times.
532 if (checks_result != KLEIDICV_OK) {
729 432 return checks_result;
730 }
731
732 380 auto callback = [=](unsigned y_begin, unsigned y_end) {
733 560 return kleidicv_median_blur_sorting_network_stripe_u16(
734 280 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
735 280 channels, kernel_width, kernel_height, fixed_border_type);
736 };
737 100 return parallel_batches(callback, mt, height);
738 532 }
739
740 532 kleidicv_error_t kleidicv_thread_median_blur_f32(
741 const float *src, size_t src_stride, float *dst, size_t dst_stride,
742 size_t width, size_t height, size_t channels, size_t kernel_width,
743 size_t kernel_height, kleidicv_border_type_t border_type,
744 kleidicv_thread_multithreading mt) {
745 1064 auto result_pair = kleidicv::median_blur_is_implemented(
746 532 src, src_stride, dst, dst_stride, width, height, channels, kernel_width,
747 532 kernel_height, border_type);
748
749 532 auto checks_result = result_pair.first;
750 532 auto fixed_border_type = result_pair.second;
751
2/2
✓ Branch 0 taken 432 times.
✓ Branch 1 taken 100 times.
532 if (checks_result != KLEIDICV_OK) {
752 432 return checks_result;
753 }
754
755 380 auto callback = [=](unsigned y_begin, unsigned y_end) {
756 560 return kleidicv_median_blur_sorting_network_stripe_f32(
757 280 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
758 280 channels, kernel_width, kernel_height, fixed_border_type);
759 };
760 100 return parallel_batches(callback, mt, height);
761 532 }
762
763 204 kleidicv_error_t kleidicv_thread_sobel_3x3_vertical_s16_u8(
764 const uint8_t *src, size_t src_stride, int16_t *dst, size_t dst_stride,
765 size_t width, size_t height, size_t channels,
766 kleidicv_thread_multithreading mt) {
767
2/2
✓ Branch 0 taken 92 times.
✓ Branch 1 taken 112 times.
204 if (!kleidicv::sobel_is_implemented(width, height, 3)) {
768 92 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
769 }
770
771 376 auto callback = [=](unsigned y_begin, unsigned y_end) {
772 528 return kleidicv_sobel_3x3_vertical_stripe_s16_u8(src, src_stride, dst,
773 264 dst_stride, width, height,
774 264 y_begin, y_end, channels);
775 };
776 112 return parallel_batches(callback, mt, height);
777 204 }
778
779 104 kleidicv_error_t kleidicv_thread_scharr_interleaved_s16_u8(
780 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
781 size_t src_channels, int16_t *dst, size_t dst_stride,
782 kleidicv_thread_multithreading mt) {
783
4/4
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 100 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 100 times.
208 if (!kleidicv::scharr_interleaved_is_implemented(src_width, src_height,
784 104 src_channels)) {
785 4 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
786 }
787
788 316 auto callback = [=](unsigned y_begin, unsigned y_end) {
789 432 return kleidicv_scharr_interleaved_stripe_s16_u8(
790 216 src, src_stride, src_width, src_height, src_channels, dst, dst_stride,
791 216 y_begin, y_end);
792 };
793
794 // height is decremented by 2 as the result has less rows.
795 100 return parallel_batches(callback, mt, src_height - 2);
796 104 }
797
798 120 kleidicv_error_t kleidicv_thread_resize_to_quarter_u8(
799 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
800 uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
801 kleidicv_thread_multithreading mt) {
802 350 auto callback = [=](unsigned begin, unsigned end) {
803 230 size_t src_begin = size_t{begin} * 2;
804 230 size_t src_end = std::min<size_t>(src_height, size_t{end} * 2);
805 230 size_t dst_begin = begin;
806 230 size_t dst_end = std::min<size_t>(dst_height, end);
807
808 // half of odd height is rounded towards zero?
809
2/2
✓ Branch 0 taken 20 times.
✓ Branch 1 taken 210 times.
230 if (dst_begin == dst_end) {
810 20 return KLEIDICV_OK;
811 }
812
813 420 return kleidicv_resize_to_quarter_u8(
814 210 src + src_begin * src_stride, src_stride, src_width,
815 210 src_end - src_begin, dst + dst_begin * dst_stride, dst_stride,
816 210 dst_width, dst_end - dst_begin);
817 230 };
818 240 return parallel_batches(callback, mt, (src_height + 1) / 2);
819 120 }
820
821 125 kleidicv_error_t kleidicv_thread_resize_linear_u8(
822 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
823 uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
824 kleidicv_thread_multithreading mt) {
825
4/4
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 120 times.
✓ Branch 2 taken 5 times.
✓ Branch 3 taken 120 times.
250 if (!kleidicv::resize_linear_u8_is_implemented(src_width, src_height,
826 125 dst_width, dst_height)) {
827 5 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
828 }
829 350 auto callback = [=](unsigned y_begin, unsigned y_end) {
830 460 return kleidicv_resize_linear_stripe_u8(
831 230 src, src_stride, src_width, src_height, y_begin,
832 230 std::min<size_t>(src_height, y_end + 1), dst, dst_stride, dst_width,
833 230 dst_height);
834 };
835 120 return parallel_batches(callback, mt, std::max<size_t>(1, src_height - 1));
836 125 }
837
838 185 kleidicv_error_t kleidicv_thread_resize_linear_f32(
839 const float *src, size_t src_stride, size_t src_width, size_t src_height,
840 float *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
841 kleidicv_thread_multithreading mt) {
842
4/4
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 180 times.
✓ Branch 2 taken 5 times.
✓ Branch 3 taken 180 times.
370 if (!kleidicv::resize_linear_f32_is_implemented(src_width, src_height,
843 185 dst_width, dst_height)) {
844 5 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
845 }
846 525 auto callback = [=](unsigned y_begin, unsigned y_end) {
847 690 return kleidicv_resize_linear_stripe_f32(
848 345 src, src_stride, src_width, src_height, y_begin,
849 345 std::min<size_t>(src_height, y_end + 1), dst, dst_stride, dst_width,
850 345 dst_height);
851 };
852 180 return parallel_batches(callback, mt, std::max<size_t>(1, src_height - 1));
853 185 }
854
855 208 kleidicv_error_t kleidicv_thread_remap_s16_u8(
856 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
857 uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
858 size_t channels, const int16_t *mapxy, size_t mapxy_stride,
859 kleidicv_border_type_t border_type, const uint8_t *border_value,
860 kleidicv_thread_multithreading mt) {
861
4/4
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 200 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 200 times.
416 if (!kleidicv::remap_s16_is_implemented<uint8_t>(src_stride, src_width,
862 208 src_height, dst_width,
863 208 border_type, channels)) {
864 8 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
865 }
866 632 auto callback = [=](unsigned begin, unsigned end) {
867 864 return kleidicv_remap_s16_u8(
868 432 src, src_stride, src_width, src_height,
869 432 dst + begin * dst_stride / sizeof(uint8_t), dst_stride, dst_width,
870 432 end - begin, channels,
871 432 mapxy + static_cast<ptrdiff_t>(begin * mapxy_stride / sizeof(int16_t)),
872 432 mapxy_stride, border_type, border_value);
873 };
874 200 return parallel_batches(callback, mt, dst_height);
875 208 }
876
877 208 kleidicv_error_t kleidicv_thread_remap_s16_u16(
878 const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height,
879 uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
880 size_t channels, const int16_t *mapxy, size_t mapxy_stride,
881 kleidicv_border_type_t border_type, const uint16_t *border_value,
882 kleidicv_thread_multithreading mt) {
883
4/4
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 200 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 200 times.
416 if (!kleidicv::remap_s16_is_implemented<uint16_t>(src_stride, src_width,
884 208 src_height, dst_width,
885 208 border_type, channels)) {
886 8 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
887 }
888 632 auto callback = [=](unsigned begin, unsigned end) {
889 864 return kleidicv_remap_s16_u16(
890 432 src, src_stride, src_width, src_height,
891 432 dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint16_t)),
892 432 dst_stride, dst_width, end - begin, channels,
893 432 mapxy + static_cast<ptrdiff_t>(begin * mapxy_stride / sizeof(int16_t)),
894 432 mapxy_stride, border_type, border_value);
895 };
896 200 return parallel_batches(callback, mt, dst_height);
897 208 }
898
899 408 kleidicv_error_t kleidicv_thread_remap_s16point5_u8(
900 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
901 uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
902 size_t channels, const int16_t *mapxy, size_t mapxy_stride,
903 const uint16_t *mapfrac, size_t mapfrac_stride,
904 kleidicv_border_type_t border_type, const uint8_t *border_value,
905 kleidicv_thread_multithreading mt) {
906
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 400 times.
408 if (!kleidicv::remap_s16point5_is_implemented<uint8_t>(
907 408 src_stride, src_width, src_height, dst_width, border_type,
908 408 channels)) {
909 8 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
910 }
911 1264 auto callback = [=](unsigned begin, unsigned end) {
912 1728 return kleidicv_remap_s16point5_u8(
913 864 src, src_stride, src_width, src_height,
914 864 dst + begin * dst_stride / sizeof(uint8_t), dst_stride, dst_width,
915 864 end - begin, channels,
916 864 mapxy + static_cast<ptrdiff_t>(begin * mapxy_stride / sizeof(int16_t)),
917 864 mapxy_stride,
918 1728 mapfrac +
919 864 static_cast<ptrdiff_t>(begin * mapfrac_stride / sizeof(uint16_t)),
920 864 mapfrac_stride, border_type, border_value);
921 };
922 400 return parallel_batches(callback, mt, dst_height);
923 408 }
924
925 408 kleidicv_error_t kleidicv_thread_remap_s16point5_u16(
926 const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height,
927 uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
928 size_t channels, const int16_t *mapxy, size_t mapxy_stride,
929 const uint16_t *mapfrac, size_t mapfrac_stride,
930 kleidicv_border_type_t border_type, const uint16_t *border_value,
931 kleidicv_thread_multithreading mt) {
932
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 400 times.
408 if (!kleidicv::remap_s16point5_is_implemented<uint16_t>(
933 408 src_stride, src_width, src_height, dst_width, border_type,
934 408 channels)) {
935 8 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
936 }
937 1264 auto callback = [=](unsigned begin, unsigned end) {
938 1728 return kleidicv_remap_s16point5_u16(
939 864 src, src_stride, src_width, src_height,
940 864 dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint16_t)),
941 864 dst_stride, dst_width, end - begin, channels,
942 864 mapxy + static_cast<ptrdiff_t>(begin * mapxy_stride / sizeof(int16_t)),
943 864 mapxy_stride,
944 1728 mapfrac +
945 864 static_cast<ptrdiff_t>(begin * mapfrac_stride / sizeof(uint16_t)),
946 864 mapfrac_stride, border_type, border_value);
947 };
948 400 return parallel_batches(callback, mt, dst_height);
949 408 }
950
951 408 kleidicv_error_t kleidicv_thread_remap_f32_u8(
952 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
953 uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
954 size_t channels, const float *mapx, size_t mapx_stride, const float *mapy,
955 size_t mapy_stride, kleidicv_interpolation_type_t interpolation,
956 kleidicv_border_type_t border_type, const uint8_t *border_value,
957 kleidicv_thread_multithreading mt) {
958
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 400 times.
408 if (!kleidicv::remap_f32_is_implemented<uint8_t>(
959 408 src_stride, src_width, src_height, dst_width, dst_height, border_type,
960 408 channels, interpolation)) {
961 8 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
962 }
963 1264 auto callback = [=](unsigned begin, unsigned end) {
964 1728 return kleidicv_remap_f32_u8(
965 864 src, src_stride, src_width, src_height,
966 864 dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint8_t)),
967 864 dst_stride, dst_width, end - begin, channels,
968 864 mapx + static_cast<ptrdiff_t>(begin * mapx_stride / sizeof(float)),
969 864 mapx_stride,
970 864 mapy + static_cast<ptrdiff_t>(begin * mapy_stride / sizeof(float)),
971 864 mapy_stride, interpolation, border_type, border_value);
972 };
973 400 return parallel_batches(callback, mt, dst_height);
974 408 }
975
976 408 kleidicv_error_t kleidicv_thread_remap_f32_u16(
977 const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height,
978 uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
979 size_t channels, const float *mapx, size_t mapx_stride, const float *mapy,
980 size_t mapy_stride, kleidicv_interpolation_type_t interpolation,
981 kleidicv_border_type_t border_type, const uint16_t *border_value,
982 kleidicv_thread_multithreading mt) {
983
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 400 times.
408 if (!kleidicv::remap_f32_is_implemented<uint16_t>(
984 408 src_stride, src_width, src_height, dst_width, dst_height, border_type,
985 408 channels, interpolation)) {
986 8 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
987 }
988 1264 auto callback = [=](unsigned begin, unsigned end) {
989 1728 return kleidicv_remap_f32_u16(
990 864 src, src_stride, src_width, src_height,
991 864 dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint16_t)),
992 864 dst_stride, dst_width, end - begin, channels,
993 864 mapx + static_cast<ptrdiff_t>(begin * mapx_stride / sizeof(float)),
994 864 mapx_stride,
995 864 mapy + static_cast<ptrdiff_t>(begin * mapy_stride / sizeof(float)),
996 864 mapy_stride, interpolation, border_type, border_value);
997 };
998 400 return parallel_batches(callback, mt, dst_height);
999 408 }
1000
1001 216 kleidicv_error_t kleidicv_thread_warp_perspective_u8(
1002 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
1003 uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
1004 const float transformation[9], size_t channels,
1005 kleidicv_interpolation_type_t interpolation,
1006 kleidicv_border_type_t border_type, const uint8_t *border_value,
1007 kleidicv_thread_multithreading mt) {
1008
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 200 times.
216 if (!kleidicv::warp_perspective_is_implemented<uint8_t>(
1009 216 dst_width, channels, interpolation, border_type)) {
1010 16 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
1011 }
1012
1013 632 auto callback = [=](unsigned y_begin, unsigned y_end) {
1014 864 return kleidicv_warp_perspective_stripe_u8(
1015 432 src, src_stride, src_width, src_height, dst, dst_stride, dst_width,
1016 432 dst_height, y_begin, std::min<size_t>(dst_height, y_end + 1),
1017 432 transformation, channels, interpolation, border_type, border_value);
1018 };
1019 200 return parallel_batches(callback, mt, dst_height);
1020 216 }
1021