KleidiCV Coverage Report


Directory: ./
File: kleidicv_thread/src/kleidicv_thread.cpp
Date: 2025-11-25 17:23:32
Exec Total Coverage
Lines: 638 638 100.0%
Functions: 364 364 100.0%
Branches: 384 396 97.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include "kleidicv_thread/kleidicv_thread.h"
6
7 #include <algorithm>
8 #include <cstddef>
9 #include <cstdint>
10 #include <functional>
11 #include <limits>
12 #include <vector>
13
14 #include "kleidicv/arithmetics/rotate.h"
15 #include "kleidicv/arithmetics/scale.h"
16 #include "kleidicv/conversions/rgb_to_yuv_420.h"
17 #include "kleidicv/conversions/yuv_420_to_rgb.h"
18 #include "kleidicv/ctypes.h"
19 #include "kleidicv/filters/blur_and_downsample.h"
20 #include "kleidicv/filters/gaussian_blur.h"
21 #include "kleidicv/filters/median_blur.h"
22 #include "kleidicv/filters/scharr.h"
23 #include "kleidicv/filters/separable_filter_2d.h"
24 #include "kleidicv/filters/sobel.h"
25 #include "kleidicv/kleidicv.h"
26 #include "kleidicv/resize/resize_linear.h"
27 #include "kleidicv/transform/remap.h"
28 #include "kleidicv/transform/warp_perspective.h"
29
30 typedef std::function<kleidicv_error_t(unsigned, unsigned)> FunctionCallback;
31
32 16133 static kleidicv_error_t kleidicv_thread_std_function_callback(
33 unsigned task_begin, unsigned task_end, void *data) {
34 16133 auto *callback = reinterpret_cast<FunctionCallback *>(data);
35 32266 return (*callback)(task_begin, task_end);
36 16133 }
37
38 // Operations in the Neon backend have both a vector path and a scalar path.
39 // The vector path is used to process most data and the scalar path is used to
40 // process the parts of the data that don't fit into the vector width.
41 // For floating point operations in particular, the results may be very slightly
42 // different between vector and scalar paths.
43 // When using multithreading, images are divided into parts to be processed by
44 // each thread, and this could change which parts of the data end up being
45 // processed by the vector and scalar paths. Since the threading may be
46 // non-deterministic in how it divides up the image, this non-determinism could
47 // leak through in the values of the output. This could cause subtle bugs.
48 //
49 // To avoid this problem, this function passes data to each thread in batches
50 // that are a multiple of the Neon vector width in size (16 bytes). The
51 // exception is the last batch, which may be longer in order to extend to the
52 // end of the data. No batch can be shorter than vector length as this could
53 // cause different behaviour for operations that try to avoid the tail loop (see
54 // the TryToAvoidTailLoop class) - this technique only works if the data is
55 // longer than vector length.
56 //
57 // Typically with how this function is used, batches will be 16 image rows or
58 // row pairs, which is likely to be far coarser alignment than is needed.
59 // However it's unlikely that threading on a finer-grained level would provide a
60 // performance benefit.
61 template <typename Callback>
62 11904 inline kleidicv_error_t parallel_batches(Callback callback,
63 kleidicv_thread_multithreading mt,
64 unsigned count,
65 unsigned min_batch_size = 16) {
66 11904 const unsigned task_count = std::max(1U, (count) / min_batch_size);
67 28037 FunctionCallback f = [=](unsigned task_begin, unsigned task_end) {
68 16133 unsigned begin = task_begin * min_batch_size,
69 16133 end = task_end * min_batch_size;
70
140/142
✓ Branch 0 taken 792 times.
✓ Branch 1 taken 1800 times.
✓ Branch 2 taken 48 times.
✓ Branch 3 taken 120 times.
✓ Branch 4 taken 44 times.
✓ Branch 5 taken 100 times.
✓ Branch 6 taken 44 times.
✓ Branch 7 taken 100 times.
✓ Branch 8 taken 44 times.
✓ Branch 9 taken 100 times.
✓ Branch 10 taken 44 times.
✓ Branch 11 taken 100 times.
✓ Branch 12 taken 88 times.
✓ Branch 13 taken 200 times.
✓ Branch 14 taken 44 times.
✓ Branch 15 taken 100 times.
✓ Branch 16 taken 44 times.
✓ Branch 17 taken 112 times.
✓ Branch 18 taken 48 times.
✓ Branch 19 taken 120 times.
✓ Branch 20 taken 44 times.
✓ Branch 21 taken 100 times.
✓ Branch 22 taken 132 times.
✓ Branch 23 taken 300 times.
✓ Branch 24 taken 264 times.
✓ Branch 25 taken 600 times.
✓ Branch 26 taken 132 times.
✓ Branch 27 taken 300 times.
✓ Branch 28 taken 132 times.
✓ Branch 29 taken 300 times.
✓ Branch 30 taken 132 times.
✓ Branch 31 taken 300 times.
✓ Branch 32 taken 88 times.
✓ Branch 33 taken 200 times.
✓ Branch 34 taken 88 times.
✓ Branch 35 taken 200 times.
✓ Branch 36 taken 88 times.
✓ Branch 37 taken 200 times.
✓ Branch 38 taken 44 times.
✓ Branch 39 taken 100 times.
✓ Branch 40 taken 44 times.
✓ Branch 41 taken 100 times.
✓ Branch 42 taken 44 times.
✓ Branch 43 taken 100 times.
✓ Branch 44 taken 44 times.
✓ Branch 45 taken 100 times.
✓ Branch 46 taken 44 times.
✓ Branch 47 taken 100 times.
✓ Branch 48 taken 44 times.
✓ Branch 49 taken 100 times.
✓ Branch 50 taken 96 times.
✓ Branch 51 taken 160 times.
✓ Branch 52 taken 5 times.
✓ Branch 53 taken 135 times.
✓ Branch 54 taken 5 times.
✓ Branch 55 taken 135 times.
✓ Branch 56 taken 5 times.
✓ Branch 57 taken 135 times.
✓ Branch 58 taken 5 times.
✓ Branch 59 taken 135 times.
✓ Branch 60 taken 5 times.
✓ Branch 61 taken 70 times.
✓ Branch 62 taken 5 times.
✓ Branch 63 taken 70 times.
✓ Branch 64 taken 5 times.
✓ Branch 65 taken 70 times.
✓ Branch 66 taken 5 times.
✓ Branch 67 taken 70 times.
✓ Branch 68 taken 5 times.
✓ Branch 69 taken 70 times.
✓ Branch 70 taken 5 times.
✓ Branch 71 taken 70 times.
✓ Branch 72 taken 5 times.
✓ Branch 73 taken 70 times.
✓ Branch 74 taken 5 times.
✓ Branch 75 taken 70 times.
✗ Branch 76 not taken.
✓ Branch 77 taken 760 times.
✓ Branch 78 taken 4 times.
✓ Branch 79 taken 60 times.
✓ Branch 80 taken 4 times.
✓ Branch 81 taken 60 times.
✓ Branch 82 taken 4 times.
✓ Branch 83 taken 60 times.
✓ Branch 84 taken 4 times.
✓ Branch 85 taken 60 times.
✓ Branch 86 taken 4 times.
✓ Branch 87 taken 60 times.
✓ Branch 88 taken 4 times.
✓ Branch 89 taken 68 times.
✓ Branch 90 taken 4 times.
✓ Branch 91 taken 60 times.
✓ Branch 92 taken 16 times.
✓ Branch 93 taken 28 times.
✓ Branch 94 taken 4 times.
✓ Branch 95 taken 4 times.
✓ Branch 96 taken 16 times.
✓ Branch 97 taken 28 times.
✓ Branch 98 taken 16 times.
✓ Branch 99 taken 28 times.
✓ Branch 100 taken 16 times.
✓ Branch 101 taken 28 times.
✓ Branch 102 taken 16 times.
✓ Branch 103 taken 28 times.
✓ Branch 104 taken 48 times.
✓ Branch 105 taken 112 times.
✓ Branch 106 taken 52 times.
✓ Branch 107 taken 100 times.
✓ Branch 108 taken 12 times.
✓ Branch 109 taken 12 times.
✓ Branch 110 taken 4 times.
✓ Branch 111 taken 4 times.
✓ Branch 112 taken 52 times.
✓ Branch 113 taken 100 times.
✓ Branch 114 taken 52 times.
✓ Branch 115 taken 100 times.
✓ Branch 116 taken 52 times.
✓ Branch 117 taken 100 times.
✓ Branch 118 taken 48 times.
✓ Branch 119 taken 112 times.
✓ Branch 120 taken 44 times.
✓ Branch 121 taken 100 times.
✗ Branch 122 not taken.
✓ Branch 123 taken 120 times.
✓ Branch 124 taken 10 times.
✓ Branch 125 taken 120 times.
✓ Branch 126 taken 15 times.
✓ Branch 127 taken 180 times.
✓ Branch 128 taken 88 times.
✓ Branch 129 taken 200 times.
✓ Branch 130 taken 88 times.
✓ Branch 131 taken 200 times.
✓ Branch 132 taken 176 times.
✓ Branch 133 taken 400 times.
✓ Branch 134 taken 176 times.
✓ Branch 135 taken 400 times.
✓ Branch 136 taken 176 times.
✓ Branch 137 taken 400 times.
✓ Branch 138 taken 176 times.
✓ Branch 139 taken 400 times.
✓ Branch 140 taken 88 times.
✓ Branch 141 taken 200 times.
16133 if (task_end == task_count) {
71 11904 end = count;
72 11904 }
73 32266 return callback(begin, end);
74 16133 };
75 23808 return mt.parallel(kleidicv_thread_std_function_callback, &f,
76 11904 mt.parallel_data, task_count);
77 11904 }
78
79 template <typename SrcT, typename DstT, typename F, typename... Args>
80 2952 inline kleidicv_error_t kleidicv_thread_unary_op_impl(
81 F f, kleidicv_thread_multithreading mt, const SrcT *src, size_t src_stride,
82 DstT *dst, size_t dst_stride, size_t width, size_t height, Args... args) {
83 7188 auto callback = [=](unsigned begin, unsigned end) {
84 8472 return f(src + static_cast<ptrdiff_t>(begin * src_stride / sizeof(SrcT)),
85 4236 src_stride,
86 4236 dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(DstT)),
87 4236 dst_stride, width, end - begin, args...);
88 };
89 5904 return parallel_batches(callback, mt, height);
90 2952 }
91
92 template <typename SrcT, typename DstT, typename F, typename... Args>
93 3000 inline kleidicv_error_t kleidicv_thread_binary_op_impl(
94 F f, kleidicv_thread_multithreading mt, const SrcT *src_a,
95 size_t src_a_stride, const SrcT *src_b, size_t src_b_stride, DstT *dst,
96 size_t dst_stride, size_t width, size_t height, Args... args) {
97 7320 auto callback = [=](unsigned begin, unsigned end) {
98 8640 return f(
99 4320 src_a + static_cast<ptrdiff_t>(begin * src_a_stride / sizeof(SrcT)),
100 4320 src_a_stride,
101 4320 src_b + static_cast<ptrdiff_t>(begin * src_b_stride / sizeof(SrcT)),
102 4320 src_b_stride,
103 4320 dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(DstT)),
104 4320 dst_stride, width, end - begin, args...);
105 };
106 6000 return parallel_batches(callback, mt, height);
107 3000 }
108
109 #define KLEIDICV_THREAD_UNARY_OP_IMPL(suffix, src_type, dst_type) \
110 kleidicv_error_t kleidicv_thread_##suffix( \
111 const src_type *src, size_t src_stride, dst_type *dst, \
112 size_t dst_stride, size_t width, size_t height, \
113 kleidicv_thread_multithreading mt) { \
114 return kleidicv_thread_unary_op_impl(kleidicv_##suffix, mt, src, \
115 src_stride, dst, dst_stride, width, \
116 height); \
117 }
118
119 100 KLEIDICV_THREAD_UNARY_OP_IMPL(gray_to_rgb_u8, uint8_t, uint8_t);
120 100 KLEIDICV_THREAD_UNARY_OP_IMPL(gray_to_rgba_u8, uint8_t, uint8_t);
121 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_bgr_u8, uint8_t, uint8_t);
122 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_rgb_u8, uint8_t, uint8_t);
123 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_bgra_u8, uint8_t, uint8_t);
124 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_rgba_u8, uint8_t, uint8_t);
125 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_bgra_u8, uint8_t, uint8_t);
126 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_rgba_u8, uint8_t, uint8_t);
127 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_bgr_u8, uint8_t, uint8_t);
128 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_rgb_u8, uint8_t, uint8_t);
129 100 KLEIDICV_THREAD_UNARY_OP_IMPL(yuv_to_bgr_u8, uint8_t, uint8_t);
130 100 KLEIDICV_THREAD_UNARY_OP_IMPL(yuv_to_bgra_u8, uint8_t, uint8_t);
131 100 KLEIDICV_THREAD_UNARY_OP_IMPL(yuv_to_rgb_u8, uint8_t, uint8_t);
132 100 KLEIDICV_THREAD_UNARY_OP_IMPL(yuv_to_rgba_u8, uint8_t, uint8_t);
133 100 KLEIDICV_THREAD_UNARY_OP_IMPL(bgr_to_yuv_u8, uint8_t, uint8_t);
134 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_yuv_u8, uint8_t, uint8_t);
135 100 KLEIDICV_THREAD_UNARY_OP_IMPL(bgra_to_yuv_u8, uint8_t, uint8_t);
136 100 KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_yuv_u8, uint8_t, uint8_t);
137 120 KLEIDICV_THREAD_UNARY_OP_IMPL(exp_f32, float, float);
138 100 KLEIDICV_THREAD_UNARY_OP_IMPL(f32_to_s8, float, int8_t);
139 100 KLEIDICV_THREAD_UNARY_OP_IMPL(f32_to_u8, float, uint8_t);
140 100 KLEIDICV_THREAD_UNARY_OP_IMPL(s8_to_f32, int8_t, float);
141 100 KLEIDICV_THREAD_UNARY_OP_IMPL(u8_to_f32, uint8_t, float);
142
143 #define KLEIDICV_THREAD_INRANGE_OP_IMPL(suffix, src_type, dst_type) \
144 kleidicv_error_t kleidicv_thread_##suffix( \
145 const src_type *src, size_t src_stride, dst_type *dst, \
146 size_t dst_stride, size_t width, size_t height, src_type lower_bound, \
147 src_type upper_bound, kleidicv_thread_multithreading mt) { \
148 return kleidicv_thread_unary_op_impl(kleidicv_##suffix, mt, src, \
149 src_stride, dst, dst_stride, width, \
150 height, lower_bound, upper_bound); \
151 }
152
153 100 KLEIDICV_THREAD_INRANGE_OP_IMPL(in_range_u8, uint8_t, uint8_t);
154 100 KLEIDICV_THREAD_INRANGE_OP_IMPL(in_range_f32, float, uint8_t);
155
156 100 kleidicv_error_t kleidicv_thread_threshold_binary_u8(
157 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
158 size_t width, size_t height, uint8_t threshold, uint8_t value,
159 kleidicv_thread_multithreading mt) {
160 200 return kleidicv_thread_unary_op_impl(kleidicv_threshold_binary_u8, mt, src,
161 100 src_stride, dst, dst_stride, width,
162 100 height, threshold, value);
163 }
164
165 124 kleidicv_error_t kleidicv_thread_scale_u8(const uint8_t *src, size_t src_stride,
166 uint8_t *dst, size_t dst_stride,
167 size_t width, size_t height,
168 double scale, double shift,
169 kleidicv_thread_multithreading mt) {
170
4/4
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 120 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 120 times.
124 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
171
4/4
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 116 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 116 times.
120 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
172
5/6
✗ Branch 0 not taken.
✓ Branch 1 taken 116 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 112 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 112 times.
116 CHECK_IMAGE_SIZE(width, height);
173
174 112 const std::array<uint8_t, 256> precalculated_table =
175 112 kleidicv::neon::precalculate_scale_table_u8(scale, shift);
176 112 return kleidicv_thread_unary_op_impl(
177 112 kleidicv::neon::scale_with_precalculated_table_u8, mt, src, src_stride,
178 112 dst, dst_stride, width, height, scale, shift, precalculated_table);
179 124 }
180
181 120 kleidicv_error_t kleidicv_thread_scale_f32(const float *src, size_t src_stride,
182 float *dst, size_t dst_stride,
183 size_t width, size_t height,
184 double scale, double shift,
185 kleidicv_thread_multithreading mt) {
186 240 return kleidicv_thread_unary_op_impl(kleidicv_scale_f32, mt, src, src_stride,
187 120 dst, dst_stride, width, height, scale,
188 120 shift);
189 }
190
191 100 kleidicv_error_t kleidicv_thread_scale_u8_f16(
192 const uint8_t *src, size_t src_stride, float16_t *dst, size_t dst_stride,
193 size_t width, size_t height, double scale, double shift,
194 kleidicv_thread_multithreading mt) {
195 200 return kleidicv_thread_unary_op_impl(kleidicv_scale_u8_f16, mt, src,
196 100 src_stride, dst, dst_stride, width,
197 100 height, scale, shift);
198 }
199
200 #define KLEIDICV_THREAD_BINARY_OP_IMPL(suffix, type) \
201 kleidicv_error_t kleidicv_thread_##suffix( \
202 const type *src_a, size_t src_a_stride, const type *src_b, \
203 size_t src_b_stride, type *dst, size_t dst_stride, size_t width, \
204 size_t height, kleidicv_thread_multithreading mt) { \
205 return kleidicv_thread_binary_op_impl(kleidicv_##suffix, mt, src_a, \
206 src_a_stride, src_b, src_b_stride, \
207 dst, dst_stride, width, height); \
208 }
209
210 #define KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(suffix, type, scaletype) \
211 kleidicv_error_t kleidicv_thread_##suffix( \
212 const type *src_a, size_t src_a_stride, const type *src_b, \
213 size_t src_b_stride, type *dst, size_t dst_stride, size_t width, \
214 size_t height, scaletype scale, kleidicv_thread_multithreading mt) { \
215 return kleidicv_thread_binary_op_impl( \
216 kleidicv_##suffix, mt, src_a, src_a_stride, src_b, src_b_stride, dst, \
217 dst_stride, width, height, scale); \
218 }
219
220 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s8, int8_t);
221 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u8, uint8_t);
222 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s16, int16_t);
223 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u16, uint16_t);
224 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s32, int32_t);
225 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u32, uint32_t);
226 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s64, int64_t);
227 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u64, uint64_t);
228 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s8, int8_t);
229 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u8, uint8_t);
230 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s16, int16_t);
231 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u16, uint16_t);
232 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s32, int32_t);
233 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u32, uint32_t);
234 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s64, int64_t);
235 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u64, uint64_t);
236 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_u8, uint8_t);
237 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s8, int8_t);
238 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_u16, uint16_t);
239 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s16, int16_t);
240 100 KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s32, int32_t);
241 100 KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_u8, uint8_t, double);
242 100 KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s8, int8_t, double);
243 100 KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_u16, uint16_t, double);
244 100 KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s16, int16_t, double);
245 100 KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s32, int32_t, double);
246 100 KLEIDICV_THREAD_BINARY_OP_IMPL(bitwise_and, uint8_t);
247 100 KLEIDICV_THREAD_BINARY_OP_IMPL(compare_equal_u8, uint8_t);
248 100 KLEIDICV_THREAD_BINARY_OP_IMPL(compare_greater_u8, uint8_t);
249
250 100 kleidicv_error_t kleidicv_thread_saturating_add_abs_with_threshold_s16(
251 const int16_t *src_a, size_t src_a_stride, const int16_t *src_b,
252 size_t src_b_stride, int16_t *dst, size_t dst_stride, size_t width,
253 size_t height, int16_t threshold, kleidicv_thread_multithreading mt) {
254 100 return kleidicv_thread_binary_op_impl(
255 100 kleidicv_saturating_add_abs_with_threshold_s16, mt, src_a, src_a_stride,
256 100 src_b, src_b_stride, dst, dst_stride, width, height, threshold);
257 }
258
259 172 kleidicv_error_t kleidicv_thread_rotate(const void *src, size_t src_stride,
260 size_t width, size_t height, void *dst,
261 size_t dst_stride, int angle,
262 size_t element_size,
263 kleidicv_thread_multithreading mt) {
264
2/2
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 160 times.
172 if (!kleidicv::rotate_is_implemented(src, dst, angle, element_size)) {
265 12 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
266 }
267 // reading in columns and writing out rows tends to perform better
268 416 auto callback = [=](unsigned begin, unsigned end) {
269 512 return kleidicv_rotate(
270 256 static_cast<const uint8_t *>(src) + begin * element_size, src_stride,
271 256 end - begin, height, static_cast<uint8_t *>(dst) + begin * dst_stride,
272 256 dst_stride, angle, element_size);
273 };
274 160 return parallel_batches(callback, mt, width, 64);
275 172 }
276
277 135 kleidicv_error_t kleidicv_thread_yuv_p_to_bgr_u8(
278 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
279 size_t width, size_t height, bool is_yv12,
280 kleidicv_thread_multithreading mt) {
281 275 auto callback = [=](unsigned begin, unsigned end) {
282 280 return kleidicv_yuv_p_to_bgr_stripe_u8(
283 140 src, src_stride, dst, dst_stride, width, height, is_yv12,
284 140 static_cast<size_t>(begin), static_cast<size_t>(end));
285 };
286 270 return parallel_batches(callback, mt, (height + 1) / 2);
287 135 }
288
289 135 kleidicv_error_t kleidicv_thread_yuv_p_to_bgra_u8(
290 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
291 size_t width, size_t height, bool is_yv12,
292 kleidicv_thread_multithreading mt) {
293 275 auto callback = [=](unsigned begin, unsigned end) {
294 280 return kleidicv_yuv_p_to_bgra_stripe_u8(
295 140 src, src_stride, dst, dst_stride, width, height, is_yv12,
296 140 static_cast<size_t>(begin), static_cast<size_t>(end));
297 };
298 270 return parallel_batches(callback, mt, (height + 1) / 2);
299 135 }
300
301 135 kleidicv_error_t kleidicv_thread_yuv_p_to_rgb_u8(
302 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
303 size_t width, size_t height, bool is_yv12,
304 kleidicv_thread_multithreading mt) {
305 275 auto callback = [=](unsigned begin, unsigned end) {
306 280 return kleidicv_yuv_p_to_rgb_stripe_u8(
307 140 src, src_stride, dst, dst_stride, width, height, is_yv12,
308 140 static_cast<size_t>(begin), static_cast<size_t>(end));
309 };
310 270 return parallel_batches(callback, mt, (height + 1) / 2);
311 135 }
312
313 135 kleidicv_error_t kleidicv_thread_yuv_p_to_rgba_u8(
314 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
315 size_t width, size_t height, bool is_yv12,
316 kleidicv_thread_multithreading mt) {
317 275 auto callback = [=](unsigned begin, unsigned end) {
318 280 return kleidicv_yuv_p_to_rgba_stripe_u8(
319 140 src, src_stride, dst, dst_stride, width, height, is_yv12,
320 140 static_cast<size_t>(begin), static_cast<size_t>(end));
321 };
322 270 return parallel_batches(callback, mt, (height + 1) / 2);
323 135 }
324
325 70 kleidicv_error_t kleidicv_thread_rgb_to_yuv420_p_u8(
326 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
327 size_t width, size_t height, bool is_yv12,
328 kleidicv_thread_multithreading mt) {
329 145 auto callback = [=](unsigned begin, unsigned end) {
330 150 return kleidicv_rgb_to_yuv420_p_stripe_u8(
331 75 src, src_stride, dst, dst_stride, width, height, is_yv12,
332 75 static_cast<size_t>(begin), static_cast<size_t>(end));
333 };
334 140 return parallel_batches(callback, mt, (height + 1) / 2);
335 70 }
336
337 70 kleidicv_error_t kleidicv_thread_rgba_to_yuv420_p_u8(
338 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
339 size_t width, size_t height, bool is_yv12,
340 kleidicv_thread_multithreading mt) {
341 145 auto callback = [=](unsigned begin, unsigned end) {
342 150 return kleidicv_rgba_to_yuv420_p_stripe_u8(
343 75 src, src_stride, dst, dst_stride, width, height, is_yv12,
344 75 static_cast<size_t>(begin), static_cast<size_t>(end));
345 };
346 140 return parallel_batches(callback, mt, (height + 1) / 2);
347 70 }
348
349 70 kleidicv_error_t kleidicv_thread_bgr_to_yuv420_p_u8(
350 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
351 size_t width, size_t height, bool is_yv12,
352 kleidicv_thread_multithreading mt) {
353 145 auto callback = [=](unsigned begin, unsigned end) {
354 150 return kleidicv_bgr_to_yuv420_p_stripe_u8(
355 75 src, src_stride, dst, dst_stride, width, height, is_yv12,
356 75 static_cast<size_t>(begin), static_cast<size_t>(end));
357 };
358 140 return parallel_batches(callback, mt, (height + 1) / 2);
359 70 }
360
361 70 kleidicv_error_t kleidicv_thread_bgra_to_yuv420_p_u8(
362 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
363 size_t width, size_t height, bool is_yv12,
364 kleidicv_thread_multithreading mt) {
365 145 auto callback = [=](unsigned begin, unsigned end) {
366 150 return kleidicv_bgra_to_yuv420_p_stripe_u8(
367 75 src, src_stride, dst, dst_stride, width, height, is_yv12,
368 75 static_cast<size_t>(begin), static_cast<size_t>(end));
369 };
370 140 return parallel_batches(callback, mt, (height + 1) / 2);
371 70 }
372
373 70 kleidicv_error_t kleidicv_thread_rgb_to_yuv420_sp_u8(
374 const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride,
375 uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height,
376 bool is_nv21, kleidicv_thread_multithreading mt) {
377 145 auto callback = [=](unsigned begin, unsigned end) {
378 150 return kleidicv_rgb_to_yuv420_sp_stripe_u8(
379 75 src, src_stride, y_dst, y_stride, uv_dst, uv_stride, width, height,
380 75 is_nv21, static_cast<size_t>(begin), static_cast<size_t>(end));
381 };
382 140 return parallel_batches(callback, mt, (height + 1) / 2);
383 70 }
384
385 70 kleidicv_error_t kleidicv_thread_rgba_to_yuv420_sp_u8(
386 const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride,
387 uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height,
388 bool is_nv21, kleidicv_thread_multithreading mt) {
389 145 auto callback = [=](unsigned begin, unsigned end) {
390 150 return kleidicv_rgba_to_yuv420_sp_stripe_u8(
391 75 src, src_stride, y_dst, y_stride, uv_dst, uv_stride, width, height,
392 75 is_nv21, static_cast<size_t>(begin), static_cast<size_t>(end));
393 };
394 140 return parallel_batches(callback, mt, (height + 1) / 2);
395 70 }
396
397 70 kleidicv_error_t kleidicv_thread_bgr_to_yuv420_sp_u8(
398 const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride,
399 uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height,
400 bool is_nv21, kleidicv_thread_multithreading mt) {
401 145 auto callback = [=](unsigned begin, unsigned end) {
402 150 return kleidicv_bgr_to_yuv420_sp_stripe_u8(
403 75 src, src_stride, y_dst, y_stride, uv_dst, uv_stride, width, height,
404 75 is_nv21, static_cast<size_t>(begin), static_cast<size_t>(end));
405 };
406 140 return parallel_batches(callback, mt, (height + 1) / 2);
407 70 }
408
409 70 kleidicv_error_t kleidicv_thread_bgra_to_yuv420_sp_u8(
410 const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride,
411 uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height,
412 bool is_nv21, kleidicv_thread_multithreading mt) {
413 145 auto callback = [=](unsigned begin, unsigned end) {
414 150 return kleidicv_bgra_to_yuv420_sp_stripe_u8(
415 75 src, src_stride, y_dst, y_stride, uv_dst, uv_stride, width, height,
416 75 is_nv21, static_cast<size_t>(begin), static_cast<size_t>(end));
417 };
418 140 return parallel_batches(callback, mt, (height + 1) / 2);
419 70 }
420
421 template <typename F>
422 760 inline kleidicv_error_t kleidicv_thread_yuv_sp_to_rgb_u8_impl(
423 F f, const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv,
424 size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width,
425 size_t height, bool is_nv21, kleidicv_thread_multithreading mt) {
426 1520 auto callback = [=](unsigned begin, unsigned end) {
427 760 size_t row_begin = size_t{begin} * 2;
428 760 size_t row_end = std::min<size_t>(height, size_t{end} * 2);
429 760 size_t row_uv = begin;
430 2280 return f(src_y + row_begin * src_y_stride, src_y_stride,
431 760 src_uv + row_uv * src_uv_stride, src_uv_stride,
432 760 dst + row_begin * dst_stride, dst_stride, width,
433 760 row_end - row_begin, is_nv21);
434 760 };
435 1520 return parallel_batches(callback, mt, (height + 1) / 2);
436 760 }
437
438 #define YUV_SP_TO_RGB(suffix) \
439 kleidicv_error_t kleidicv_thread_##suffix( \
440 const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, \
441 size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, \
442 size_t height, bool is_nv21, kleidicv_thread_multithreading mt) { \
443 return kleidicv_thread_yuv_sp_to_rgb_u8_impl( \
444 kleidicv_##suffix, src_y, src_y_stride, src_uv, src_uv_stride, dst, \
445 dst_stride, width, height, is_nv21, mt); \
446 }
447
448 190 YUV_SP_TO_RGB(yuv_sp_to_bgr_u8);
449 190 YUV_SP_TO_RGB(yuv_sp_to_bgra_u8);
450 190 YUV_SP_TO_RGB(yuv_sp_to_rgb_u8);
451 190 YUV_SP_TO_RGB(yuv_sp_to_rgba_u8);
452
453 template <typename ScalarType, typename FunctionType>
454 368 kleidicv_error_t parallel_min_max(FunctionType min_max_func,
455 const ScalarType *src, size_t src_stride,
456 size_t width, size_t height,
457 ScalarType *p_min_value,
458 ScalarType *p_max_value,
459 kleidicv_thread_multithreading mt) {
460 736 std::vector<ScalarType> min_values(height,
461 368 std::numeric_limits<ScalarType>::max());
462 736 std::vector<ScalarType> max_values(height,
463 368 std::numeric_limits<ScalarType>::lowest());
464
465 760 auto callback = [&](unsigned begin, unsigned end) {
466 784 return min_max_func(src + begin * (src_stride / sizeof(ScalarType)),
467 392 src_stride, width, end - begin,
468
12/12
✓ Branch 0 taken 56 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 56 times.
✓ Branch 3 taken 8 times.
✓ Branch 4 taken 56 times.
✓ Branch 5 taken 8 times.
✓ Branch 6 taken 56 times.
✓ Branch 7 taken 8 times.
✓ Branch 8 taken 56 times.
✓ Branch 9 taken 8 times.
✓ Branch 10 taken 64 times.
✓ Branch 11 taken 8 times.
392 p_min_value ? min_values.data() + begin : nullptr,
469
12/12
✓ Branch 0 taken 56 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 56 times.
✓ Branch 3 taken 8 times.
✓ Branch 4 taken 56 times.
✓ Branch 5 taken 8 times.
✓ Branch 6 taken 56 times.
✓ Branch 7 taken 8 times.
✓ Branch 8 taken 56 times.
✓ Branch 9 taken 8 times.
✓ Branch 10 taken 64 times.
✓ Branch 11 taken 8 times.
392 p_max_value ? max_values.data() + begin : nullptr);
470 };
471
472 368 auto return_val = parallel_batches(callback, mt, height);
473
474
12/12
✓ Branch 0 taken 52 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 52 times.
✓ Branch 3 taken 8 times.
✓ Branch 4 taken 52 times.
✓ Branch 5 taken 8 times.
✓ Branch 6 taken 52 times.
✓ Branch 7 taken 8 times.
✓ Branch 8 taken 52 times.
✓ Branch 9 taken 8 times.
✓ Branch 10 taken 60 times.
✓ Branch 11 taken 8 times.
368 if (p_min_value) {
475 320 *p_min_value = std::numeric_limits<ScalarType>::max();
476
12/12
✓ Branch 0 taken 288 times.
✓ Branch 1 taken 52 times.
✓ Branch 2 taken 288 times.
✓ Branch 3 taken 52 times.
✓ Branch 4 taken 288 times.
✓ Branch 5 taken 52 times.
✓ Branch 6 taken 288 times.
✓ Branch 7 taken 52 times.
✓ Branch 8 taken 288 times.
✓ Branch 9 taken 52 times.
✓ Branch 10 taken 368 times.
✓ Branch 11 taken 60 times.
2128 for (ScalarType m : min_values) {
477
12/12
✓ Branch 0 taken 235 times.
✓ Branch 1 taken 53 times.
✓ Branch 2 taken 235 times.
✓ Branch 3 taken 53 times.
✓ Branch 4 taken 233 times.
✓ Branch 5 taken 55 times.
✓ Branch 6 taken 233 times.
✓ Branch 7 taken 55 times.
✓ Branch 8 taken 233 times.
✓ Branch 9 taken 55 times.
✓ Branch 10 taken 306 times.
✓ Branch 11 taken 62 times.
1808 if (m < *p_min_value) {
478 333 *p_min_value = m;
479 333 }
480 1808 }
481 320 }
482
12/12
✓ Branch 0 taken 52 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 52 times.
✓ Branch 3 taken 8 times.
✓ Branch 4 taken 52 times.
✓ Branch 5 taken 8 times.
✓ Branch 6 taken 52 times.
✓ Branch 7 taken 8 times.
✓ Branch 8 taken 52 times.
✓ Branch 9 taken 8 times.
✓ Branch 10 taken 60 times.
✓ Branch 11 taken 8 times.
368 if (p_max_value) {
483 320 *p_max_value = std::numeric_limits<ScalarType>::lowest();
484
12/12
✓ Branch 0 taken 288 times.
✓ Branch 1 taken 52 times.
✓ Branch 2 taken 288 times.
✓ Branch 3 taken 52 times.
✓ Branch 4 taken 288 times.
✓ Branch 5 taken 52 times.
✓ Branch 6 taken 288 times.
✓ Branch 7 taken 52 times.
✓ Branch 8 taken 288 times.
✓ Branch 9 taken 52 times.
✓ Branch 10 taken 368 times.
✓ Branch 11 taken 60 times.
2128 for (ScalarType m : max_values) {
485
12/12
✓ Branch 0 taken 234 times.
✓ Branch 1 taken 54 times.
✓ Branch 2 taken 234 times.
✓ Branch 3 taken 54 times.
✓ Branch 4 taken 232 times.
✓ Branch 5 taken 56 times.
✓ Branch 6 taken 232 times.
✓ Branch 7 taken 56 times.
✓ Branch 8 taken 232 times.
✓ Branch 9 taken 56 times.
✓ Branch 10 taken 307 times.
✓ Branch 11 taken 61 times.
1808 if (m > *p_max_value) {
486 337 *p_max_value = m;
487 337 }
488 1808 }
489 320 }
490 368 return return_val;
491 368 }
492
493 #define DEFINE_KLEIDICV_THREAD_MIN_MAX(suffix, type) \
494 kleidicv_error_t kleidicv_thread_min_max_##suffix( \
495 const type *src, size_t src_stride, size_t width, size_t height, \
496 type *p_min_value, type *p_max_value, \
497 kleidicv_thread_multithreading mt) { \
498 return parallel_min_max(kleidicv_min_max_##suffix, src, src_stride, width, \
499 height, p_min_value, p_max_value, mt); \
500 }
501
502 60 DEFINE_KLEIDICV_THREAD_MIN_MAX(u8, uint8_t);
503 60 DEFINE_KLEIDICV_THREAD_MIN_MAX(s8, int8_t);
504 60 DEFINE_KLEIDICV_THREAD_MIN_MAX(u16, uint16_t);
505 60 DEFINE_KLEIDICV_THREAD_MIN_MAX(s16, int16_t);
506 60 DEFINE_KLEIDICV_THREAD_MIN_MAX(s32, int32_t);
507 68 DEFINE_KLEIDICV_THREAD_MIN_MAX(f32, float);
508
509 template <typename ScalarType, typename FunctionType>
510 60 kleidicv_error_t parallel_min_max_loc(FunctionType min_max_loc_func,
511 const ScalarType *src, size_t src_stride,
512 size_t width, size_t height,
513 size_t *p_min_offset,
514 size_t *p_max_offset,
515 kleidicv_thread_multithreading mt) {
516 60 std::vector<size_t> min_offsets(height, 0);
517 60 std::vector<size_t> max_offsets(height, 0);
518
519 124 auto callback = [&](unsigned begin, unsigned end) {
520 128 return min_max_loc_func(
521 64 src + begin * (src_stride / sizeof(ScalarType)), src_stride, width,
522
2/2
✓ Branch 0 taken 56 times.
✓ Branch 1 taken 8 times.
64 end - begin, p_min_offset ? min_offsets.data() + begin : nullptr,
523
2/2
✓ Branch 0 taken 56 times.
✓ Branch 1 taken 8 times.
64 p_max_offset ? max_offsets.data() + begin : nullptr);
524 };
525 60 auto return_val = parallel_batches(callback, mt, height);
526
527
2/2
✓ Branch 0 taken 52 times.
✓ Branch 1 taken 8 times.
60 if (p_min_offset) {
528 52 *p_min_offset = 0;
529
2/2
✓ Branch 0 taken 288 times.
✓ Branch 1 taken 52 times.
340 for (size_t i = 0; i < min_offsets.size(); ++i) {
530 288 size_t offs = min_offsets[i] + i * src_stride;
531
4/4
✓ Branch 0 taken 246 times.
✓ Branch 1 taken 42 times.
✓ Branch 2 taken 246 times.
✓ Branch 3 taken 42 times.
576 if (src[offs / sizeof(ScalarType)] <
532 288 src[*p_min_offset / sizeof(ScalarType)]) {
533 42 *p_min_offset = offs;
534 42 }
535 288 }
536 52 }
537
2/2
✓ Branch 0 taken 52 times.
✓ Branch 1 taken 8 times.
60 if (p_max_offset) {
538 52 *p_max_offset = 0;
539
2/2
✓ Branch 0 taken 288 times.
✓ Branch 1 taken 52 times.
340 for (size_t i = 0; i < max_offsets.size(); ++i) {
540 288 size_t offs = max_offsets[i] + i * src_stride;
541
4/4
✓ Branch 0 taken 255 times.
✓ Branch 1 taken 33 times.
✓ Branch 2 taken 255 times.
✓ Branch 3 taken 33 times.
576 if (src[offs / sizeof(ScalarType)] >
542 288 src[*p_max_offset / sizeof(ScalarType)]) {
543 33 *p_max_offset = offs;
544 33 }
545 288 }
546 52 }
547 60 return return_val;
548 60 }
549
550 #define DEFINE_KLEIDICV_THREAD_MIN_MAX_LOC(suffix, type) \
551 kleidicv_error_t kleidicv_thread_min_max_loc_##suffix( \
552 const type *src, size_t src_stride, size_t width, size_t height, \
553 size_t *p_min_offset, size_t *p_max_offset, \
554 kleidicv_thread_multithreading mt) { \
555 return parallel_min_max_loc(kleidicv_min_max_loc_##suffix, src, \
556 src_stride, width, height, p_min_offset, \
557 p_max_offset, mt); \
558 }
559
560 60 DEFINE_KLEIDICV_THREAD_MIN_MAX_LOC(u8, uint8_t);
561
562 template <typename F>
563 144 kleidicv_error_t kleidicv_thread_filter(F filter, size_t width, size_t height,
564 size_t channels, size_t kernel_width,
565 size_t kernel_height,
566 kleidicv_filter_context_t *context,
567 kleidicv_thread_multithreading mt) {
568 372 auto callback = [=](unsigned y_begin, unsigned y_end) {
569 // The context contains a buffer that can only fit a single row, so can't be
570 // shared between threads. Since we don't know how many threads there are,
571 // create and destroy a context every time this callback is called. Only use
572 // the context argument for the first thread.
573 228 bool create_context = 0 != y_begin;
574 228 kleidicv_filter_context_t *thread_context = context;
575
12/12
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 16 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 4 times.
✓ Branch 4 taken 28 times.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 28 times.
✓ Branch 7 taken 16 times.
✓ Branch 8 taken 28 times.
✓ Branch 9 taken 16 times.
✓ Branch 10 taken 28 times.
✓ Branch 11 taken 16 times.
228 if (create_context) {
576 168 kleidicv_error_t context_create_result = kleidicv_filter_context_create(
577 84 &thread_context, channels, kernel_width, kernel_height, width,
578 84 height);
579 // Excluded from coverage because it's impractical to test this.
580 // MockMallocToFail can't be used because malloc is used in thread setup.
581 // GCOVR_EXCL_START
582 if (KLEIDICV_OK != context_create_result) {
583 return context_create_result;
584 }
585 // GCOVR_EXCL_STOP
586 84 }
587
588 228 kleidicv_error_t result = filter(y_begin, y_end, thread_context);
589
590
12/12
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 16 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 4 times.
✓ Branch 4 taken 28 times.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 28 times.
✓ Branch 7 taken 16 times.
✓ Branch 8 taken 28 times.
✓ Branch 9 taken 16 times.
✓ Branch 10 taken 28 times.
✓ Branch 11 taken 16 times.
228 if (create_context) {
591 168 kleidicv_error_t context_release_result =
592 84 kleidicv_filter_context_release(thread_context);
593
6/12
✗ Branch 0 not taken.
✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 4 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 16 times.
✗ Branch 6 not taken.
✓ Branch 7 taken 16 times.
✗ Branch 8 not taken.
✓ Branch 9 taken 16 times.
✗ Branch 10 not taken.
✓ Branch 11 taken 16 times.
84 if (KLEIDICV_OK == result) {
594 84 result = context_release_result;
595 84 }
596 84 }
597 228 return result;
598 228 };
599 288 return parallel_batches(callback, mt, height);
600 144 }
601
602 208 kleidicv_error_t kleidicv_thread_gaussian_blur_u8(
603 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
604 size_t width, size_t height, size_t channels, size_t kernel_width,
605 size_t kernel_height, float sigma_x, float sigma_y,
606 kleidicv_border_type_t border_type, kleidicv_filter_context_t *context,
607 kleidicv_thread_multithreading mt) {
608 208 auto fixed_border_type = kleidicv::get_fixed_border_type(border_type);
609
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 204 times.
208 if (!fixed_border_type) {
610 4 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
611 }
612
613
4/4
✓ Branch 0 taken 32 times.
✓ Branch 1 taken 172 times.
✓ Branch 2 taken 32 times.
✓ Branch 3 taken 172 times.
408 if (!kleidicv::gaussian_blur_is_implemented(width, height, kernel_width,
614 204 kernel_height, sigma_x, sigma_y,
615 204 channels, *fixed_border_type)) {
616 172 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
617 }
618
619
4/6
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 28 times.
✓ Branch 2 taken 4 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 4 times.
32 if (kernel_width <= 7 || kernel_width == 15 || kernel_width == 21) {
620 72 auto callback = [=](size_t y_begin, size_t y_end,
621 kleidicv_filter_context_t *thread_context) {
622 88 return kleidicv_gaussian_blur_fixed_stripe_u8(
623 44 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
624 44 channels, kernel_width, kernel_height, sigma_x, sigma_y,
625 44 *fixed_border_type, thread_context);
626 };
627 56 return kleidicv_thread_filter(callback, width, height, channels,
628 28 kernel_width, kernel_height, context, mt);
629 28 }
630
631 12 auto callback = [=](size_t y_begin, size_t y_end,
632 kleidicv_filter_context_t *thread_context) {
633 16 return kleidicv_gaussian_blur_arbitrary_stripe_u8(
634 8 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
635 8 channels, kernel_width, kernel_height, sigma_x, sigma_y,
636 8 *fixed_border_type, thread_context);
637 };
638 8 return kleidicv_thread_filter(callback, width, height, channels, kernel_width,
639 4 kernel_height, context, mt);
640 208 }
641
642 108 kleidicv_error_t kleidicv_thread_separable_filter_2d_u8(
643 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
644 size_t width, size_t height, size_t channels, const uint8_t *kernel_x,
645 size_t kernel_width, const uint8_t *kernel_y, size_t kernel_height,
646 kleidicv_border_type_t border_type, kleidicv_filter_context_t *context,
647 kleidicv_thread_multithreading mt) {
648
4/4
✓ Branch 0 taken 76 times.
✓ Branch 1 taken 32 times.
✓ Branch 2 taken 76 times.
✓ Branch 3 taken 32 times.
216 if (!kleidicv::separable_filter_2d_is_implemented(width, height, kernel_width,
649 108 kernel_height)) {
650 76 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
651 }
652
653 32 auto fixed_border_type = kleidicv::get_fixed_border_type(border_type);
654
2/2
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 4 times.
32 if (!fixed_border_type) {
655 4 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
656 }
657
658 72 auto callback = [=](size_t y_begin, size_t y_end,
659 kleidicv_filter_context_t *thread_context) {
660 88 return kleidicv_separable_filter_2d_stripe_u8(
661 44 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
662 44 channels, kernel_x, kernel_width, kernel_y, kernel_height,
663 44 *fixed_border_type, thread_context);
664 };
665 56 return kleidicv_thread_filter(callback, width, height, channels, kernel_width,
666 28 kernel_height, context, mt);
667 108 }
668
669 108 kleidicv_error_t kleidicv_thread_separable_filter_2d_u16(
670 const uint16_t *src, size_t src_stride, uint16_t *dst, size_t dst_stride,
671 size_t width, size_t height, size_t channels, const uint16_t *kernel_x,
672 size_t kernel_width, const uint16_t *kernel_y, size_t kernel_height,
673 kleidicv_border_type_t border_type, kleidicv_filter_context_t *context,
674 kleidicv_thread_multithreading mt) {
675
4/4
✓ Branch 0 taken 76 times.
✓ Branch 1 taken 32 times.
✓ Branch 2 taken 76 times.
✓ Branch 3 taken 32 times.
216 if (!kleidicv::separable_filter_2d_is_implemented(width, height, kernel_width,
676 108 kernel_height)) {
677 76 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
678 }
679
680 32 auto fixed_border_type = kleidicv::get_fixed_border_type(border_type);
681
2/2
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 4 times.
32 if (!fixed_border_type) {
682 4 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
683 }
684
685 72 auto callback = [=](size_t y_begin, size_t y_end,
686 kleidicv_filter_context_t *thread_context) {
687 88 return kleidicv_separable_filter_2d_stripe_u16(
688 44 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
689 44 channels, kernel_x, kernel_width, kernel_y, kernel_height,
690 44 *fixed_border_type, thread_context);
691 };
692 56 return kleidicv_thread_filter(callback, width, height, channels, kernel_width,
693 28 kernel_height, context, mt);
694 108 }
695
696 108 kleidicv_error_t kleidicv_thread_separable_filter_2d_s16(
697 const int16_t *src, size_t src_stride, int16_t *dst, size_t dst_stride,
698 size_t width, size_t height, size_t channels, const int16_t *kernel_x,
699 size_t kernel_width, const int16_t *kernel_y, size_t kernel_height,
700 kleidicv_border_type_t border_type, kleidicv_filter_context_t *context,
701 kleidicv_thread_multithreading mt) {
702
4/4
✓ Branch 0 taken 76 times.
✓ Branch 1 taken 32 times.
✓ Branch 2 taken 76 times.
✓ Branch 3 taken 32 times.
216 if (!kleidicv::separable_filter_2d_is_implemented(width, height, kernel_width,
703 108 kernel_height)) {
704 76 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
705 }
706
707 32 auto fixed_border_type = kleidicv::get_fixed_border_type(border_type);
708
2/2
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 4 times.
32 if (!fixed_border_type) {
709 4 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
710 }
711
712 72 auto callback = [=](size_t y_begin, size_t y_end,
713 kleidicv_filter_context_t *thread_context) {
714 88 return kleidicv_separable_filter_2d_stripe_s16(
715 44 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
716 44 channels, kernel_x, kernel_width, kernel_y, kernel_height,
717 44 *fixed_border_type, thread_context);
718 };
719 56 return kleidicv_thread_filter(callback, width, height, channels, kernel_width,
720 28 kernel_height, context, mt);
721 108 }
722
723 108 kleidicv_error_t kleidicv_thread_blur_and_downsample_u8(
724 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
725 uint8_t *dst, size_t dst_stride, size_t channels,
726 kleidicv_border_type_t border_type, kleidicv_filter_context_t *context,
727 kleidicv_thread_multithreading mt) {
728
4/4
✓ Branch 0 taken 76 times.
✓ Branch 1 taken 32 times.
✓ Branch 2 taken 76 times.
✓ Branch 3 taken 32 times.
216 if (!kleidicv::blur_and_downsample_is_implemented(src_width, src_height,
729 108 channels)) {
730 76 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
731 }
732
733 32 auto fixed_border_type = kleidicv::get_fixed_border_type(border_type);
734
2/2
✓ Branch 0 taken 28 times.
✓ Branch 1 taken 4 times.
32 if (!fixed_border_type) {
735 4 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
736 }
737
738 72 auto callback = [=](unsigned y_begin, unsigned y_end,
739 kleidicv_filter_context_t *thread_context) {
740 88 return kleidicv_blur_and_downsample_stripe_u8(
741 44 src, src_stride, src_width, src_height, dst, dst_stride, y_begin, y_end,
742 44 channels, *fixed_border_type, thread_context);
743 };
744 56 return kleidicv_thread_filter(callback, src_width, src_height, channels, 5, 5,
745 28 context, mt);
746 108 }
747
748 204 kleidicv_error_t kleidicv_thread_sobel_3x3_horizontal_s16_u8(
749 const uint8_t *src, size_t src_stride, int16_t *dst, size_t dst_stride,
750 size_t width, size_t height, size_t channels,
751 kleidicv_thread_multithreading mt) {
752
2/2
✓ Branch 0 taken 92 times.
✓ Branch 1 taken 112 times.
204 if (!kleidicv::sobel_is_implemented(width, height, 3)) {
753 92 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
754 }
755
756 272 auto callback = [=](unsigned y_begin, unsigned y_end) {
757 320 return kleidicv_sobel_3x3_horizontal_stripe_s16_u8(
758 160 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
759 160 channels);
760 };
761 112 return parallel_batches(callback, mt, height);
762 204 }
763
764 532 kleidicv_error_t kleidicv_thread_median_blur_u8(
765 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
766 size_t width, size_t height, size_t channels, size_t kernel_width,
767 size_t kernel_height, kleidicv_border_type_t border_type,
768 kleidicv_thread_multithreading mt) {
769 1064 auto result_pair = kleidicv::median_blur_is_implemented(
770 532 src, src_stride, dst, dst_stride, width, height, channels, kernel_width,
771 532 kernel_height, border_type);
772
773 532 auto checks_result = result_pair.first;
774 532 auto fixed_border_type = result_pair.second;
775
2/2
✓ Branch 0 taken 416 times.
✓ Branch 1 taken 116 times.
532 if (checks_result != KLEIDICV_OK) {
776 416 return checks_result;
777 }
778
779
2/2
✓ Branch 0 taken 100 times.
✓ Branch 1 taken 16 times.
116 if (kernel_width <= 7) {
780 252 auto callback = [=](unsigned y_begin, unsigned y_end) {
781 304 return kleidicv_median_blur_sorting_network_stripe_u8(
782 152 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
783 152 channels, kernel_width, kernel_height, fixed_border_type);
784 };
785 100 return parallel_batches(callback, mt, height);
786 100 }
787
788
3/4
✓ Branch 0 taken 16 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 12 times.
16 if (kernel_width > 7 && kernel_width <= 15) {
789 36 auto callback = [=](unsigned y_begin, unsigned y_end) {
790 48 return kleidicv_median_blur_small_hist_stripe_u8(
791 24 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
792 24 channels, kernel_width, kernel_height, fixed_border_type);
793 };
794 12 return parallel_batches(callback, mt, height);
795 12 }
796
797 12 auto callback = [=](unsigned y_begin, unsigned y_end) {
798 16 return kleidicv_median_blur_large_hist_stripe_u8(
799 8 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
800 8 channels, kernel_width, kernel_height, fixed_border_type);
801 };
802 4 return parallel_batches(callback, mt, height);
803 532 }
804
805 532 kleidicv_error_t kleidicv_thread_median_blur_s16(
806 const int16_t *src, size_t src_stride, int16_t *dst, size_t dst_stride,
807 size_t width, size_t height, size_t channels, size_t kernel_width,
808 size_t kernel_height, kleidicv_border_type_t border_type,
809 kleidicv_thread_multithreading mt) {
810 1064 auto result_pair = kleidicv::median_blur_is_implemented(
811 532 src, src_stride, dst, dst_stride, width, height, channels, kernel_width,
812 532 kernel_height, border_type);
813
814 532 auto checks_result = result_pair.first;
815 532 auto fixed_border_type = result_pair.second;
816
2/2
✓ Branch 0 taken 432 times.
✓ Branch 1 taken 100 times.
532 if (checks_result != KLEIDICV_OK) {
817 432 return checks_result;
818 }
819
820 252 auto callback = [=](unsigned y_begin, unsigned y_end) {
821 304 return kleidicv_median_blur_sorting_network_stripe_s16(
822 152 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
823 152 channels, kernel_width, kernel_height, fixed_border_type);
824 };
825 100 return parallel_batches(callback, mt, height);
826 532 }
827
828 532 kleidicv_error_t kleidicv_thread_median_blur_u16(
829 const uint16_t *src, size_t src_stride, uint16_t *dst, size_t dst_stride,
830 size_t width, size_t height, size_t channels, size_t kernel_width,
831 size_t kernel_height, kleidicv_border_type_t border_type,
832 kleidicv_thread_multithreading mt) {
833 1064 auto result_pair = kleidicv::median_blur_is_implemented(
834 532 src, src_stride, dst, dst_stride, width, height, channels, kernel_width,
835 532 kernel_height, border_type);
836
837 532 auto checks_result = result_pair.first;
838 532 auto fixed_border_type = result_pair.second;
839
2/2
✓ Branch 0 taken 432 times.
✓ Branch 1 taken 100 times.
532 if (checks_result != KLEIDICV_OK) {
840 432 return checks_result;
841 }
842
843 252 auto callback = [=](unsigned y_begin, unsigned y_end) {
844 304 return kleidicv_median_blur_sorting_network_stripe_u16(
845 152 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
846 152 channels, kernel_width, kernel_height, fixed_border_type);
847 };
848 100 return parallel_batches(callback, mt, height);
849 532 }
850
851 532 kleidicv_error_t kleidicv_thread_median_blur_f32(
852 const float *src, size_t src_stride, float *dst, size_t dst_stride,
853 size_t width, size_t height, size_t channels, size_t kernel_width,
854 size_t kernel_height, kleidicv_border_type_t border_type,
855 kleidicv_thread_multithreading mt) {
856 1064 auto result_pair = kleidicv::median_blur_is_implemented(
857 532 src, src_stride, dst, dst_stride, width, height, channels, kernel_width,
858 532 kernel_height, border_type);
859
860 532 auto checks_result = result_pair.first;
861 532 auto fixed_border_type = result_pair.second;
862
2/2
✓ Branch 0 taken 432 times.
✓ Branch 1 taken 100 times.
532 if (checks_result != KLEIDICV_OK) {
863 432 return checks_result;
864 }
865
866 252 auto callback = [=](unsigned y_begin, unsigned y_end) {
867 304 return kleidicv_median_blur_sorting_network_stripe_f32(
868 152 src, src_stride, dst, dst_stride, width, height, y_begin, y_end,
869 152 channels, kernel_width, kernel_height, fixed_border_type);
870 };
871 100 return parallel_batches(callback, mt, height);
872 532 }
873
874 204 kleidicv_error_t kleidicv_thread_sobel_3x3_vertical_s16_u8(
875 const uint8_t *src, size_t src_stride, int16_t *dst, size_t dst_stride,
876 size_t width, size_t height, size_t channels,
877 kleidicv_thread_multithreading mt) {
878
2/2
✓ Branch 0 taken 92 times.
✓ Branch 1 taken 112 times.
204 if (!kleidicv::sobel_is_implemented(width, height, 3)) {
879 92 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
880 }
881
882 272 auto callback = [=](unsigned y_begin, unsigned y_end) {
883 320 return kleidicv_sobel_3x3_vertical_stripe_s16_u8(src, src_stride, dst,
884 160 dst_stride, width, height,
885 160 y_begin, y_end, channels);
886 };
887 112 return parallel_batches(callback, mt, height);
888 204 }
889
890 104 kleidicv_error_t kleidicv_thread_scharr_interleaved_s16_u8(
891 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
892 size_t src_channels, int16_t *dst, size_t dst_stride,
893 kleidicv_thread_multithreading mt) {
894
4/4
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 100 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 100 times.
208 if (!kleidicv::scharr_interleaved_is_implemented(src_width, src_height,
895 104 src_channels)) {
896 4 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
897 }
898
899 244 auto callback = [=](unsigned y_begin, unsigned y_end) {
900 288 return kleidicv_scharr_interleaved_stripe_s16_u8(
901 144 src, src_stride, src_width, src_height, src_channels, dst, dst_stride,
902 144 y_begin, y_end);
903 };
904
905 // height is decremented by 2 as the result has less rows.
906 100 return parallel_batches(callback, mt, src_height - 2);
907 104 }
908
909 120 kleidicv_error_t kleidicv_thread_resize_to_quarter_u8(
910 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
911 uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
912 kleidicv_thread_multithreading mt) {
913 240 auto callback = [=](unsigned begin, unsigned end) {
914 120 size_t src_begin = size_t{begin} * 2;
915 120 size_t src_end = std::min<size_t>(src_height, size_t{end} * 2);
916 120 size_t dst_begin = begin;
917 120 size_t dst_end = std::min<size_t>(dst_height, end);
918
919 // half of odd height is rounded towards zero?
920
2/2
✓ Branch 0 taken 10 times.
✓ Branch 1 taken 110 times.
120 if (dst_begin == dst_end) {
921 10 return KLEIDICV_OK;
922 }
923
924 220 return kleidicv_resize_to_quarter_u8(
925 110 src + src_begin * src_stride, src_stride, src_width,
926 110 src_end - src_begin, dst + dst_begin * dst_stride, dst_stride,
927 110 dst_width, dst_end - dst_begin);
928 120 };
929 240 return parallel_batches(callback, mt, (src_height + 1) / 2);
930 120 }
931
932 125 kleidicv_error_t kleidicv_thread_resize_linear_u8(
933 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
934 uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
935 kleidicv_thread_multithreading mt) {
936
4/4
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 120 times.
✓ Branch 2 taken 5 times.
✓ Branch 3 taken 120 times.
250 if (!kleidicv::resize_linear_u8_is_implemented(src_width, src_height,
937 125 dst_width, dst_height)) {
938 5 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
939 }
940 250 auto callback = [=](unsigned y_begin, unsigned y_end) {
941 260 return kleidicv_resize_linear_stripe_u8(
942 130 src, src_stride, src_width, src_height, y_begin,
943 130 std::min<size_t>(src_height, y_end + 1), dst, dst_stride, dst_width,
944 130 dst_height);
945 };
946 120 return parallel_batches(callback, mt, std::max<size_t>(1, src_height - 1));
947 125 }
948
949 185 kleidicv_error_t kleidicv_thread_resize_linear_f32(
950 const float *src, size_t src_stride, size_t src_width, size_t src_height,
951 float *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
952 kleidicv_thread_multithreading mt) {
953
4/4
✓ Branch 0 taken 5 times.
✓ Branch 1 taken 180 times.
✓ Branch 2 taken 5 times.
✓ Branch 3 taken 180 times.
370 if (!kleidicv::resize_linear_f32_is_implemented(src_width, src_height,
954 185 dst_width, dst_height)) {
955 5 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
956 }
957 375 auto callback = [=](unsigned y_begin, unsigned y_end) {
958 390 return kleidicv_resize_linear_stripe_f32(
959 195 src, src_stride, src_width, src_height, y_begin,
960 195 std::min<size_t>(src_height, y_end + 1), dst, dst_stride, dst_width,
961 195 dst_height);
962 };
963 180 return parallel_batches(callback, mt, std::max<size_t>(1, src_height - 1));
964 185 }
965
966 208 kleidicv_error_t kleidicv_thread_remap_s16_u8(
967 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
968 uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
969 size_t channels, const int16_t *mapxy, size_t mapxy_stride,
970 kleidicv_border_type_t border_type, const uint8_t *border_value,
971 kleidicv_thread_multithreading mt) {
972
4/4
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 200 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 200 times.
416 if (!kleidicv::remap_s16_is_implemented<uint8_t>(src_stride, src_width,
973 208 src_height, dst_width,
974 208 border_type, channels)) {
975 8 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
976 }
977 488 auto callback = [=](unsigned begin, unsigned end) {
978 576 return kleidicv_remap_s16_u8(
979 288 src, src_stride, src_width, src_height,
980 288 dst + begin * dst_stride / sizeof(uint8_t), dst_stride, dst_width,
981 288 end - begin, channels,
982 288 mapxy + static_cast<ptrdiff_t>(begin * mapxy_stride / sizeof(int16_t)),
983 288 mapxy_stride, border_type, border_value);
984 };
985 200 return parallel_batches(callback, mt, dst_height);
986 208 }
987
988 208 kleidicv_error_t kleidicv_thread_remap_s16_u16(
989 const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height,
990 uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
991 size_t channels, const int16_t *mapxy, size_t mapxy_stride,
992 kleidicv_border_type_t border_type, const uint16_t *border_value,
993 kleidicv_thread_multithreading mt) {
994
4/4
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 200 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 200 times.
416 if (!kleidicv::remap_s16_is_implemented<uint16_t>(src_stride, src_width,
995 208 src_height, dst_width,
996 208 border_type, channels)) {
997 8 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
998 }
999 488 auto callback = [=](unsigned begin, unsigned end) {
1000 576 return kleidicv_remap_s16_u16(
1001 288 src, src_stride, src_width, src_height,
1002 288 dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint16_t)),
1003 288 dst_stride, dst_width, end - begin, channels,
1004 288 mapxy + static_cast<ptrdiff_t>(begin * mapxy_stride / sizeof(int16_t)),
1005 288 mapxy_stride, border_type, border_value);
1006 };
1007 200 return parallel_batches(callback, mt, dst_height);
1008 208 }
1009
1010 408 kleidicv_error_t kleidicv_thread_remap_s16point5_u8(
1011 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
1012 uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
1013 size_t channels, const int16_t *mapxy, size_t mapxy_stride,
1014 const uint16_t *mapfrac, size_t mapfrac_stride,
1015 kleidicv_border_type_t border_type, const uint8_t *border_value,
1016 kleidicv_thread_multithreading mt) {
1017
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 400 times.
408 if (!kleidicv::remap_s16point5_is_implemented<uint8_t>(
1018 408 src_stride, src_width, src_height, dst_width, border_type,
1019 408 channels)) {
1020 8 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
1021 }
1022 976 auto callback = [=](unsigned begin, unsigned end) {
1023 1152 return kleidicv_remap_s16point5_u8(
1024 576 src, src_stride, src_width, src_height,
1025 576 dst + begin * dst_stride / sizeof(uint8_t), dst_stride, dst_width,
1026 576 end - begin, channels,
1027 576 mapxy + static_cast<ptrdiff_t>(begin * mapxy_stride / sizeof(int16_t)),
1028 576 mapxy_stride,
1029 1152 mapfrac +
1030 576 static_cast<ptrdiff_t>(begin * mapfrac_stride / sizeof(uint16_t)),
1031 576 mapfrac_stride, border_type, border_value);
1032 };
1033 400 return parallel_batches(callback, mt, dst_height);
1034 408 }
1035
1036 408 kleidicv_error_t kleidicv_thread_remap_s16point5_u16(
1037 const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height,
1038 uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
1039 size_t channels, const int16_t *mapxy, size_t mapxy_stride,
1040 const uint16_t *mapfrac, size_t mapfrac_stride,
1041 kleidicv_border_type_t border_type, const uint16_t *border_value,
1042 kleidicv_thread_multithreading mt) {
1043
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 400 times.
408 if (!kleidicv::remap_s16point5_is_implemented<uint16_t>(
1044 408 src_stride, src_width, src_height, dst_width, border_type,
1045 408 channels)) {
1046 8 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
1047 }
1048 976 auto callback = [=](unsigned begin, unsigned end) {
1049 1152 return kleidicv_remap_s16point5_u16(
1050 576 src, src_stride, src_width, src_height,
1051 576 dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint16_t)),
1052 576 dst_stride, dst_width, end - begin, channels,
1053 576 mapxy + static_cast<ptrdiff_t>(begin * mapxy_stride / sizeof(int16_t)),
1054 576 mapxy_stride,
1055 1152 mapfrac +
1056 576 static_cast<ptrdiff_t>(begin * mapfrac_stride / sizeof(uint16_t)),
1057 576 mapfrac_stride, border_type, border_value);
1058 };
1059 400 return parallel_batches(callback, mt, dst_height);
1060 408 }
1061
1062 408 kleidicv_error_t kleidicv_thread_remap_f32_u8(
1063 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
1064 uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
1065 size_t channels, const float *mapx, size_t mapx_stride, const float *mapy,
1066 size_t mapy_stride, kleidicv_interpolation_type_t interpolation,
1067 kleidicv_border_type_t border_type, const uint8_t *border_value,
1068 kleidicv_thread_multithreading mt) {
1069
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 400 times.
408 if (!kleidicv::remap_f32_is_implemented<uint8_t>(
1070 408 src_stride, src_width, src_height, dst_width, dst_height, border_type,
1071 408 channels, interpolation)) {
1072 8 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
1073 }
1074 976 auto callback = [=](unsigned begin, unsigned end) {
1075 1152 return kleidicv_remap_f32_u8(
1076 576 src, src_stride, src_width, src_height,
1077 576 dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint8_t)),
1078 576 dst_stride, dst_width, end - begin, channels,
1079 576 mapx + static_cast<ptrdiff_t>(begin * mapx_stride / sizeof(float)),
1080 576 mapx_stride,
1081 576 mapy + static_cast<ptrdiff_t>(begin * mapy_stride / sizeof(float)),
1082 576 mapy_stride, interpolation, border_type, border_value);
1083 };
1084 400 return parallel_batches(callback, mt, dst_height);
1085 408 }
1086
1087 408 kleidicv_error_t kleidicv_thread_remap_f32_u16(
1088 const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height,
1089 uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
1090 size_t channels, const float *mapx, size_t mapx_stride, const float *mapy,
1091 size_t mapy_stride, kleidicv_interpolation_type_t interpolation,
1092 kleidicv_border_type_t border_type, const uint16_t *border_value,
1093 kleidicv_thread_multithreading mt) {
1094
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 400 times.
408 if (!kleidicv::remap_f32_is_implemented<uint16_t>(
1095 408 src_stride, src_width, src_height, dst_width, dst_height, border_type,
1096 408 channels, interpolation)) {
1097 8 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
1098 }
1099 976 auto callback = [=](unsigned begin, unsigned end) {
1100 1152 return kleidicv_remap_f32_u16(
1101 576 src, src_stride, src_width, src_height,
1102 576 dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint16_t)),
1103 576 dst_stride, dst_width, end - begin, channels,
1104 576 mapx + static_cast<ptrdiff_t>(begin * mapx_stride / sizeof(float)),
1105 576 mapx_stride,
1106 576 mapy + static_cast<ptrdiff_t>(begin * mapy_stride / sizeof(float)),
1107 576 mapy_stride, interpolation, border_type, border_value);
1108 };
1109 400 return parallel_batches(callback, mt, dst_height);
1110 408 }
1111
1112 216 kleidicv_error_t kleidicv_thread_warp_perspective_u8(
1113 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
1114 uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
1115 const float transformation[9], size_t channels,
1116 kleidicv_interpolation_type_t interpolation,
1117 kleidicv_border_type_t border_type, const uint8_t *border_value,
1118 kleidicv_thread_multithreading mt) {
1119
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 200 times.
216 if (!kleidicv::warp_perspective_is_implemented<uint8_t>(
1120 216 dst_width, channels, interpolation, border_type)) {
1121 16 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
1122 }
1123
1124 488 auto callback = [=](unsigned y_begin, unsigned y_end) {
1125 576 return kleidicv_warp_perspective_stripe_u8(
1126 288 src, src_stride, src_width, src_height, dst, dst_stride, dst_width,
1127 288 dst_height, y_begin, std::min<size_t>(dst_height, y_end + 1),
1128 288 transformation, channels, interpolation, border_type, border_value);
1129 };
1130 200 return parallel_batches(callback, mt, dst_height);
1131 216 }
1132