Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #include "kleidicv_thread/kleidicv_thread.h" | ||
6 | |||
7 | #include <algorithm> | ||
8 | #include <cstddef> | ||
9 | #include <cstdint> | ||
10 | #include <functional> | ||
11 | #include <limits> | ||
12 | #include <vector> | ||
13 | |||
14 | #include "kleidicv/arithmetics/rotate.h" | ||
15 | #include "kleidicv/arithmetics/scale.h" | ||
16 | #include "kleidicv/conversions/rgb_to_yuv_420.h" | ||
17 | #include "kleidicv/conversions/yuv_420_to_rgb.h" | ||
18 | #include "kleidicv/ctypes.h" | ||
19 | #include "kleidicv/filters/blur_and_downsample.h" | ||
20 | #include "kleidicv/filters/gaussian_blur.h" | ||
21 | #include "kleidicv/filters/median_blur.h" | ||
22 | #include "kleidicv/filters/scharr.h" | ||
23 | #include "kleidicv/filters/separable_filter_2d.h" | ||
24 | #include "kleidicv/filters/sobel.h" | ||
25 | #include "kleidicv/kleidicv.h" | ||
26 | #include "kleidicv/resize/resize_linear.h" | ||
27 | #include "kleidicv/transform/remap.h" | ||
28 | #include "kleidicv/transform/warp_perspective.h" | ||
29 | |||
30 | typedef std::function<kleidicv_error_t(unsigned, unsigned)> FunctionCallback; | ||
31 | |||
32 | 12110 | static kleidicv_error_t kleidicv_thread_std_function_callback( | |
33 | unsigned task_begin, unsigned task_end, void *data) { | ||
34 | 12110 | auto *callback = reinterpret_cast<FunctionCallback *>(data); | |
35 | 24220 | return (*callback)(task_begin, task_end); | |
36 | 12110 | } | |
37 | |||
38 | // Operations in the Neon backend have both a vector path and a scalar path. | ||
39 | // The vector path is used to process most data and the scalar path is used to | ||
40 | // process the parts of the data that don't fit into the vector width. | ||
41 | // For floating point operations in particular, the results may be very slightly | ||
42 | // different between vector and scalar paths. | ||
43 | // When using multithreading, images are divided into parts to be processed by | ||
44 | // each thread, and this could change which parts of the data end up being | ||
45 | // processed by the vector and scalar paths. Since the threading may be | ||
46 | // non-deterministic in how it divides up the image, this non-determinism could | ||
47 | // leak through in the values of the output. This could cause subtle bugs. | ||
48 | // | ||
49 | // To avoid this problem, this function passes data to each thread in batches | ||
50 | // that are a multiple of the Neon vector width in size (16 bytes). The | ||
51 | // exception is the last batch, which may be longer in order to extend to the | ||
52 | // end of the data. No batch can be shorter than vector length as this could | ||
53 | // cause different behaviour for operations that try to avoid the tail loop (see | ||
54 | // the TryToAvoidTailLoop class) - this technique only works if the data is | ||
55 | // longer than vector length. | ||
56 | // | ||
57 | // Typically with how this function is used, batches will be 16 image rows or | ||
58 | // row pairs, which is likely to be far coarser alignment than is needed. | ||
59 | // However it's unlikely that threading on a finer-grained level would provide a | ||
60 | // performance benefit. | ||
61 | template <typename Callback> | ||
62 | 8967 | inline kleidicv_error_t parallel_batches(Callback callback, | |
63 | kleidicv_thread_multithreading mt, | ||
64 | unsigned count, | ||
65 | unsigned min_batch_size = 16) { | ||
66 | 8967 | const unsigned task_count = std::max(1U, (count) / min_batch_size); | |
67 | 21077 | FunctionCallback f = [=](unsigned task_begin, unsigned task_end) { | |
68 | 12110 | unsigned begin = task_begin * min_batch_size, | |
69 | 12110 | end = task_end * min_batch_size; | |
70 |
138/140✓ Branch 0 taken 594 times.
✓ Branch 1 taken 1350 times.
✓ Branch 2 taken 36 times.
✓ Branch 3 taken 90 times.
✓ Branch 4 taken 33 times.
✓ Branch 5 taken 75 times.
✓ Branch 6 taken 33 times.
✓ Branch 7 taken 75 times.
✓ Branch 8 taken 33 times.
✓ Branch 9 taken 75 times.
✓ Branch 10 taken 33 times.
✓ Branch 11 taken 75 times.
✓ Branch 12 taken 66 times.
✓ Branch 13 taken 150 times.
✓ Branch 14 taken 33 times.
✓ Branch 15 taken 75 times.
✓ Branch 16 taken 33 times.
✓ Branch 17 taken 84 times.
✓ Branch 18 taken 36 times.
✓ Branch 19 taken 90 times.
✓ Branch 20 taken 99 times.
✓ Branch 21 taken 225 times.
✓ Branch 22 taken 198 times.
✓ Branch 23 taken 450 times.
✓ Branch 24 taken 99 times.
✓ Branch 25 taken 225 times.
✓ Branch 26 taken 99 times.
✓ Branch 27 taken 225 times.
✓ Branch 28 taken 99 times.
✓ Branch 29 taken 225 times.
✓ Branch 30 taken 66 times.
✓ Branch 31 taken 150 times.
✓ Branch 32 taken 66 times.
✓ Branch 33 taken 150 times.
✓ Branch 34 taken 66 times.
✓ Branch 35 taken 150 times.
✓ Branch 36 taken 33 times.
✓ Branch 37 taken 75 times.
✓ Branch 38 taken 33 times.
✓ Branch 39 taken 75 times.
✓ Branch 40 taken 33 times.
✓ Branch 41 taken 75 times.
✓ Branch 42 taken 33 times.
✓ Branch 43 taken 75 times.
✓ Branch 44 taken 33 times.
✓ Branch 45 taken 75 times.
✓ Branch 46 taken 33 times.
✓ Branch 47 taken 75 times.
✓ Branch 48 taken 72 times.
✓ Branch 49 taken 120 times.
✓ Branch 50 taken 4 times.
✓ Branch 51 taken 108 times.
✓ Branch 52 taken 4 times.
✓ Branch 53 taken 108 times.
✓ Branch 54 taken 4 times.
✓ Branch 55 taken 108 times.
✓ Branch 56 taken 4 times.
✓ Branch 57 taken 108 times.
✓ Branch 58 taken 4 times.
✓ Branch 59 taken 56 times.
✓ Branch 60 taken 4 times.
✓ Branch 61 taken 56 times.
✓ Branch 62 taken 4 times.
✓ Branch 63 taken 56 times.
✓ Branch 64 taken 4 times.
✓ Branch 65 taken 56 times.
✓ Branch 66 taken 4 times.
✓ Branch 67 taken 56 times.
✓ Branch 68 taken 4 times.
✓ Branch 69 taken 56 times.
✓ Branch 70 taken 4 times.
✓ Branch 71 taken 56 times.
✓ Branch 72 taken 4 times.
✓ Branch 73 taken 56 times.
✗ Branch 74 not taken.
✓ Branch 75 taken 608 times.
✓ Branch 76 taken 3 times.
✓ Branch 77 taken 45 times.
✓ Branch 78 taken 3 times.
✓ Branch 79 taken 45 times.
✓ Branch 80 taken 3 times.
✓ Branch 81 taken 45 times.
✓ Branch 82 taken 3 times.
✓ Branch 83 taken 45 times.
✓ Branch 84 taken 3 times.
✓ Branch 85 taken 45 times.
✓ Branch 86 taken 3 times.
✓ Branch 87 taken 51 times.
✓ Branch 88 taken 3 times.
✓ Branch 89 taken 45 times.
✓ Branch 90 taken 12 times.
✓ Branch 91 taken 21 times.
✓ Branch 92 taken 3 times.
✓ Branch 93 taken 3 times.
✓ Branch 94 taken 12 times.
✓ Branch 95 taken 21 times.
✓ Branch 96 taken 12 times.
✓ Branch 97 taken 21 times.
✓ Branch 98 taken 12 times.
✓ Branch 99 taken 21 times.
✓ Branch 100 taken 12 times.
✓ Branch 101 taken 21 times.
✓ Branch 102 taken 36 times.
✓ Branch 103 taken 84 times.
✓ Branch 104 taken 39 times.
✓ Branch 105 taken 75 times.
✓ Branch 106 taken 9 times.
✓ Branch 107 taken 9 times.
✓ Branch 108 taken 3 times.
✓ Branch 109 taken 3 times.
✓ Branch 110 taken 39 times.
✓ Branch 111 taken 75 times.
✓ Branch 112 taken 39 times.
✓ Branch 113 taken 75 times.
✓ Branch 114 taken 39 times.
✓ Branch 115 taken 75 times.
✓ Branch 116 taken 36 times.
✓ Branch 117 taken 84 times.
✓ Branch 118 taken 33 times.
✓ Branch 119 taken 75 times.
✗ Branch 120 not taken.
✓ Branch 121 taken 96 times.
✓ Branch 122 taken 8 times.
✓ Branch 123 taken 96 times.
✓ Branch 124 taken 12 times.
✓ Branch 125 taken 144 times.
✓ Branch 126 taken 66 times.
✓ Branch 127 taken 150 times.
✓ Branch 128 taken 66 times.
✓ Branch 129 taken 150 times.
✓ Branch 130 taken 132 times.
✓ Branch 131 taken 300 times.
✓ Branch 132 taken 132 times.
✓ Branch 133 taken 300 times.
✓ Branch 134 taken 132 times.
✓ Branch 135 taken 300 times.
✓ Branch 136 taken 132 times.
✓ Branch 137 taken 300 times.
✓ Branch 138 taken 66 times.
✓ Branch 139 taken 150 times.
|
12110 | if (task_end == task_count) { |
71 | 8967 | end = count; | |
72 | 8967 | } | |
73 | 24220 | return callback(begin, end); | |
74 | 12110 | }; | |
75 | 17934 | return mt.parallel(kleidicv_thread_std_function_callback, &f, | |
76 | 8967 | mt.parallel_data, task_count); | |
77 | 8967 | } | |
78 | |||
79 | template <typename SrcT, typename DstT, typename F, typename... Args> | ||
80 | 2139 | inline kleidicv_error_t kleidicv_thread_unary_op_impl( | |
81 | F f, kleidicv_thread_multithreading mt, const SrcT *src, size_t src_stride, | ||
82 | DstT *dst, size_t dst_stride, size_t width, size_t height, Args... args) { | ||
83 | 5208 | auto callback = [=](unsigned begin, unsigned end) { | |
84 | 6138 | return f(src + static_cast<ptrdiff_t>(begin * src_stride / sizeof(SrcT)), | |
85 | 3069 | src_stride, | |
86 | 3069 | dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(DstT)), | |
87 | 3069 | dst_stride, width, end - begin, args...); | |
88 | }; | ||
89 | 4278 | return parallel_batches(callback, mt, height); | |
90 | 2139 | } | |
91 | |||
92 | template <typename SrcT, typename DstT, typename F, typename... Args> | ||
93 | 2250 | inline kleidicv_error_t kleidicv_thread_binary_op_impl( | |
94 | F f, kleidicv_thread_multithreading mt, const SrcT *src_a, | ||
95 | size_t src_a_stride, const SrcT *src_b, size_t src_b_stride, DstT *dst, | ||
96 | size_t dst_stride, size_t width, size_t height, Args... args) { | ||
97 | 5490 | auto callback = [=](unsigned begin, unsigned end) { | |
98 | 6480 | return f( | |
99 | 3240 | src_a + static_cast<ptrdiff_t>(begin * src_a_stride / sizeof(SrcT)), | |
100 | 3240 | src_a_stride, | |
101 | 3240 | src_b + static_cast<ptrdiff_t>(begin * src_b_stride / sizeof(SrcT)), | |
102 | 3240 | src_b_stride, | |
103 | 3240 | dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(DstT)), | |
104 | 3240 | dst_stride, width, end - begin, args...); | |
105 | }; | ||
106 | 4500 | return parallel_batches(callback, mt, height); | |
107 | 2250 | } | |
108 | |||
109 | #define KLEIDICV_THREAD_UNARY_OP_IMPL(suffix, src_type, dst_type) \ | ||
110 | kleidicv_error_t kleidicv_thread_##suffix( \ | ||
111 | const src_type *src, size_t src_stride, dst_type *dst, \ | ||
112 | size_t dst_stride, size_t width, size_t height, \ | ||
113 | kleidicv_thread_multithreading mt) { \ | ||
114 | return kleidicv_thread_unary_op_impl(kleidicv_##suffix, mt, src, \ | ||
115 | src_stride, dst, dst_stride, width, \ | ||
116 | height); \ | ||
117 | } | ||
118 | |||
119 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(gray_to_rgb_u8, uint8_t, uint8_t); | |
120 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(gray_to_rgba_u8, uint8_t, uint8_t); | |
121 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_bgr_u8, uint8_t, uint8_t); | |
122 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_rgb_u8, uint8_t, uint8_t); | |
123 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_bgra_u8, uint8_t, uint8_t); | |
124 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_rgba_u8, uint8_t, uint8_t); | |
125 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_bgra_u8, uint8_t, uint8_t); | |
126 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_rgba_u8, uint8_t, uint8_t); | |
127 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_bgr_u8, uint8_t, uint8_t); | |
128 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_rgb_u8, uint8_t, uint8_t); | |
129 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(yuv_to_bgr_u8, uint8_t, uint8_t); | |
130 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(yuv_to_bgra_u8, uint8_t, uint8_t); | |
131 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(yuv_to_rgb_u8, uint8_t, uint8_t); | |
132 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(yuv_to_rgba_u8, uint8_t, uint8_t); | |
133 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(bgr_to_yuv_u8, uint8_t, uint8_t); | |
134 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_yuv_u8, uint8_t, uint8_t); | |
135 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(bgra_to_yuv_u8, uint8_t, uint8_t); | |
136 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_yuv_u8, uint8_t, uint8_t); | |
137 | 90 | KLEIDICV_THREAD_UNARY_OP_IMPL(exp_f32, float, float); | |
138 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(f32_to_s8, float, int8_t); | |
139 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(f32_to_u8, float, uint8_t); | |
140 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(s8_to_f32, int8_t, float); | |
141 | 75 | KLEIDICV_THREAD_UNARY_OP_IMPL(u8_to_f32, uint8_t, float); | |
142 | |||
143 | #define KLEIDICV_THREAD_INRANGE_OP_IMPL(suffix, src_type, dst_type) \ | ||
144 | kleidicv_error_t kleidicv_thread_##suffix( \ | ||
145 | const src_type *src, size_t src_stride, dst_type *dst, \ | ||
146 | size_t dst_stride, size_t width, size_t height, src_type lower_bound, \ | ||
147 | src_type upper_bound, kleidicv_thread_multithreading mt) { \ | ||
148 | return kleidicv_thread_unary_op_impl(kleidicv_##suffix, mt, src, \ | ||
149 | src_stride, dst, dst_stride, width, \ | ||
150 | height, lower_bound, upper_bound); \ | ||
151 | } | ||
152 | |||
153 | 75 | KLEIDICV_THREAD_INRANGE_OP_IMPL(in_range_u8, uint8_t, uint8_t); | |
154 | 75 | KLEIDICV_THREAD_INRANGE_OP_IMPL(in_range_f32, float, uint8_t); | |
155 | |||
156 | 75 | kleidicv_error_t kleidicv_thread_threshold_binary_u8( | |
157 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
158 | size_t width, size_t height, uint8_t threshold, uint8_t value, | ||
159 | kleidicv_thread_multithreading mt) { | ||
160 | 150 | return kleidicv_thread_unary_op_impl(kleidicv_threshold_binary_u8, mt, src, | |
161 | 75 | src_stride, dst, dst_stride, width, | |
162 | 75 | height, threshold, value); | |
163 | } | ||
164 | |||
165 | 93 | kleidicv_error_t kleidicv_thread_scale_u8(const uint8_t *src, size_t src_stride, | |
166 | uint8_t *dst, size_t dst_stride, | ||
167 | size_t width, size_t height, | ||
168 | float scale, float shift, | ||
169 | kleidicv_thread_multithreading mt) { | ||
170 |
4/4✓ Branch 0 taken 3 times.
✓ Branch 1 taken 90 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 90 times.
|
93 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
171 |
4/4✓ Branch 0 taken 3 times.
✓ Branch 1 taken 87 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 87 times.
|
90 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
172 |
5/6✗ Branch 0 not taken.
✓ Branch 1 taken 87 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 84 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 84 times.
|
87 | CHECK_IMAGE_SIZE(width, height); |
173 | |||
174 | 84 | const std::array<uint8_t, 256> precalculated_table = | |
175 | 84 | kleidicv::neon::precalculate_scale_table_u8(scale, shift); | |
176 | 84 | return kleidicv_thread_unary_op_impl( | |
177 | 84 | kleidicv::neon::scale_with_precalculated_table, mt, src, src_stride, dst, | |
178 | 84 | dst_stride, width, height, scale, shift, precalculated_table); | |
179 | 93 | } | |
180 | |||
181 | 90 | kleidicv_error_t kleidicv_thread_scale_f32(const float *src, size_t src_stride, | |
182 | float *dst, size_t dst_stride, | ||
183 | size_t width, size_t height, | ||
184 | float scale, float shift, | ||
185 | kleidicv_thread_multithreading mt) { | ||
186 | 180 | return kleidicv_thread_unary_op_impl(kleidicv_scale_f32, mt, src, src_stride, | |
187 | 90 | dst, dst_stride, width, height, scale, | |
188 | 90 | shift); | |
189 | } | ||
190 | |||
191 | #define KLEIDICV_THREAD_BINARY_OP_IMPL(suffix, type) \ | ||
192 | kleidicv_error_t kleidicv_thread_##suffix( \ | ||
193 | const type *src_a, size_t src_a_stride, const type *src_b, \ | ||
194 | size_t src_b_stride, type *dst, size_t dst_stride, size_t width, \ | ||
195 | size_t height, kleidicv_thread_multithreading mt) { \ | ||
196 | return kleidicv_thread_binary_op_impl(kleidicv_##suffix, mt, src_a, \ | ||
197 | src_a_stride, src_b, src_b_stride, \ | ||
198 | dst, dst_stride, width, height); \ | ||
199 | } | ||
200 | |||
201 | #define KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(suffix, type, scaletype) \ | ||
202 | kleidicv_error_t kleidicv_thread_##suffix( \ | ||
203 | const type *src_a, size_t src_a_stride, const type *src_b, \ | ||
204 | size_t src_b_stride, type *dst, size_t dst_stride, size_t width, \ | ||
205 | size_t height, scaletype scale, kleidicv_thread_multithreading mt) { \ | ||
206 | return kleidicv_thread_binary_op_impl( \ | ||
207 | kleidicv_##suffix, mt, src_a, src_a_stride, src_b, src_b_stride, dst, \ | ||
208 | dst_stride, width, height, scale); \ | ||
209 | } | ||
210 | |||
211 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s8, int8_t); | |
212 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u8, uint8_t); | |
213 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s16, int16_t); | |
214 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u16, uint16_t); | |
215 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s32, int32_t); | |
216 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u32, uint32_t); | |
217 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s64, int64_t); | |
218 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u64, uint64_t); | |
219 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s8, int8_t); | |
220 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u8, uint8_t); | |
221 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s16, int16_t); | |
222 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u16, uint16_t); | |
223 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s32, int32_t); | |
224 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u32, uint32_t); | |
225 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s64, int64_t); | |
226 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u64, uint64_t); | |
227 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_u8, uint8_t); | |
228 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s8, int8_t); | |
229 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_u16, uint16_t); | |
230 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s16, int16_t); | |
231 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s32, int32_t); | |
232 | 75 | KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_u8, uint8_t, double); | |
233 | 75 | KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s8, int8_t, double); | |
234 | 75 | KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_u16, uint16_t, double); | |
235 | 75 | KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s16, int16_t, double); | |
236 | 75 | KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s32, int32_t, double); | |
237 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(bitwise_and, uint8_t); | |
238 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(compare_equal_u8, uint8_t); | |
239 | 75 | KLEIDICV_THREAD_BINARY_OP_IMPL(compare_greater_u8, uint8_t); | |
240 | |||
241 | 75 | kleidicv_error_t kleidicv_thread_saturating_add_abs_with_threshold_s16( | |
242 | const int16_t *src_a, size_t src_a_stride, const int16_t *src_b, | ||
243 | size_t src_b_stride, int16_t *dst, size_t dst_stride, size_t width, | ||
244 | size_t height, int16_t threshold, kleidicv_thread_multithreading mt) { | ||
245 | 75 | return kleidicv_thread_binary_op_impl( | |
246 | 75 | kleidicv_saturating_add_abs_with_threshold_s16, mt, src_a, src_a_stride, | |
247 | 75 | src_b, src_b_stride, dst, dst_stride, width, height, threshold); | |
248 | } | ||
249 | |||
250 | 129 | kleidicv_error_t kleidicv_thread_rotate(const void *src, size_t src_stride, | |
251 | size_t width, size_t height, void *dst, | ||
252 | size_t dst_stride, int angle, | ||
253 | size_t element_size, | ||
254 | kleidicv_thread_multithreading mt) { | ||
255 |
2/2✓ Branch 0 taken 9 times.
✓ Branch 1 taken 120 times.
|
129 | if (!kleidicv::rotate_is_implemented(src, dst, angle, element_size)) { |
256 | 9 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
257 | } | ||
258 | // reading in columns and writing out rows tends to perform better | ||
259 | 312 | auto callback = [=](unsigned begin, unsigned end) { | |
260 | 384 | return kleidicv_rotate( | |
261 | 192 | static_cast<const uint8_t *>(src) + begin * element_size, src_stride, | |
262 | 192 | end - begin, height, static_cast<uint8_t *>(dst) + begin * dst_stride, | |
263 | 192 | dst_stride, angle, element_size); | |
264 | }; | ||
265 | 120 | return parallel_batches(callback, mt, width, 64); | |
266 | 129 | } | |
267 | |||
268 | 108 | kleidicv_error_t kleidicv_thread_yuv_p_to_bgr_u8( | |
269 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
270 | size_t width, size_t height, bool is_yv12, | ||
271 | kleidicv_thread_multithreading mt) { | ||
272 | 220 | auto callback = [=](unsigned begin, unsigned end) { | |
273 | 224 | return kleidicv_yuv_p_to_bgr_stripe_u8( | |
274 | 112 | src, src_stride, dst, dst_stride, width, height, is_yv12, | |
275 | 112 | static_cast<size_t>(begin), static_cast<size_t>(end)); | |
276 | }; | ||
277 | 216 | return parallel_batches(callback, mt, (height + 1) / 2); | |
278 | 108 | } | |
279 | |||
280 | 108 | kleidicv_error_t kleidicv_thread_yuv_p_to_bgra_u8( | |
281 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
282 | size_t width, size_t height, bool is_yv12, | ||
283 | kleidicv_thread_multithreading mt) { | ||
284 | 220 | auto callback = [=](unsigned begin, unsigned end) { | |
285 | 224 | return kleidicv_yuv_p_to_bgra_stripe_u8( | |
286 | 112 | src, src_stride, dst, dst_stride, width, height, is_yv12, | |
287 | 112 | static_cast<size_t>(begin), static_cast<size_t>(end)); | |
288 | }; | ||
289 | 216 | return parallel_batches(callback, mt, (height + 1) / 2); | |
290 | 108 | } | |
291 | |||
292 | 108 | kleidicv_error_t kleidicv_thread_yuv_p_to_rgb_u8( | |
293 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
294 | size_t width, size_t height, bool is_yv12, | ||
295 | kleidicv_thread_multithreading mt) { | ||
296 | 220 | auto callback = [=](unsigned begin, unsigned end) { | |
297 | 224 | return kleidicv_yuv_p_to_rgb_stripe_u8( | |
298 | 112 | src, src_stride, dst, dst_stride, width, height, is_yv12, | |
299 | 112 | static_cast<size_t>(begin), static_cast<size_t>(end)); | |
300 | }; | ||
301 | 216 | return parallel_batches(callback, mt, (height + 1) / 2); | |
302 | 108 | } | |
303 | |||
304 | 108 | kleidicv_error_t kleidicv_thread_yuv_p_to_rgba_u8( | |
305 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
306 | size_t width, size_t height, bool is_yv12, | ||
307 | kleidicv_thread_multithreading mt) { | ||
308 | 220 | auto callback = [=](unsigned begin, unsigned end) { | |
309 | 224 | return kleidicv_yuv_p_to_rgba_stripe_u8( | |
310 | 112 | src, src_stride, dst, dst_stride, width, height, is_yv12, | |
311 | 112 | static_cast<size_t>(begin), static_cast<size_t>(end)); | |
312 | }; | ||
313 | 216 | return parallel_batches(callback, mt, (height + 1) / 2); | |
314 | 108 | } | |
315 | |||
316 | 56 | kleidicv_error_t kleidicv_thread_rgb_to_yuv420_p_u8( | |
317 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
318 | size_t width, size_t height, bool is_yv12, | ||
319 | kleidicv_thread_multithreading mt) { | ||
320 | 116 | auto callback = [=](unsigned begin, unsigned end) { | |
321 | 120 | return kleidicv_rgb_to_yuv420_p_stripe_u8( | |
322 | 60 | src, src_stride, dst, dst_stride, width, height, is_yv12, | |
323 | 60 | static_cast<size_t>(begin), static_cast<size_t>(end)); | |
324 | }; | ||
325 | 112 | return parallel_batches(callback, mt, (height + 1) / 2); | |
326 | 56 | } | |
327 | |||
328 | 56 | kleidicv_error_t kleidicv_thread_rgba_to_yuv420_p_u8( | |
329 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
330 | size_t width, size_t height, bool is_yv12, | ||
331 | kleidicv_thread_multithreading mt) { | ||
332 | 116 | auto callback = [=](unsigned begin, unsigned end) { | |
333 | 120 | return kleidicv_rgba_to_yuv420_p_stripe_u8( | |
334 | 60 | src, src_stride, dst, dst_stride, width, height, is_yv12, | |
335 | 60 | static_cast<size_t>(begin), static_cast<size_t>(end)); | |
336 | }; | ||
337 | 112 | return parallel_batches(callback, mt, (height + 1) / 2); | |
338 | 56 | } | |
339 | |||
340 | 56 | kleidicv_error_t kleidicv_thread_bgr_to_yuv420_p_u8( | |
341 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
342 | size_t width, size_t height, bool is_yv12, | ||
343 | kleidicv_thread_multithreading mt) { | ||
344 | 116 | auto callback = [=](unsigned begin, unsigned end) { | |
345 | 120 | return kleidicv_bgr_to_yuv420_p_stripe_u8( | |
346 | 60 | src, src_stride, dst, dst_stride, width, height, is_yv12, | |
347 | 60 | static_cast<size_t>(begin), static_cast<size_t>(end)); | |
348 | }; | ||
349 | 112 | return parallel_batches(callback, mt, (height + 1) / 2); | |
350 | 56 | } | |
351 | |||
352 | 56 | kleidicv_error_t kleidicv_thread_bgra_to_yuv420_p_u8( | |
353 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
354 | size_t width, size_t height, bool is_yv12, | ||
355 | kleidicv_thread_multithreading mt) { | ||
356 | 116 | auto callback = [=](unsigned begin, unsigned end) { | |
357 | 120 | return kleidicv_bgra_to_yuv420_p_stripe_u8( | |
358 | 60 | src, src_stride, dst, dst_stride, width, height, is_yv12, | |
359 | 60 | static_cast<size_t>(begin), static_cast<size_t>(end)); | |
360 | }; | ||
361 | 112 | return parallel_batches(callback, mt, (height + 1) / 2); | |
362 | 56 | } | |
363 | |||
364 | 56 | kleidicv_error_t kleidicv_thread_rgb_to_yuv420_sp_u8( | |
365 | const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride, | ||
366 | uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height, | ||
367 | bool is_nv21, kleidicv_thread_multithreading mt) { | ||
368 | 116 | auto callback = [=](unsigned begin, unsigned end) { | |
369 | 120 | return kleidicv_rgb_to_yuv420_sp_stripe_u8( | |
370 | 60 | src, src_stride, y_dst, y_stride, uv_dst, uv_stride, width, height, | |
371 | 60 | is_nv21, static_cast<size_t>(begin), static_cast<size_t>(end)); | |
372 | }; | ||
373 | 112 | return parallel_batches(callback, mt, (height + 1) / 2); | |
374 | 56 | } | |
375 | |||
376 | 56 | kleidicv_error_t kleidicv_thread_rgba_to_yuv420_sp_u8( | |
377 | const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride, | ||
378 | uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height, | ||
379 | bool is_nv21, kleidicv_thread_multithreading mt) { | ||
380 | 116 | auto callback = [=](unsigned begin, unsigned end) { | |
381 | 120 | return kleidicv_rgba_to_yuv420_sp_stripe_u8( | |
382 | 60 | src, src_stride, y_dst, y_stride, uv_dst, uv_stride, width, height, | |
383 | 60 | is_nv21, static_cast<size_t>(begin), static_cast<size_t>(end)); | |
384 | }; | ||
385 | 112 | return parallel_batches(callback, mt, (height + 1) / 2); | |
386 | 56 | } | |
387 | |||
388 | 56 | kleidicv_error_t kleidicv_thread_bgr_to_yuv420_sp_u8( | |
389 | const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride, | ||
390 | uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height, | ||
391 | bool is_nv21, kleidicv_thread_multithreading mt) { | ||
392 | 116 | auto callback = [=](unsigned begin, unsigned end) { | |
393 | 120 | return kleidicv_bgr_to_yuv420_sp_stripe_u8( | |
394 | 60 | src, src_stride, y_dst, y_stride, uv_dst, uv_stride, width, height, | |
395 | 60 | is_nv21, static_cast<size_t>(begin), static_cast<size_t>(end)); | |
396 | }; | ||
397 | 112 | return parallel_batches(callback, mt, (height + 1) / 2); | |
398 | 56 | } | |
399 | |||
400 | 56 | kleidicv_error_t kleidicv_thread_bgra_to_yuv420_sp_u8( | |
401 | const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride, | ||
402 | uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height, | ||
403 | bool is_nv21, kleidicv_thread_multithreading mt) { | ||
404 | 116 | auto callback = [=](unsigned begin, unsigned end) { | |
405 | 120 | return kleidicv_bgra_to_yuv420_sp_stripe_u8( | |
406 | 60 | src, src_stride, y_dst, y_stride, uv_dst, uv_stride, width, height, | |
407 | 60 | is_nv21, static_cast<size_t>(begin), static_cast<size_t>(end)); | |
408 | }; | ||
409 | 112 | return parallel_batches(callback, mt, (height + 1) / 2); | |
410 | 56 | } | |
411 | |||
412 | template <typename F> | ||
413 | 608 | inline kleidicv_error_t kleidicv_thread_yuv_sp_to_rgb_u8_impl( | |
414 | F f, const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, | ||
415 | size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, | ||
416 | size_t height, bool is_nv21, kleidicv_thread_multithreading mt) { | ||
417 | 1216 | auto callback = [=](unsigned begin, unsigned end) { | |
418 | 608 | size_t row_begin = size_t{begin} * 2; | |
419 | 608 | size_t row_end = std::min<size_t>(height, size_t{end} * 2); | |
420 | 608 | size_t row_uv = begin; | |
421 | 1824 | return f(src_y + row_begin * src_y_stride, src_y_stride, | |
422 | 608 | src_uv + row_uv * src_uv_stride, src_uv_stride, | |
423 | 608 | dst + row_begin * dst_stride, dst_stride, width, | |
424 | 608 | row_end - row_begin, is_nv21); | |
425 | 608 | }; | |
426 | 1216 | return parallel_batches(callback, mt, (height + 1) / 2); | |
427 | 608 | } | |
428 | |||
429 | #define YUV_SP_TO_RGB(suffix) \ | ||
430 | kleidicv_error_t kleidicv_thread_##suffix( \ | ||
431 | const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, \ | ||
432 | size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, \ | ||
433 | size_t height, bool is_nv21, kleidicv_thread_multithreading mt) { \ | ||
434 | return kleidicv_thread_yuv_sp_to_rgb_u8_impl( \ | ||
435 | kleidicv_##suffix, src_y, src_y_stride, src_uv, src_uv_stride, dst, \ | ||
436 | dst_stride, width, height, is_nv21, mt); \ | ||
437 | } | ||
438 | |||
439 | 152 | YUV_SP_TO_RGB(yuv_sp_to_bgr_u8); | |
440 | 152 | YUV_SP_TO_RGB(yuv_sp_to_bgra_u8); | |
441 | 152 | YUV_SP_TO_RGB(yuv_sp_to_rgb_u8); | |
442 | 152 | YUV_SP_TO_RGB(yuv_sp_to_rgba_u8); | |
443 | |||
444 | template <typename ScalarType, typename FunctionType> | ||
445 | 276 | kleidicv_error_t parallel_min_max(FunctionType min_max_func, | |
446 | const ScalarType *src, size_t src_stride, | ||
447 | size_t width, size_t height, | ||
448 | ScalarType *p_min_value, | ||
449 | ScalarType *p_max_value, | ||
450 | kleidicv_thread_multithreading mt) { | ||
451 | 552 | std::vector<ScalarType> min_values(height, | |
452 | 276 | std::numeric_limits<ScalarType>::max()); | |
453 | 552 | std::vector<ScalarType> max_values(height, | |
454 | 276 | std::numeric_limits<ScalarType>::lowest()); | |
455 | |||
456 | 570 | auto callback = [&](unsigned begin, unsigned end) { | |
457 | 588 | return min_max_func(src + begin * (src_stride / sizeof(ScalarType)), | |
458 | 294 | src_stride, width, end - begin, | |
459 |
12/12✓ Branch 0 taken 42 times.
✓ Branch 1 taken 6 times.
✓ Branch 2 taken 42 times.
✓ Branch 3 taken 6 times.
✓ Branch 4 taken 42 times.
✓ Branch 5 taken 6 times.
✓ Branch 6 taken 42 times.
✓ Branch 7 taken 6 times.
✓ Branch 8 taken 42 times.
✓ Branch 9 taken 6 times.
✓ Branch 10 taken 48 times.
✓ Branch 11 taken 6 times.
|
294 | p_min_value ? min_values.data() + begin : nullptr, |
460 |
12/12✓ Branch 0 taken 42 times.
✓ Branch 1 taken 6 times.
✓ Branch 2 taken 42 times.
✓ Branch 3 taken 6 times.
✓ Branch 4 taken 42 times.
✓ Branch 5 taken 6 times.
✓ Branch 6 taken 42 times.
✓ Branch 7 taken 6 times.
✓ Branch 8 taken 42 times.
✓ Branch 9 taken 6 times.
✓ Branch 10 taken 48 times.
✓ Branch 11 taken 6 times.
|
294 | p_max_value ? max_values.data() + begin : nullptr); |
461 | }; | ||
462 | |||
463 | 276 | auto return_val = parallel_batches(callback, mt, height); | |
464 | |||
465 |
12/12✓ Branch 0 taken 39 times.
✓ Branch 1 taken 6 times.
✓ Branch 2 taken 39 times.
✓ Branch 3 taken 6 times.
✓ Branch 4 taken 39 times.
✓ Branch 5 taken 6 times.
✓ Branch 6 taken 39 times.
✓ Branch 7 taken 6 times.
✓ Branch 8 taken 39 times.
✓ Branch 9 taken 6 times.
✓ Branch 10 taken 45 times.
✓ Branch 11 taken 6 times.
|
276 | if (p_min_value) { |
466 | 240 | *p_min_value = std::numeric_limits<ScalarType>::max(); | |
467 |
12/12✓ Branch 0 taken 216 times.
✓ Branch 1 taken 39 times.
✓ Branch 2 taken 216 times.
✓ Branch 3 taken 39 times.
✓ Branch 4 taken 216 times.
✓ Branch 5 taken 39 times.
✓ Branch 6 taken 216 times.
✓ Branch 7 taken 39 times.
✓ Branch 8 taken 216 times.
✓ Branch 9 taken 39 times.
✓ Branch 10 taken 276 times.
✓ Branch 11 taken 45 times.
|
1596 | for (ScalarType m : min_values) { |
468 |
12/12✓ Branch 0 taken 177 times.
✓ Branch 1 taken 39 times.
✓ Branch 2 taken 177 times.
✓ Branch 3 taken 39 times.
✓ Branch 4 taken 176 times.
✓ Branch 5 taken 40 times.
✓ Branch 6 taken 176 times.
✓ Branch 7 taken 40 times.
✓ Branch 8 taken 176 times.
✓ Branch 9 taken 40 times.
✓ Branch 10 taken 230 times.
✓ Branch 11 taken 46 times.
|
1356 | if (m < *p_min_value) { |
469 | 244 | *p_min_value = m; | |
470 | 244 | } | |
471 | 1356 | } | |
472 | 240 | } | |
473 |
12/12✓ Branch 0 taken 39 times.
✓ Branch 1 taken 6 times.
✓ Branch 2 taken 39 times.
✓ Branch 3 taken 6 times.
✓ Branch 4 taken 39 times.
✓ Branch 5 taken 6 times.
✓ Branch 6 taken 39 times.
✓ Branch 7 taken 6 times.
✓ Branch 8 taken 39 times.
✓ Branch 9 taken 6 times.
✓ Branch 10 taken 45 times.
✓ Branch 11 taken 6 times.
|
276 | if (p_max_value) { |
474 | 240 | *p_max_value = std::numeric_limits<ScalarType>::lowest(); | |
475 |
12/12✓ Branch 0 taken 216 times.
✓ Branch 1 taken 39 times.
✓ Branch 2 taken 216 times.
✓ Branch 3 taken 39 times.
✓ Branch 4 taken 216 times.
✓ Branch 5 taken 39 times.
✓ Branch 6 taken 216 times.
✓ Branch 7 taken 39 times.
✓ Branch 8 taken 216 times.
✓ Branch 9 taken 39 times.
✓ Branch 10 taken 276 times.
✓ Branch 11 taken 45 times.
|
1596 | for (ScalarType m : max_values) { |
476 |
12/12✓ Branch 0 taken 176 times.
✓ Branch 1 taken 40 times.
✓ Branch 2 taken 176 times.
✓ Branch 3 taken 40 times.
✓ Branch 4 taken 175 times.
✓ Branch 5 taken 41 times.
✓ Branch 6 taken 175 times.
✓ Branch 7 taken 41 times.
✓ Branch 8 taken 175 times.
✓ Branch 9 taken 41 times.
✓ Branch 10 taken 230 times.
✓ Branch 11 taken 46 times.
|
1356 | if (m > *p_max_value) { |
477 | 249 | *p_max_value = m; | |
478 | 249 | } | |
479 | 1356 | } | |
480 | 240 | } | |
481 | 276 | return return_val; | |
482 | 276 | } | |
483 | |||
484 | #define DEFINE_KLEIDICV_THREAD_MIN_MAX(suffix, type) \ | ||
485 | kleidicv_error_t kleidicv_thread_min_max_##suffix( \ | ||
486 | const type *src, size_t src_stride, size_t width, size_t height, \ | ||
487 | type *p_min_value, type *p_max_value, \ | ||
488 | kleidicv_thread_multithreading mt) { \ | ||
489 | return parallel_min_max(kleidicv_min_max_##suffix, src, src_stride, width, \ | ||
490 | height, p_min_value, p_max_value, mt); \ | ||
491 | } | ||
492 | |||
493 | 45 | DEFINE_KLEIDICV_THREAD_MIN_MAX(u8, uint8_t); | |
494 | 45 | DEFINE_KLEIDICV_THREAD_MIN_MAX(s8, int8_t); | |
495 | 45 | DEFINE_KLEIDICV_THREAD_MIN_MAX(u16, uint16_t); | |
496 | 45 | DEFINE_KLEIDICV_THREAD_MIN_MAX(s16, int16_t); | |
497 | 45 | DEFINE_KLEIDICV_THREAD_MIN_MAX(s32, int32_t); | |
498 | 51 | DEFINE_KLEIDICV_THREAD_MIN_MAX(f32, float); | |
499 | |||
500 | template <typename ScalarType, typename FunctionType> | ||
501 | 45 | kleidicv_error_t parallel_min_max_loc(FunctionType min_max_loc_func, | |
502 | const ScalarType *src, size_t src_stride, | ||
503 | size_t width, size_t height, | ||
504 | size_t *p_min_offset, | ||
505 | size_t *p_max_offset, | ||
506 | kleidicv_thread_multithreading mt) { | ||
507 | 45 | std::vector<size_t> min_offsets(height, 0); | |
508 | 45 | std::vector<size_t> max_offsets(height, 0); | |
509 | |||
510 | 93 | auto callback = [&](unsigned begin, unsigned end) { | |
511 | 96 | return min_max_loc_func( | |
512 | 48 | src + begin * (src_stride / sizeof(ScalarType)), src_stride, width, | |
513 |
2/2✓ Branch 0 taken 42 times.
✓ Branch 1 taken 6 times.
|
48 | end - begin, p_min_offset ? min_offsets.data() + begin : nullptr, |
514 |
2/2✓ Branch 0 taken 42 times.
✓ Branch 1 taken 6 times.
|
48 | p_max_offset ? max_offsets.data() + begin : nullptr); |
515 | }; | ||
516 | 45 | auto return_val = parallel_batches(callback, mt, height); | |
517 | |||
518 |
2/2✓ Branch 0 taken 39 times.
✓ Branch 1 taken 6 times.
|
45 | if (p_min_offset) { |
519 | 39 | *p_min_offset = 0; | |
520 |
2/2✓ Branch 0 taken 216 times.
✓ Branch 1 taken 39 times.
|
255 | for (size_t i = 0; i < min_offsets.size(); ++i) { |
521 | 216 | size_t offs = min_offsets[i] + i * src_stride; | |
522 |
4/4✓ Branch 0 taken 190 times.
✓ Branch 1 taken 26 times.
✓ Branch 2 taken 190 times.
✓ Branch 3 taken 26 times.
|
432 | if (src[offs / sizeof(ScalarType)] < |
523 | 216 | src[*p_min_offset / sizeof(ScalarType)]) { | |
524 | 26 | *p_min_offset = offs; | |
525 | 26 | } | |
526 | 216 | } | |
527 | 39 | } | |
528 |
2/2✓ Branch 0 taken 39 times.
✓ Branch 1 taken 6 times.
|
45 | if (p_max_offset) { |
529 | 39 | *p_max_offset = 0; | |
530 |
2/2✓ Branch 0 taken 216 times.
✓ Branch 1 taken 39 times.
|
255 | for (size_t i = 0; i < max_offsets.size(); ++i) { |
531 | 216 | size_t offs = max_offsets[i] + i * src_stride; | |
532 |
4/4✓ Branch 0 taken 191 times.
✓ Branch 1 taken 25 times.
✓ Branch 2 taken 191 times.
✓ Branch 3 taken 25 times.
|
432 | if (src[offs / sizeof(ScalarType)] > |
533 | 216 | src[*p_max_offset / sizeof(ScalarType)]) { | |
534 | 25 | *p_max_offset = offs; | |
535 | 25 | } | |
536 | 216 | } | |
537 | 39 | } | |
538 | 45 | return return_val; | |
539 | 45 | } | |
540 | |||
541 | #define DEFINE_KLEIDICV_THREAD_MIN_MAX_LOC(suffix, type) \ | ||
542 | kleidicv_error_t kleidicv_thread_min_max_loc_##suffix( \ | ||
543 | const type *src, size_t src_stride, size_t width, size_t height, \ | ||
544 | size_t *p_min_offset, size_t *p_max_offset, \ | ||
545 | kleidicv_thread_multithreading mt) { \ | ||
546 | return parallel_min_max_loc(kleidicv_min_max_loc_##suffix, src, \ | ||
547 | src_stride, width, height, p_min_offset, \ | ||
548 | p_max_offset, mt); \ | ||
549 | } | ||
550 | |||
551 | 45 | DEFINE_KLEIDICV_THREAD_MIN_MAX_LOC(u8, uint8_t); | |
552 | |||
553 | template <typename F> | ||
554 | 108 | kleidicv_error_t kleidicv_thread_filter(F filter, size_t width, size_t height, | |
555 | size_t channels, size_t kernel_width, | ||
556 | size_t kernel_height, | ||
557 | kleidicv_filter_context_t *context, | ||
558 | kleidicv_thread_multithreading mt) { | ||
559 | 279 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
560 | // The context contains a buffer that can only fit a single row, so can't be | ||
561 | // shared between threads. Since we don't know how many threads there are, | ||
562 | // create and destroy a context every time this callback is called. Only use | ||
563 | // the context argument for the first thread. | ||
564 | 171 | bool create_context = 0 != y_begin; | |
565 | 171 | kleidicv_filter_context_t *thread_context = context; | |
566 |
12/12✓ Branch 0 taken 21 times.
✓ Branch 1 taken 12 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 3 times.
✓ Branch 4 taken 21 times.
✓ Branch 5 taken 12 times.
✓ Branch 6 taken 21 times.
✓ Branch 7 taken 12 times.
✓ Branch 8 taken 21 times.
✓ Branch 9 taken 12 times.
✓ Branch 10 taken 21 times.
✓ Branch 11 taken 12 times.
|
171 | if (create_context) { |
567 | 126 | kleidicv_error_t context_create_result = kleidicv_filter_context_create( | |
568 | 63 | &thread_context, channels, kernel_width, kernel_height, width, | |
569 | 63 | height); | |
570 | // Excluded from coverage because it's impractical to test this. | ||
571 | // MockMallocToFail can't be used because malloc is used in thread setup. | ||
572 | // GCOVR_EXCL_START | ||
573 | − | if (KLEIDICV_OK != context_create_result) { | |
574 | − | return context_create_result; | |
575 | } | ||
576 | // GCOVR_EXCL_STOP | ||
577 | 63 | } | |
578 | |||
579 | 171 | kleidicv_error_t result = filter(y_begin, y_end, thread_context); | |
580 | |||
581 |
12/12✓ Branch 0 taken 21 times.
✓ Branch 1 taken 12 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 3 times.
✓ Branch 4 taken 21 times.
✓ Branch 5 taken 12 times.
✓ Branch 6 taken 21 times.
✓ Branch 7 taken 12 times.
✓ Branch 8 taken 21 times.
✓ Branch 9 taken 12 times.
✓ Branch 10 taken 21 times.
✓ Branch 11 taken 12 times.
|
171 | if (create_context) { |
582 | 126 | kleidicv_error_t context_release_result = | |
583 | 63 | kleidicv_filter_context_release(thread_context); | |
584 |
6/12✗ Branch 0 not taken.
✓ Branch 1 taken 12 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 3 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 12 times.
✗ Branch 6 not taken.
✓ Branch 7 taken 12 times.
✗ Branch 8 not taken.
✓ Branch 9 taken 12 times.
✗ Branch 10 not taken.
✓ Branch 11 taken 12 times.
|
63 | if (KLEIDICV_OK == result) { |
585 | 63 | result = context_release_result; | |
586 | 63 | } | |
587 | 63 | } | |
588 | 171 | return result; | |
589 | 171 | }; | |
590 | 216 | return parallel_batches(callback, mt, height); | |
591 | 108 | } | |
592 | |||
593 | 156 | kleidicv_error_t kleidicv_thread_gaussian_blur_u8( | |
594 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
595 | size_t width, size_t height, size_t channels, size_t kernel_width, | ||
596 | size_t kernel_height, float sigma_x, float sigma_y, | ||
597 | kleidicv_border_type_t border_type, kleidicv_filter_context_t *context, | ||
598 | kleidicv_thread_multithreading mt) { | ||
599 | 156 | auto fixed_border_type = kleidicv::get_fixed_border_type(border_type); | |
600 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 153 times.
|
156 | if (!fixed_border_type) { |
601 | 3 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
602 | } | ||
603 | |||
604 |
4/4✓ Branch 0 taken 24 times.
✓ Branch 1 taken 129 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 129 times.
|
306 | if (!kleidicv::gaussian_blur_is_implemented(width, height, kernel_width, |
605 | 153 | kernel_height, sigma_x, sigma_y, | |
606 | 153 | channels, *fixed_border_type)) { | |
607 | 129 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
608 | } | ||
609 | |||
610 |
4/6✓ Branch 0 taken 3 times.
✓ Branch 1 taken 21 times.
✓ Branch 2 taken 3 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 3 times.
|
24 | if (kernel_width <= 7 || kernel_width == 15 || kernel_width == 21) { |
611 | 54 | auto callback = [=](size_t y_begin, size_t y_end, | |
612 | kleidicv_filter_context_t *thread_context) { | ||
613 | 66 | return kleidicv_gaussian_blur_fixed_stripe_u8( | |
614 | 33 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
615 | 33 | channels, kernel_width, kernel_height, sigma_x, sigma_y, | |
616 | 33 | *fixed_border_type, thread_context); | |
617 | }; | ||
618 | 42 | return kleidicv_thread_filter(callback, width, height, channels, | |
619 | 21 | kernel_width, kernel_height, context, mt); | |
620 | 21 | } | |
621 | |||
622 | 9 | auto callback = [=](size_t y_begin, size_t y_end, | |
623 | kleidicv_filter_context_t *thread_context) { | ||
624 | 12 | return kleidicv_gaussian_blur_arbitrary_stripe_u8( | |
625 | 6 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
626 | 6 | channels, kernel_width, kernel_height, sigma_x, sigma_y, | |
627 | 6 | *fixed_border_type, thread_context); | |
628 | }; | ||
629 | 6 | return kleidicv_thread_filter(callback, width, height, channels, kernel_width, | |
630 | 3 | kernel_height, context, mt); | |
631 | 156 | } | |
632 | |||
633 | 81 | kleidicv_error_t kleidicv_thread_separable_filter_2d_u8( | |
634 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
635 | size_t width, size_t height, size_t channels, const uint8_t *kernel_x, | ||
636 | size_t kernel_width, const uint8_t *kernel_y, size_t kernel_height, | ||
637 | kleidicv_border_type_t border_type, kleidicv_filter_context_t *context, | ||
638 | kleidicv_thread_multithreading mt) { | ||
639 |
4/4✓ Branch 0 taken 57 times.
✓ Branch 1 taken 24 times.
✓ Branch 2 taken 57 times.
✓ Branch 3 taken 24 times.
|
162 | if (!kleidicv::separable_filter_2d_is_implemented(width, height, kernel_width, |
640 | 81 | kernel_height)) { | |
641 | 57 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
642 | } | ||
643 | |||
644 | 24 | auto fixed_border_type = kleidicv::get_fixed_border_type(border_type); | |
645 |
2/2✓ Branch 0 taken 21 times.
✓ Branch 1 taken 3 times.
|
24 | if (!fixed_border_type) { |
646 | 3 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
647 | } | ||
648 | |||
649 | 54 | auto callback = [=](size_t y_begin, size_t y_end, | |
650 | kleidicv_filter_context_t *thread_context) { | ||
651 | 66 | return kleidicv_separable_filter_2d_stripe_u8( | |
652 | 33 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
653 | 33 | channels, kernel_x, kernel_width, kernel_y, kernel_height, | |
654 | 33 | *fixed_border_type, thread_context); | |
655 | }; | ||
656 | 42 | return kleidicv_thread_filter(callback, width, height, channels, kernel_width, | |
657 | 21 | kernel_height, context, mt); | |
658 | 81 | } | |
659 | |||
660 | 81 | kleidicv_error_t kleidicv_thread_separable_filter_2d_u16( | |
661 | const uint16_t *src, size_t src_stride, uint16_t *dst, size_t dst_stride, | ||
662 | size_t width, size_t height, size_t channels, const uint16_t *kernel_x, | ||
663 | size_t kernel_width, const uint16_t *kernel_y, size_t kernel_height, | ||
664 | kleidicv_border_type_t border_type, kleidicv_filter_context_t *context, | ||
665 | kleidicv_thread_multithreading mt) { | ||
666 |
4/4✓ Branch 0 taken 57 times.
✓ Branch 1 taken 24 times.
✓ Branch 2 taken 57 times.
✓ Branch 3 taken 24 times.
|
162 | if (!kleidicv::separable_filter_2d_is_implemented(width, height, kernel_width, |
667 | 81 | kernel_height)) { | |
668 | 57 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
669 | } | ||
670 | |||
671 | 24 | auto fixed_border_type = kleidicv::get_fixed_border_type(border_type); | |
672 |
2/2✓ Branch 0 taken 21 times.
✓ Branch 1 taken 3 times.
|
24 | if (!fixed_border_type) { |
673 | 3 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
674 | } | ||
675 | |||
676 | 54 | auto callback = [=](size_t y_begin, size_t y_end, | |
677 | kleidicv_filter_context_t *thread_context) { | ||
678 | 66 | return kleidicv_separable_filter_2d_stripe_u16( | |
679 | 33 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
680 | 33 | channels, kernel_x, kernel_width, kernel_y, kernel_height, | |
681 | 33 | *fixed_border_type, thread_context); | |
682 | }; | ||
683 | 42 | return kleidicv_thread_filter(callback, width, height, channels, kernel_width, | |
684 | 21 | kernel_height, context, mt); | |
685 | 81 | } | |
686 | |||
687 | 81 | kleidicv_error_t kleidicv_thread_separable_filter_2d_s16( | |
688 | const int16_t *src, size_t src_stride, int16_t *dst, size_t dst_stride, | ||
689 | size_t width, size_t height, size_t channels, const int16_t *kernel_x, | ||
690 | size_t kernel_width, const int16_t *kernel_y, size_t kernel_height, | ||
691 | kleidicv_border_type_t border_type, kleidicv_filter_context_t *context, | ||
692 | kleidicv_thread_multithreading mt) { | ||
693 |
4/4✓ Branch 0 taken 57 times.
✓ Branch 1 taken 24 times.
✓ Branch 2 taken 57 times.
✓ Branch 3 taken 24 times.
|
162 | if (!kleidicv::separable_filter_2d_is_implemented(width, height, kernel_width, |
694 | 81 | kernel_height)) { | |
695 | 57 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
696 | } | ||
697 | |||
698 | 24 | auto fixed_border_type = kleidicv::get_fixed_border_type(border_type); | |
699 |
2/2✓ Branch 0 taken 21 times.
✓ Branch 1 taken 3 times.
|
24 | if (!fixed_border_type) { |
700 | 3 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
701 | } | ||
702 | |||
703 | 54 | auto callback = [=](size_t y_begin, size_t y_end, | |
704 | kleidicv_filter_context_t *thread_context) { | ||
705 | 66 | return kleidicv_separable_filter_2d_stripe_s16( | |
706 | 33 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
707 | 33 | channels, kernel_x, kernel_width, kernel_y, kernel_height, | |
708 | 33 | *fixed_border_type, thread_context); | |
709 | }; | ||
710 | 42 | return kleidicv_thread_filter(callback, width, height, channels, kernel_width, | |
711 | 21 | kernel_height, context, mt); | |
712 | 81 | } | |
713 | |||
714 | 81 | kleidicv_error_t kleidicv_thread_blur_and_downsample_u8( | |
715 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
716 | uint8_t *dst, size_t dst_stride, size_t channels, | ||
717 | kleidicv_border_type_t border_type, kleidicv_filter_context_t *context, | ||
718 | kleidicv_thread_multithreading mt) { | ||
719 |
4/4✓ Branch 0 taken 57 times.
✓ Branch 1 taken 24 times.
✓ Branch 2 taken 57 times.
✓ Branch 3 taken 24 times.
|
162 | if (!kleidicv::blur_and_downsample_is_implemented(src_width, src_height, |
720 | 81 | channels)) { | |
721 | 57 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
722 | } | ||
723 | |||
724 | 24 | auto fixed_border_type = kleidicv::get_fixed_border_type(border_type); | |
725 |
2/2✓ Branch 0 taken 21 times.
✓ Branch 1 taken 3 times.
|
24 | if (!fixed_border_type) { |
726 | 3 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
727 | } | ||
728 | |||
729 | 54 | auto callback = [=](unsigned y_begin, unsigned y_end, | |
730 | kleidicv_filter_context_t *thread_context) { | ||
731 | 66 | return kleidicv_blur_and_downsample_stripe_u8( | |
732 | 33 | src, src_stride, src_width, src_height, dst, dst_stride, y_begin, y_end, | |
733 | 33 | channels, *fixed_border_type, thread_context); | |
734 | }; | ||
735 | 42 | return kleidicv_thread_filter(callback, src_width, src_height, channels, 5, 5, | |
736 | 21 | context, mt); | |
737 | 81 | } | |
738 | |||
739 | 153 | kleidicv_error_t kleidicv_thread_sobel_3x3_horizontal_s16_u8( | |
740 | const uint8_t *src, size_t src_stride, int16_t *dst, size_t dst_stride, | ||
741 | size_t width, size_t height, size_t channels, | ||
742 | kleidicv_thread_multithreading mt) { | ||
743 |
2/2✓ Branch 0 taken 69 times.
✓ Branch 1 taken 84 times.
|
153 | if (!kleidicv::sobel_is_implemented(width, height, 3)) { |
744 | 69 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
745 | } | ||
746 | |||
747 | 204 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
748 | 240 | return kleidicv_sobel_3x3_horizontal_stripe_s16_u8( | |
749 | 120 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
750 | 120 | channels); | |
751 | }; | ||
752 | 84 | return parallel_batches(callback, mt, height); | |
753 | 153 | } | |
754 | |||
755 | 399 | kleidicv_error_t kleidicv_thread_median_blur_u8( | |
756 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
757 | size_t width, size_t height, size_t channels, size_t kernel_width, | ||
758 | size_t kernel_height, kleidicv_border_type_t border_type, | ||
759 | kleidicv_thread_multithreading mt) { | ||
760 | 798 | auto result_pair = kleidicv::median_blur_is_implemented( | |
761 | 399 | src, src_stride, dst, dst_stride, width, height, channels, kernel_width, | |
762 | 399 | kernel_height, border_type); | |
763 | |||
764 | 399 | auto checks_result = result_pair.first; | |
765 | 399 | auto fixed_border_type = result_pair.second; | |
766 |
2/2✓ Branch 0 taken 312 times.
✓ Branch 1 taken 87 times.
|
399 | if (checks_result != KLEIDICV_OK) { |
767 | 312 | return checks_result; | |
768 | } | ||
769 | |||
770 |
2/2✓ Branch 0 taken 75 times.
✓ Branch 1 taken 12 times.
|
87 | if (kernel_width <= 7) { |
771 | 189 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
772 | 228 | return kleidicv_median_blur_sorting_network_stripe_u8( | |
773 | 114 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
774 | 114 | channels, kernel_width, kernel_height, fixed_border_type); | |
775 | }; | ||
776 | 75 | return parallel_batches(callback, mt, height); | |
777 | 75 | } | |
778 | |||
779 |
3/4✓ Branch 0 taken 12 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 9 times.
|
12 | if (kernel_width > 7 && kernel_width <= 15) { |
780 | 27 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
781 | 36 | return kleidicv_median_blur_small_hist_stripe_u8( | |
782 | 18 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
783 | 18 | channels, kernel_width, kernel_height, fixed_border_type); | |
784 | }; | ||
785 | 9 | return parallel_batches(callback, mt, height); | |
786 | 9 | } | |
787 | |||
788 | 9 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
789 | 12 | return kleidicv_median_blur_large_hist_stripe_u8( | |
790 | 6 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
791 | 6 | channels, kernel_width, kernel_height, fixed_border_type); | |
792 | }; | ||
793 | 3 | return parallel_batches(callback, mt, height); | |
794 | 399 | } | |
795 | |||
796 | 399 | kleidicv_error_t kleidicv_thread_median_blur_s16( | |
797 | const int16_t *src, size_t src_stride, int16_t *dst, size_t dst_stride, | ||
798 | size_t width, size_t height, size_t channels, size_t kernel_width, | ||
799 | size_t kernel_height, kleidicv_border_type_t border_type, | ||
800 | kleidicv_thread_multithreading mt) { | ||
801 | 798 | auto result_pair = kleidicv::median_blur_is_implemented( | |
802 | 399 | src, src_stride, dst, dst_stride, width, height, channels, kernel_width, | |
803 | 399 | kernel_height, border_type); | |
804 | |||
805 | 399 | auto checks_result = result_pair.first; | |
806 | 399 | auto fixed_border_type = result_pair.second; | |
807 |
2/2✓ Branch 0 taken 324 times.
✓ Branch 1 taken 75 times.
|
399 | if (checks_result != KLEIDICV_OK) { |
808 | 324 | return checks_result; | |
809 | } | ||
810 | |||
811 | 189 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
812 | 228 | return kleidicv_median_blur_sorting_network_stripe_s16( | |
813 | 114 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
814 | 114 | channels, kernel_width, kernel_height, fixed_border_type); | |
815 | }; | ||
816 | 75 | return parallel_batches(callback, mt, height); | |
817 | 399 | } | |
818 | |||
819 | 399 | kleidicv_error_t kleidicv_thread_median_blur_u16( | |
820 | const uint16_t *src, size_t src_stride, uint16_t *dst, size_t dst_stride, | ||
821 | size_t width, size_t height, size_t channels, size_t kernel_width, | ||
822 | size_t kernel_height, kleidicv_border_type_t border_type, | ||
823 | kleidicv_thread_multithreading mt) { | ||
824 | 798 | auto result_pair = kleidicv::median_blur_is_implemented( | |
825 | 399 | src, src_stride, dst, dst_stride, width, height, channels, kernel_width, | |
826 | 399 | kernel_height, border_type); | |
827 | |||
828 | 399 | auto checks_result = result_pair.first; | |
829 | 399 | auto fixed_border_type = result_pair.second; | |
830 |
2/2✓ Branch 0 taken 324 times.
✓ Branch 1 taken 75 times.
|
399 | if (checks_result != KLEIDICV_OK) { |
831 | 324 | return checks_result; | |
832 | } | ||
833 | |||
834 | 189 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
835 | 228 | return kleidicv_median_blur_sorting_network_stripe_u16( | |
836 | 114 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
837 | 114 | channels, kernel_width, kernel_height, fixed_border_type); | |
838 | }; | ||
839 | 75 | return parallel_batches(callback, mt, height); | |
840 | 399 | } | |
841 | |||
842 | 399 | kleidicv_error_t kleidicv_thread_median_blur_f32( | |
843 | const float *src, size_t src_stride, float *dst, size_t dst_stride, | ||
844 | size_t width, size_t height, size_t channels, size_t kernel_width, | ||
845 | size_t kernel_height, kleidicv_border_type_t border_type, | ||
846 | kleidicv_thread_multithreading mt) { | ||
847 | 798 | auto result_pair = kleidicv::median_blur_is_implemented( | |
848 | 399 | src, src_stride, dst, dst_stride, width, height, channels, kernel_width, | |
849 | 399 | kernel_height, border_type); | |
850 | |||
851 | 399 | auto checks_result = result_pair.first; | |
852 | 399 | auto fixed_border_type = result_pair.second; | |
853 |
2/2✓ Branch 0 taken 324 times.
✓ Branch 1 taken 75 times.
|
399 | if (checks_result != KLEIDICV_OK) { |
854 | 324 | return checks_result; | |
855 | } | ||
856 | |||
857 | 189 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
858 | 228 | return kleidicv_median_blur_sorting_network_stripe_f32( | |
859 | 114 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
860 | 114 | channels, kernel_width, kernel_height, fixed_border_type); | |
861 | }; | ||
862 | 75 | return parallel_batches(callback, mt, height); | |
863 | 399 | } | |
864 | |||
865 | 153 | kleidicv_error_t kleidicv_thread_sobel_3x3_vertical_s16_u8( | |
866 | const uint8_t *src, size_t src_stride, int16_t *dst, size_t dst_stride, | ||
867 | size_t width, size_t height, size_t channels, | ||
868 | kleidicv_thread_multithreading mt) { | ||
869 |
2/2✓ Branch 0 taken 69 times.
✓ Branch 1 taken 84 times.
|
153 | if (!kleidicv::sobel_is_implemented(width, height, 3)) { |
870 | 69 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
871 | } | ||
872 | |||
873 | 204 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
874 | 240 | return kleidicv_sobel_3x3_vertical_stripe_s16_u8(src, src_stride, dst, | |
875 | 120 | dst_stride, width, height, | |
876 | 120 | y_begin, y_end, channels); | |
877 | }; | ||
878 | 84 | return parallel_batches(callback, mt, height); | |
879 | 153 | } | |
880 | |||
881 | 78 | kleidicv_error_t kleidicv_thread_scharr_interleaved_s16_u8( | |
882 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
883 | size_t src_channels, int16_t *dst, size_t dst_stride, | ||
884 | kleidicv_thread_multithreading mt) { | ||
885 |
4/4✓ Branch 0 taken 3 times.
✓ Branch 1 taken 75 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 75 times.
|
156 | if (!kleidicv::scharr_interleaved_is_implemented(src_width, src_height, |
886 | 78 | src_channels)) { | |
887 | 3 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
888 | } | ||
889 | |||
890 | 183 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
891 | 216 | return kleidicv_scharr_interleaved_stripe_s16_u8( | |
892 | 108 | src, src_stride, src_width, src_height, src_channels, dst, dst_stride, | |
893 | 108 | y_begin, y_end); | |
894 | }; | ||
895 | |||
896 | // height is decremented by 2 as the result has less rows. | ||
897 | 75 | return parallel_batches(callback, mt, src_height - 2); | |
898 | 78 | } | |
899 | |||
900 | 96 | kleidicv_error_t kleidicv_thread_resize_to_quarter_u8( | |
901 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
902 | uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
903 | kleidicv_thread_multithreading mt) { | ||
904 | 192 | auto callback = [=](unsigned begin, unsigned end) { | |
905 | 96 | size_t src_begin = size_t{begin} * 2; | |
906 | 96 | size_t src_end = std::min<size_t>(src_height, size_t{end} * 2); | |
907 | 96 | size_t dst_begin = begin; | |
908 | 96 | size_t dst_end = std::min<size_t>(dst_height, end); | |
909 | |||
910 | // half of odd height is rounded towards zero? | ||
911 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 88 times.
|
96 | if (dst_begin == dst_end) { |
912 | 8 | return KLEIDICV_OK; | |
913 | } | ||
914 | |||
915 | 176 | return kleidicv_resize_to_quarter_u8( | |
916 | 88 | src + src_begin * src_stride, src_stride, src_width, | |
917 | 88 | src_end - src_begin, dst + dst_begin * dst_stride, dst_stride, | |
918 | 88 | dst_width, dst_end - dst_begin); | |
919 | 96 | }; | |
920 | 192 | return parallel_batches(callback, mt, (src_height + 1) / 2); | |
921 | 96 | } | |
922 | |||
923 | 100 | kleidicv_error_t kleidicv_thread_resize_linear_u8( | |
924 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
925 | uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
926 | kleidicv_thread_multithreading mt) { | ||
927 |
4/4✓ Branch 0 taken 4 times.
✓ Branch 1 taken 96 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 96 times.
|
200 | if (!kleidicv::resize_linear_u8_is_implemented(src_width, src_height, |
928 | 100 | dst_width, dst_height)) { | |
929 | 4 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
930 | } | ||
931 | 200 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
932 | 208 | return kleidicv_resize_linear_stripe_u8( | |
933 | 104 | src, src_stride, src_width, src_height, y_begin, | |
934 | 104 | std::min<size_t>(src_height, y_end + 1), dst, dst_stride, dst_width, | |
935 | 104 | dst_height); | |
936 | }; | ||
937 | 96 | return parallel_batches(callback, mt, std::max<size_t>(1, src_height - 1)); | |
938 | 100 | } | |
939 | |||
940 | 148 | kleidicv_error_t kleidicv_thread_resize_linear_f32( | |
941 | const float *src, size_t src_stride, size_t src_width, size_t src_height, | ||
942 | float *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
943 | kleidicv_thread_multithreading mt) { | ||
944 |
4/4✓ Branch 0 taken 4 times.
✓ Branch 1 taken 144 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 144 times.
|
296 | if (!kleidicv::resize_linear_f32_is_implemented(src_width, src_height, |
945 | 148 | dst_width, dst_height)) { | |
946 | 4 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
947 | } | ||
948 | 300 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
949 | 312 | return kleidicv_resize_linear_stripe_f32( | |
950 | 156 | src, src_stride, src_width, src_height, y_begin, | |
951 | 156 | std::min<size_t>(src_height, y_end + 1), dst, dst_stride, dst_width, | |
952 | 156 | dst_height); | |
953 | }; | ||
954 | 144 | return parallel_batches(callback, mt, std::max<size_t>(1, src_height - 1)); | |
955 | 148 | } | |
956 | |||
957 | 156 | kleidicv_error_t kleidicv_thread_remap_s16_u8( | |
958 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
959 | uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
960 | size_t channels, const int16_t *mapxy, size_t mapxy_stride, | ||
961 | kleidicv_border_type_t border_type, const uint8_t *border_value, | ||
962 | kleidicv_thread_multithreading mt) { | ||
963 |
4/4✓ Branch 0 taken 6 times.
✓ Branch 1 taken 150 times.
✓ Branch 2 taken 6 times.
✓ Branch 3 taken 150 times.
|
312 | if (!kleidicv::remap_s16_is_implemented<uint8_t>(src_stride, src_width, |
964 | 156 | src_height, dst_width, | |
965 | 156 | border_type, channels)) { | |
966 | 6 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
967 | } | ||
968 | 366 | auto callback = [=](unsigned begin, unsigned end) { | |
969 | 432 | return kleidicv_remap_s16_u8( | |
970 | 216 | src, src_stride, src_width, src_height, | |
971 | 216 | dst + begin * dst_stride / sizeof(uint8_t), dst_stride, dst_width, | |
972 | 216 | end - begin, channels, | |
973 | 216 | mapxy + static_cast<ptrdiff_t>(begin * mapxy_stride / sizeof(int16_t)), | |
974 | 216 | mapxy_stride, border_type, border_value); | |
975 | }; | ||
976 | 150 | return parallel_batches(callback, mt, dst_height); | |
977 | 156 | } | |
978 | |||
979 | 156 | kleidicv_error_t kleidicv_thread_remap_s16_u16( | |
980 | const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
981 | uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
982 | size_t channels, const int16_t *mapxy, size_t mapxy_stride, | ||
983 | kleidicv_border_type_t border_type, const uint16_t *border_value, | ||
984 | kleidicv_thread_multithreading mt) { | ||
985 |
4/4✓ Branch 0 taken 6 times.
✓ Branch 1 taken 150 times.
✓ Branch 2 taken 6 times.
✓ Branch 3 taken 150 times.
|
312 | if (!kleidicv::remap_s16_is_implemented<uint16_t>(src_stride, src_width, |
986 | 156 | src_height, dst_width, | |
987 | 156 | border_type, channels)) { | |
988 | 6 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
989 | } | ||
990 | 366 | auto callback = [=](unsigned begin, unsigned end) { | |
991 | 432 | return kleidicv_remap_s16_u16( | |
992 | 216 | src, src_stride, src_width, src_height, | |
993 | 216 | dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint16_t)), | |
994 | 216 | dst_stride, dst_width, end - begin, channels, | |
995 | 216 | mapxy + static_cast<ptrdiff_t>(begin * mapxy_stride / sizeof(int16_t)), | |
996 | 216 | mapxy_stride, border_type, border_value); | |
997 | }; | ||
998 | 150 | return parallel_batches(callback, mt, dst_height); | |
999 | 156 | } | |
1000 | |||
1001 | 306 | kleidicv_error_t kleidicv_thread_remap_s16point5_u8( | |
1002 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
1003 | uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
1004 | size_t channels, const int16_t *mapxy, size_t mapxy_stride, | ||
1005 | const uint16_t *mapfrac, size_t mapfrac_stride, | ||
1006 | kleidicv_border_type_t border_type, const uint8_t *border_value, | ||
1007 | kleidicv_thread_multithreading mt) { | ||
1008 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 300 times.
|
306 | if (!kleidicv::remap_s16point5_is_implemented<uint8_t>( |
1009 | 306 | src_stride, src_width, src_height, dst_width, border_type, | |
1010 | 306 | channels)) { | |
1011 | 6 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
1012 | } | ||
1013 | 732 | auto callback = [=](unsigned begin, unsigned end) { | |
1014 | 864 | return kleidicv_remap_s16point5_u8( | |
1015 | 432 | src, src_stride, src_width, src_height, | |
1016 | 432 | dst + begin * dst_stride / sizeof(uint8_t), dst_stride, dst_width, | |
1017 | 432 | end - begin, channels, | |
1018 | 432 | mapxy + static_cast<ptrdiff_t>(begin * mapxy_stride / sizeof(int16_t)), | |
1019 | 432 | mapxy_stride, | |
1020 | 864 | mapfrac + | |
1021 | 432 | static_cast<ptrdiff_t>(begin * mapfrac_stride / sizeof(uint16_t)), | |
1022 | 432 | mapfrac_stride, border_type, border_value); | |
1023 | }; | ||
1024 | 300 | return parallel_batches(callback, mt, dst_height); | |
1025 | 306 | } | |
1026 | |||
1027 | 306 | kleidicv_error_t kleidicv_thread_remap_s16point5_u16( | |
1028 | const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
1029 | uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
1030 | size_t channels, const int16_t *mapxy, size_t mapxy_stride, | ||
1031 | const uint16_t *mapfrac, size_t mapfrac_stride, | ||
1032 | kleidicv_border_type_t border_type, const uint16_t *border_value, | ||
1033 | kleidicv_thread_multithreading mt) { | ||
1034 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 300 times.
|
306 | if (!kleidicv::remap_s16point5_is_implemented<uint16_t>( |
1035 | 306 | src_stride, src_width, src_height, dst_width, border_type, | |
1036 | 306 | channels)) { | |
1037 | 6 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
1038 | } | ||
1039 | 732 | auto callback = [=](unsigned begin, unsigned end) { | |
1040 | 864 | return kleidicv_remap_s16point5_u16( | |
1041 | 432 | src, src_stride, src_width, src_height, | |
1042 | 432 | dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint16_t)), | |
1043 | 432 | dst_stride, dst_width, end - begin, channels, | |
1044 | 432 | mapxy + static_cast<ptrdiff_t>(begin * mapxy_stride / sizeof(int16_t)), | |
1045 | 432 | mapxy_stride, | |
1046 | 864 | mapfrac + | |
1047 | 432 | static_cast<ptrdiff_t>(begin * mapfrac_stride / sizeof(uint16_t)), | |
1048 | 432 | mapfrac_stride, border_type, border_value); | |
1049 | }; | ||
1050 | 300 | return parallel_batches(callback, mt, dst_height); | |
1051 | 306 | } | |
1052 | |||
1053 | 306 | kleidicv_error_t kleidicv_thread_remap_f32_u8( | |
1054 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
1055 | uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
1056 | size_t channels, const float *mapx, size_t mapx_stride, const float *mapy, | ||
1057 | size_t mapy_stride, kleidicv_interpolation_type_t interpolation, | ||
1058 | kleidicv_border_type_t border_type, const uint8_t *border_value, | ||
1059 | kleidicv_thread_multithreading mt) { | ||
1060 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 300 times.
|
306 | if (!kleidicv::remap_f32_is_implemented<uint8_t>( |
1061 | 306 | src_stride, src_width, src_height, dst_width, dst_height, border_type, | |
1062 | 306 | channels, interpolation)) { | |
1063 | 6 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
1064 | } | ||
1065 | 732 | auto callback = [=](unsigned begin, unsigned end) { | |
1066 | 864 | return kleidicv_remap_f32_u8( | |
1067 | 432 | src, src_stride, src_width, src_height, | |
1068 | 432 | dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint8_t)), | |
1069 | 432 | dst_stride, dst_width, end - begin, channels, | |
1070 | 432 | mapx + static_cast<ptrdiff_t>(begin * mapx_stride / sizeof(float)), | |
1071 | 432 | mapx_stride, | |
1072 | 432 | mapy + static_cast<ptrdiff_t>(begin * mapy_stride / sizeof(float)), | |
1073 | 432 | mapy_stride, interpolation, border_type, border_value); | |
1074 | }; | ||
1075 | 300 | return parallel_batches(callback, mt, dst_height); | |
1076 | 306 | } | |
1077 | |||
1078 | 306 | kleidicv_error_t kleidicv_thread_remap_f32_u16( | |
1079 | const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
1080 | uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
1081 | size_t channels, const float *mapx, size_t mapx_stride, const float *mapy, | ||
1082 | size_t mapy_stride, kleidicv_interpolation_type_t interpolation, | ||
1083 | kleidicv_border_type_t border_type, const uint16_t *border_value, | ||
1084 | kleidicv_thread_multithreading mt) { | ||
1085 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 300 times.
|
306 | if (!kleidicv::remap_f32_is_implemented<uint16_t>( |
1086 | 306 | src_stride, src_width, src_height, dst_width, dst_height, border_type, | |
1087 | 306 | channels, interpolation)) { | |
1088 | 6 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
1089 | } | ||
1090 | 732 | auto callback = [=](unsigned begin, unsigned end) { | |
1091 | 864 | return kleidicv_remap_f32_u16( | |
1092 | 432 | src, src_stride, src_width, src_height, | |
1093 | 432 | dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint16_t)), | |
1094 | 432 | dst_stride, dst_width, end - begin, channels, | |
1095 | 432 | mapx + static_cast<ptrdiff_t>(begin * mapx_stride / sizeof(float)), | |
1096 | 432 | mapx_stride, | |
1097 | 432 | mapy + static_cast<ptrdiff_t>(begin * mapy_stride / sizeof(float)), | |
1098 | 432 | mapy_stride, interpolation, border_type, border_value); | |
1099 | }; | ||
1100 | 300 | return parallel_batches(callback, mt, dst_height); | |
1101 | 306 | } | |
1102 | |||
1103 | 162 | kleidicv_error_t kleidicv_thread_warp_perspective_u8( | |
1104 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
1105 | uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
1106 | const float transformation[9], size_t channels, | ||
1107 | kleidicv_interpolation_type_t interpolation, | ||
1108 | kleidicv_border_type_t border_type, const uint8_t *border_value, | ||
1109 | kleidicv_thread_multithreading mt) { | ||
1110 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 150 times.
|
162 | if (!kleidicv::warp_perspective_is_implemented<uint8_t>( |
1111 | 162 | dst_width, channels, interpolation, border_type)) { | |
1112 | 12 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
1113 | } | ||
1114 | |||
1115 | 366 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
1116 | 432 | return kleidicv_warp_perspective_stripe_u8( | |
1117 | 216 | src, src_stride, src_width, src_height, dst, dst_stride, dst_width, | |
1118 | 216 | dst_height, y_begin, std::min<size_t>(dst_height, y_end + 1), | |
1119 | 216 | transformation, channels, interpolation, border_type, border_value); | |
1120 | }; | ||
1121 | 150 | return parallel_batches(callback, mt, dst_height); | |
1122 | 162 | } | |
1123 |