| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include "kleidicv_thread/kleidicv_thread.h" | ||
| 6 | |||
| 7 | #include <algorithm> | ||
| 8 | #include <cstddef> | ||
| 9 | #include <cstdint> | ||
| 10 | #include <functional> | ||
| 11 | #include <limits> | ||
| 12 | #include <vector> | ||
| 13 | |||
| 14 | #include "kleidicv/arithmetics/rotate.h" | ||
| 15 | #include "kleidicv/arithmetics/scale.h" | ||
| 16 | #include "kleidicv/conversions/rgb_to_yuv_420.h" | ||
| 17 | #include "kleidicv/conversions/yuv_420_to_rgb.h" | ||
| 18 | #include "kleidicv/ctypes.h" | ||
| 19 | #include "kleidicv/filters/blur_and_downsample.h" | ||
| 20 | #include "kleidicv/filters/gaussian_blur.h" | ||
| 21 | #include "kleidicv/filters/median_blur.h" | ||
| 22 | #include "kleidicv/filters/scharr.h" | ||
| 23 | #include "kleidicv/filters/separable_filter_2d.h" | ||
| 24 | #include "kleidicv/filters/sobel.h" | ||
| 25 | #include "kleidicv/kleidicv.h" | ||
| 26 | #include "kleidicv/resize/resize_linear.h" | ||
| 27 | #include "kleidicv/transform/remap.h" | ||
| 28 | #include "kleidicv/transform/warp_perspective.h" | ||
| 29 | |||
| 30 | typedef std::function<kleidicv_error_t(unsigned, unsigned)> FunctionCallback; | ||
| 31 | |||
| 32 | 16133 | static kleidicv_error_t kleidicv_thread_std_function_callback( | |
| 33 | unsigned task_begin, unsigned task_end, void *data) { | ||
| 34 | 16133 | auto *callback = reinterpret_cast<FunctionCallback *>(data); | |
| 35 | 32266 | return (*callback)(task_begin, task_end); | |
| 36 | 16133 | } | |
| 37 | |||
| 38 | // Operations in the Neon backend have both a vector path and a scalar path. | ||
| 39 | // The vector path is used to process most data and the scalar path is used to | ||
| 40 | // process the parts of the data that don't fit into the vector width. | ||
| 41 | // For floating point operations in particular, the results may be very slightly | ||
| 42 | // different between vector and scalar paths. | ||
| 43 | // When using multithreading, images are divided into parts to be processed by | ||
| 44 | // each thread, and this could change which parts of the data end up being | ||
| 45 | // processed by the vector and scalar paths. Since the threading may be | ||
| 46 | // non-deterministic in how it divides up the image, this non-determinism could | ||
| 47 | // leak through in the values of the output. This could cause subtle bugs. | ||
| 48 | // | ||
| 49 | // To avoid this problem, this function passes data to each thread in batches | ||
| 50 | // that are a multiple of the Neon vector width in size (16 bytes). The | ||
| 51 | // exception is the last batch, which may be longer in order to extend to the | ||
| 52 | // end of the data. No batch can be shorter than vector length as this could | ||
| 53 | // cause different behaviour for operations that try to avoid the tail loop (see | ||
| 54 | // the TryToAvoidTailLoop class) - this technique only works if the data is | ||
| 55 | // longer than vector length. | ||
| 56 | // | ||
| 57 | // Typically with how this function is used, batches will be 16 image rows or | ||
| 58 | // row pairs, which is likely to be far coarser alignment than is needed. | ||
| 59 | // However it's unlikely that threading on a finer-grained level would provide a | ||
| 60 | // performance benefit. | ||
| 61 | template <typename Callback> | ||
| 62 | 11904 | inline kleidicv_error_t parallel_batches(Callback callback, | |
| 63 | kleidicv_thread_multithreading mt, | ||
| 64 | unsigned count, | ||
| 65 | unsigned min_batch_size = 16) { | ||
| 66 | 11904 | const unsigned task_count = std::max(1U, (count) / min_batch_size); | |
| 67 | 28037 | FunctionCallback f = [=](unsigned task_begin, unsigned task_end) { | |
| 68 | 16133 | unsigned begin = task_begin * min_batch_size, | |
| 69 | 16133 | end = task_end * min_batch_size; | |
| 70 |
140/142✓ Branch 0 taken 792 times.
✓ Branch 1 taken 1800 times.
✓ Branch 2 taken 48 times.
✓ Branch 3 taken 120 times.
✓ Branch 4 taken 44 times.
✓ Branch 5 taken 100 times.
✓ Branch 6 taken 44 times.
✓ Branch 7 taken 100 times.
✓ Branch 8 taken 44 times.
✓ Branch 9 taken 100 times.
✓ Branch 10 taken 44 times.
✓ Branch 11 taken 100 times.
✓ Branch 12 taken 88 times.
✓ Branch 13 taken 200 times.
✓ Branch 14 taken 44 times.
✓ Branch 15 taken 100 times.
✓ Branch 16 taken 44 times.
✓ Branch 17 taken 112 times.
✓ Branch 18 taken 48 times.
✓ Branch 19 taken 120 times.
✓ Branch 20 taken 44 times.
✓ Branch 21 taken 100 times.
✓ Branch 22 taken 132 times.
✓ Branch 23 taken 300 times.
✓ Branch 24 taken 264 times.
✓ Branch 25 taken 600 times.
✓ Branch 26 taken 132 times.
✓ Branch 27 taken 300 times.
✓ Branch 28 taken 132 times.
✓ Branch 29 taken 300 times.
✓ Branch 30 taken 132 times.
✓ Branch 31 taken 300 times.
✓ Branch 32 taken 88 times.
✓ Branch 33 taken 200 times.
✓ Branch 34 taken 88 times.
✓ Branch 35 taken 200 times.
✓ Branch 36 taken 88 times.
✓ Branch 37 taken 200 times.
✓ Branch 38 taken 44 times.
✓ Branch 39 taken 100 times.
✓ Branch 40 taken 44 times.
✓ Branch 41 taken 100 times.
✓ Branch 42 taken 44 times.
✓ Branch 43 taken 100 times.
✓ Branch 44 taken 44 times.
✓ Branch 45 taken 100 times.
✓ Branch 46 taken 44 times.
✓ Branch 47 taken 100 times.
✓ Branch 48 taken 44 times.
✓ Branch 49 taken 100 times.
✓ Branch 50 taken 96 times.
✓ Branch 51 taken 160 times.
✓ Branch 52 taken 5 times.
✓ Branch 53 taken 135 times.
✓ Branch 54 taken 5 times.
✓ Branch 55 taken 135 times.
✓ Branch 56 taken 5 times.
✓ Branch 57 taken 135 times.
✓ Branch 58 taken 5 times.
✓ Branch 59 taken 135 times.
✓ Branch 60 taken 5 times.
✓ Branch 61 taken 70 times.
✓ Branch 62 taken 5 times.
✓ Branch 63 taken 70 times.
✓ Branch 64 taken 5 times.
✓ Branch 65 taken 70 times.
✓ Branch 66 taken 5 times.
✓ Branch 67 taken 70 times.
✓ Branch 68 taken 5 times.
✓ Branch 69 taken 70 times.
✓ Branch 70 taken 5 times.
✓ Branch 71 taken 70 times.
✓ Branch 72 taken 5 times.
✓ Branch 73 taken 70 times.
✓ Branch 74 taken 5 times.
✓ Branch 75 taken 70 times.
✗ Branch 76 not taken.
✓ Branch 77 taken 760 times.
✓ Branch 78 taken 4 times.
✓ Branch 79 taken 60 times.
✓ Branch 80 taken 4 times.
✓ Branch 81 taken 60 times.
✓ Branch 82 taken 4 times.
✓ Branch 83 taken 60 times.
✓ Branch 84 taken 4 times.
✓ Branch 85 taken 60 times.
✓ Branch 86 taken 4 times.
✓ Branch 87 taken 60 times.
✓ Branch 88 taken 4 times.
✓ Branch 89 taken 68 times.
✓ Branch 90 taken 4 times.
✓ Branch 91 taken 60 times.
✓ Branch 92 taken 16 times.
✓ Branch 93 taken 28 times.
✓ Branch 94 taken 4 times.
✓ Branch 95 taken 4 times.
✓ Branch 96 taken 16 times.
✓ Branch 97 taken 28 times.
✓ Branch 98 taken 16 times.
✓ Branch 99 taken 28 times.
✓ Branch 100 taken 16 times.
✓ Branch 101 taken 28 times.
✓ Branch 102 taken 16 times.
✓ Branch 103 taken 28 times.
✓ Branch 104 taken 48 times.
✓ Branch 105 taken 112 times.
✓ Branch 106 taken 52 times.
✓ Branch 107 taken 100 times.
✓ Branch 108 taken 12 times.
✓ Branch 109 taken 12 times.
✓ Branch 110 taken 4 times.
✓ Branch 111 taken 4 times.
✓ Branch 112 taken 52 times.
✓ Branch 113 taken 100 times.
✓ Branch 114 taken 52 times.
✓ Branch 115 taken 100 times.
✓ Branch 116 taken 52 times.
✓ Branch 117 taken 100 times.
✓ Branch 118 taken 48 times.
✓ Branch 119 taken 112 times.
✓ Branch 120 taken 44 times.
✓ Branch 121 taken 100 times.
✗ Branch 122 not taken.
✓ Branch 123 taken 120 times.
✓ Branch 124 taken 10 times.
✓ Branch 125 taken 120 times.
✓ Branch 126 taken 15 times.
✓ Branch 127 taken 180 times.
✓ Branch 128 taken 88 times.
✓ Branch 129 taken 200 times.
✓ Branch 130 taken 88 times.
✓ Branch 131 taken 200 times.
✓ Branch 132 taken 176 times.
✓ Branch 133 taken 400 times.
✓ Branch 134 taken 176 times.
✓ Branch 135 taken 400 times.
✓ Branch 136 taken 176 times.
✓ Branch 137 taken 400 times.
✓ Branch 138 taken 176 times.
✓ Branch 139 taken 400 times.
✓ Branch 140 taken 88 times.
✓ Branch 141 taken 200 times.
|
16133 | if (task_end == task_count) { |
| 71 | 11904 | end = count; | |
| 72 | 11904 | } | |
| 73 | 32266 | return callback(begin, end); | |
| 74 | 16133 | }; | |
| 75 | 23808 | return mt.parallel(kleidicv_thread_std_function_callback, &f, | |
| 76 | 11904 | mt.parallel_data, task_count); | |
| 77 | 11904 | } | |
| 78 | |||
| 79 | template <typename SrcT, typename DstT, typename F, typename... Args> | ||
| 80 | 2952 | inline kleidicv_error_t kleidicv_thread_unary_op_impl( | |
| 81 | F f, kleidicv_thread_multithreading mt, const SrcT *src, size_t src_stride, | ||
| 82 | DstT *dst, size_t dst_stride, size_t width, size_t height, Args... args) { | ||
| 83 | 7188 | auto callback = [=](unsigned begin, unsigned end) { | |
| 84 | 8472 | return f(src + static_cast<ptrdiff_t>(begin * src_stride / sizeof(SrcT)), | |
| 85 | 4236 | src_stride, | |
| 86 | 4236 | dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(DstT)), | |
| 87 | 4236 | dst_stride, width, end - begin, args...); | |
| 88 | }; | ||
| 89 | 5904 | return parallel_batches(callback, mt, height); | |
| 90 | 2952 | } | |
| 91 | |||
| 92 | template <typename SrcT, typename DstT, typename F, typename... Args> | ||
| 93 | 3000 | inline kleidicv_error_t kleidicv_thread_binary_op_impl( | |
| 94 | F f, kleidicv_thread_multithreading mt, const SrcT *src_a, | ||
| 95 | size_t src_a_stride, const SrcT *src_b, size_t src_b_stride, DstT *dst, | ||
| 96 | size_t dst_stride, size_t width, size_t height, Args... args) { | ||
| 97 | 7320 | auto callback = [=](unsigned begin, unsigned end) { | |
| 98 | 8640 | return f( | |
| 99 | 4320 | src_a + static_cast<ptrdiff_t>(begin * src_a_stride / sizeof(SrcT)), | |
| 100 | 4320 | src_a_stride, | |
| 101 | 4320 | src_b + static_cast<ptrdiff_t>(begin * src_b_stride / sizeof(SrcT)), | |
| 102 | 4320 | src_b_stride, | |
| 103 | 4320 | dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(DstT)), | |
| 104 | 4320 | dst_stride, width, end - begin, args...); | |
| 105 | }; | ||
| 106 | 6000 | return parallel_batches(callback, mt, height); | |
| 107 | 3000 | } | |
| 108 | |||
// Defines kleidicv_thread_<suffix>, the multithreaded entry point for the
// unary operation kleidicv_<suffix>. The generated function forwards its
// arguments to kleidicv_thread_unary_op_impl.
#define KLEIDICV_THREAD_UNARY_OP_IMPL(suffix, src_type, dst_type)     \
  kleidicv_error_t kleidicv_thread_##suffix(                          \
      const src_type *src, size_t src_stride, dst_type *dst,          \
      size_t dst_stride, size_t width, size_t height,                 \
      kleidicv_thread_multithreading mt) {                            \
    return kleidicv_thread_unary_op_impl(kleidicv_##suffix, mt, src,  \
                                         src_stride, dst, dst_stride, \
                                         width, height);              \
  }
| 118 | |||
| 119 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(gray_to_rgb_u8, uint8_t, uint8_t); | |
| 120 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(gray_to_rgba_u8, uint8_t, uint8_t); | |
| 121 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_bgr_u8, uint8_t, uint8_t); | |
| 122 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_rgb_u8, uint8_t, uint8_t); | |
| 123 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_bgra_u8, uint8_t, uint8_t); | |
| 124 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_rgba_u8, uint8_t, uint8_t); | |
| 125 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_bgra_u8, uint8_t, uint8_t); | |
| 126 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_rgba_u8, uint8_t, uint8_t); | |
| 127 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_bgr_u8, uint8_t, uint8_t); | |
| 128 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_rgb_u8, uint8_t, uint8_t); | |
| 129 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(yuv_to_bgr_u8, uint8_t, uint8_t); | |
| 130 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(yuv_to_bgra_u8, uint8_t, uint8_t); | |
| 131 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(yuv_to_rgb_u8, uint8_t, uint8_t); | |
| 132 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(yuv_to_rgba_u8, uint8_t, uint8_t); | |
| 133 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(bgr_to_yuv_u8, uint8_t, uint8_t); | |
| 134 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(rgb_to_yuv_u8, uint8_t, uint8_t); | |
| 135 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(bgra_to_yuv_u8, uint8_t, uint8_t); | |
| 136 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(rgba_to_yuv_u8, uint8_t, uint8_t); | |
| 137 | 120 | KLEIDICV_THREAD_UNARY_OP_IMPL(exp_f32, float, float); | |
| 138 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(f32_to_s8, float, int8_t); | |
| 139 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(f32_to_u8, float, uint8_t); | |
| 140 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(s8_to_f32, int8_t, float); | |
| 141 | 100 | KLEIDICV_THREAD_UNARY_OP_IMPL(u8_to_f32, uint8_t, float); | |
| 142 | |||
// Defines kleidicv_thread_<suffix> for an in-range style operation: a unary
// operation that additionally takes `lower_bound` and `upper_bound` of the
// source element type. Both bounds are forwarded to kleidicv_<suffix> through
// kleidicv_thread_unary_op_impl.
#define KLEIDICV_THREAD_INRANGE_OP_IMPL(suffix, src_type, dst_type)   \
  kleidicv_error_t kleidicv_thread_##suffix(                          \
      const src_type *src, size_t src_stride, dst_type *dst,          \
      size_t dst_stride, size_t width, size_t height,                 \
      src_type lower_bound, src_type upper_bound,                     \
      kleidicv_thread_multithreading mt) {                            \
    return kleidicv_thread_unary_op_impl(                             \
        kleidicv_##suffix, mt, src, src_stride, dst, dst_stride,      \
        width, height, lower_bound, upper_bound);                     \
  }
| 152 | |||
| 153 | 100 | KLEIDICV_THREAD_INRANGE_OP_IMPL(in_range_u8, uint8_t, uint8_t); | |
| 154 | 100 | KLEIDICV_THREAD_INRANGE_OP_IMPL(in_range_f32, float, uint8_t); | |
| 155 | |||
| 156 | 100 | kleidicv_error_t kleidicv_thread_threshold_binary_u8( | |
| 157 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 158 | size_t width, size_t height, uint8_t threshold, uint8_t value, | ||
| 159 | kleidicv_thread_multithreading mt) { | ||
| 160 | 200 | return kleidicv_thread_unary_op_impl(kleidicv_threshold_binary_u8, mt, src, | |
| 161 | 100 | src_stride, dst, dst_stride, width, | |
| 162 | 100 | height, threshold, value); | |
| 163 | } | ||
| 164 | |||
| 165 | 124 | kleidicv_error_t kleidicv_thread_scale_u8(const uint8_t *src, size_t src_stride, | |
| 166 | uint8_t *dst, size_t dst_stride, | ||
| 167 | size_t width, size_t height, | ||
| 168 | double scale, double shift, | ||
| 169 | kleidicv_thread_multithreading mt) { | ||
| 170 |
4/4✓ Branch 0 taken 4 times.
✓ Branch 1 taken 120 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 120 times.
|
124 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
| 171 |
4/4✓ Branch 0 taken 4 times.
✓ Branch 1 taken 116 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 116 times.
|
120 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 172 |
5/6✗ Branch 0 not taken.
✓ Branch 1 taken 116 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 112 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 112 times.
|
116 | CHECK_IMAGE_SIZE(width, height); |
| 173 | |||
| 174 | 112 | const std::array<uint8_t, 256> precalculated_table = | |
| 175 | 112 | kleidicv::neon::precalculate_scale_table_u8(scale, shift); | |
| 176 | 112 | return kleidicv_thread_unary_op_impl( | |
| 177 | 112 | kleidicv::neon::scale_with_precalculated_table_u8, mt, src, src_stride, | |
| 178 | 112 | dst, dst_stride, width, height, scale, shift, precalculated_table); | |
| 179 | 124 | } | |
| 180 | |||
| 181 | 120 | kleidicv_error_t kleidicv_thread_scale_f32(const float *src, size_t src_stride, | |
| 182 | float *dst, size_t dst_stride, | ||
| 183 | size_t width, size_t height, | ||
| 184 | double scale, double shift, | ||
| 185 | kleidicv_thread_multithreading mt) { | ||
| 186 | 240 | return kleidicv_thread_unary_op_impl(kleidicv_scale_f32, mt, src, src_stride, | |
| 187 | 120 | dst, dst_stride, width, height, scale, | |
| 188 | 120 | shift); | |
| 189 | } | ||
| 190 | |||
| 191 | 100 | kleidicv_error_t kleidicv_thread_scale_u8_f16( | |
| 192 | const uint8_t *src, size_t src_stride, float16_t *dst, size_t dst_stride, | ||
| 193 | size_t width, size_t height, double scale, double shift, | ||
| 194 | kleidicv_thread_multithreading mt) { | ||
| 195 | 200 | return kleidicv_thread_unary_op_impl(kleidicv_scale_u8_f16, mt, src, | |
| 196 | 100 | src_stride, dst, dst_stride, width, | |
| 197 | 100 | height, scale, shift); | |
| 198 | } | ||
| 199 | |||
// Defines kleidicv_thread_<suffix>, the multithreaded entry point for the
// binary operation kleidicv_<suffix>, where both sources and the destination
// share the element type `type`. The generated function forwards its
// arguments to kleidicv_thread_binary_op_impl.
#define KLEIDICV_THREAD_BINARY_OP_IMPL(suffix, type)                  \
  kleidicv_error_t kleidicv_thread_##suffix(                          \
      const type *src_a, size_t src_a_stride, const type *src_b,      \
      size_t src_b_stride, type *dst, size_t dst_stride,              \
      size_t width, size_t height,                                    \
      kleidicv_thread_multithreading mt) {                            \
    return kleidicv_thread_binary_op_impl(                            \
        kleidicv_##suffix, mt, src_a, src_a_stride, src_b,            \
        src_b_stride, dst, dst_stride, width, height);                \
  }
| 209 | |||
// Same as a plain binary-operation wrapper, but the generated
// kleidicv_thread_<suffix> takes an extra `scale` argument of type
// `scaletype`, which is forwarded to kleidicv_<suffix> through
// kleidicv_thread_binary_op_impl.
#define KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(suffix, type, scaletype) \
  kleidicv_error_t kleidicv_thread_##suffix(                          \
      const type *src_a, size_t src_a_stride, const type *src_b,      \
      size_t src_b_stride, type *dst, size_t dst_stride,              \
      size_t width, size_t height, scaletype scale,                   \
      kleidicv_thread_multithreading mt) {                            \
    return kleidicv_thread_binary_op_impl(                            \
        kleidicv_##suffix, mt, src_a, src_a_stride, src_b,            \
        src_b_stride, dst, dst_stride, width, height, scale);         \
  }
| 219 | |||
| 220 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s8, int8_t); | |
| 221 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u8, uint8_t); | |
| 222 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s16, int16_t); | |
| 223 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u16, uint16_t); | |
| 224 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s32, int32_t); | |
| 225 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u32, uint32_t); | |
| 226 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_s64, int64_t); | |
| 227 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_add_u64, uint64_t); | |
| 228 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s8, int8_t); | |
| 229 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u8, uint8_t); | |
| 230 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s16, int16_t); | |
| 231 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u16, uint16_t); | |
| 232 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s32, int32_t); | |
| 233 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u32, uint32_t); | |
| 234 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_s64, int64_t); | |
| 235 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_sub_u64, uint64_t); | |
| 236 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_u8, uint8_t); | |
| 237 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s8, int8_t); | |
| 238 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_u16, uint16_t); | |
| 239 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s16, int16_t); | |
| 240 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(saturating_absdiff_s32, int32_t); | |
| 241 | 100 | KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_u8, uint8_t, double); | |
| 242 | 100 | KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s8, int8_t, double); | |
| 243 | 100 | KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_u16, uint16_t, double); | |
| 244 | 100 | KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s16, int16_t, double); | |
| 245 | 100 | KLEIDICV_THREAD_BINARY_OP_SCALE_IMPL(saturating_multiply_s32, int32_t, double); | |
| 246 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(bitwise_and, uint8_t); | |
| 247 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(compare_equal_u8, uint8_t); | |
| 248 | 100 | KLEIDICV_THREAD_BINARY_OP_IMPL(compare_greater_u8, uint8_t); | |
| 249 | |||
| 250 | 100 | kleidicv_error_t kleidicv_thread_saturating_add_abs_with_threshold_s16( | |
| 251 | const int16_t *src_a, size_t src_a_stride, const int16_t *src_b, | ||
| 252 | size_t src_b_stride, int16_t *dst, size_t dst_stride, size_t width, | ||
| 253 | size_t height, int16_t threshold, kleidicv_thread_multithreading mt) { | ||
| 254 | 100 | return kleidicv_thread_binary_op_impl( | |
| 255 | 100 | kleidicv_saturating_add_abs_with_threshold_s16, mt, src_a, src_a_stride, | |
| 256 | 100 | src_b, src_b_stride, dst, dst_stride, width, height, threshold); | |
| 257 | } | ||
| 258 | |||
| 259 | 172 | kleidicv_error_t kleidicv_thread_rotate(const void *src, size_t src_stride, | |
| 260 | size_t width, size_t height, void *dst, | ||
| 261 | size_t dst_stride, int angle, | ||
| 262 | size_t element_size, | ||
| 263 | kleidicv_thread_multithreading mt) { | ||
| 264 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 160 times.
|
172 | if (!kleidicv::rotate_is_implemented(src, dst, angle, element_size)) { |
| 265 | 12 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 266 | } | ||
| 267 | // reading in columns and writing out rows tends to perform better | ||
| 268 | 416 | auto callback = [=](unsigned begin, unsigned end) { | |
| 269 | 512 | return kleidicv_rotate( | |
| 270 | 256 | static_cast<const uint8_t *>(src) + begin * element_size, src_stride, | |
| 271 | 256 | end - begin, height, static_cast<uint8_t *>(dst) + begin * dst_stride, | |
| 272 | 256 | dst_stride, angle, element_size); | |
| 273 | }; | ||
| 274 | 160 | return parallel_batches(callback, mt, width, 64); | |
| 275 | 172 | } | |
| 276 | |||
| 277 | 135 | kleidicv_error_t kleidicv_thread_yuv_p_to_bgr_u8( | |
| 278 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 279 | size_t width, size_t height, bool is_yv12, | ||
| 280 | kleidicv_thread_multithreading mt) { | ||
| 281 | 275 | auto callback = [=](unsigned begin, unsigned end) { | |
| 282 | 280 | return kleidicv_yuv_p_to_bgr_stripe_u8( | |
| 283 | 140 | src, src_stride, dst, dst_stride, width, height, is_yv12, | |
| 284 | 140 | static_cast<size_t>(begin), static_cast<size_t>(end)); | |
| 285 | }; | ||
| 286 | 270 | return parallel_batches(callback, mt, (height + 1) / 2); | |
| 287 | 135 | } | |
| 288 | |||
| 289 | 135 | kleidicv_error_t kleidicv_thread_yuv_p_to_bgra_u8( | |
| 290 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 291 | size_t width, size_t height, bool is_yv12, | ||
| 292 | kleidicv_thread_multithreading mt) { | ||
| 293 | 275 | auto callback = [=](unsigned begin, unsigned end) { | |
| 294 | 280 | return kleidicv_yuv_p_to_bgra_stripe_u8( | |
| 295 | 140 | src, src_stride, dst, dst_stride, width, height, is_yv12, | |
| 296 | 140 | static_cast<size_t>(begin), static_cast<size_t>(end)); | |
| 297 | }; | ||
| 298 | 270 | return parallel_batches(callback, mt, (height + 1) / 2); | |
| 299 | 135 | } | |
| 300 | |||
| 301 | 135 | kleidicv_error_t kleidicv_thread_yuv_p_to_rgb_u8( | |
| 302 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 303 | size_t width, size_t height, bool is_yv12, | ||
| 304 | kleidicv_thread_multithreading mt) { | ||
| 305 | 275 | auto callback = [=](unsigned begin, unsigned end) { | |
| 306 | 280 | return kleidicv_yuv_p_to_rgb_stripe_u8( | |
| 307 | 140 | src, src_stride, dst, dst_stride, width, height, is_yv12, | |
| 308 | 140 | static_cast<size_t>(begin), static_cast<size_t>(end)); | |
| 309 | }; | ||
| 310 | 270 | return parallel_batches(callback, mt, (height + 1) / 2); | |
| 311 | 135 | } | |
| 312 | |||
| 313 | 135 | kleidicv_error_t kleidicv_thread_yuv_p_to_rgba_u8( | |
| 314 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 315 | size_t width, size_t height, bool is_yv12, | ||
| 316 | kleidicv_thread_multithreading mt) { | ||
| 317 | 275 | auto callback = [=](unsigned begin, unsigned end) { | |
| 318 | 280 | return kleidicv_yuv_p_to_rgba_stripe_u8( | |
| 319 | 140 | src, src_stride, dst, dst_stride, width, height, is_yv12, | |
| 320 | 140 | static_cast<size_t>(begin), static_cast<size_t>(end)); | |
| 321 | }; | ||
| 322 | 270 | return parallel_batches(callback, mt, (height + 1) / 2); | |
| 323 | 135 | } | |
| 324 | |||
| 325 | 70 | kleidicv_error_t kleidicv_thread_rgb_to_yuv420_p_u8( | |
| 326 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 327 | size_t width, size_t height, bool is_yv12, | ||
| 328 | kleidicv_thread_multithreading mt) { | ||
| 329 | 145 | auto callback = [=](unsigned begin, unsigned end) { | |
| 330 | 150 | return kleidicv_rgb_to_yuv420_p_stripe_u8( | |
| 331 | 75 | src, src_stride, dst, dst_stride, width, height, is_yv12, | |
| 332 | 75 | static_cast<size_t>(begin), static_cast<size_t>(end)); | |
| 333 | }; | ||
| 334 | 140 | return parallel_batches(callback, mt, (height + 1) / 2); | |
| 335 | 70 | } | |
| 336 | |||
| 337 | 70 | kleidicv_error_t kleidicv_thread_rgba_to_yuv420_p_u8( | |
| 338 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 339 | size_t width, size_t height, bool is_yv12, | ||
| 340 | kleidicv_thread_multithreading mt) { | ||
| 341 | 145 | auto callback = [=](unsigned begin, unsigned end) { | |
| 342 | 150 | return kleidicv_rgba_to_yuv420_p_stripe_u8( | |
| 343 | 75 | src, src_stride, dst, dst_stride, width, height, is_yv12, | |
| 344 | 75 | static_cast<size_t>(begin), static_cast<size_t>(end)); | |
| 345 | }; | ||
| 346 | 140 | return parallel_batches(callback, mt, (height + 1) / 2); | |
| 347 | 70 | } | |
| 348 | |||
| 349 | 70 | kleidicv_error_t kleidicv_thread_bgr_to_yuv420_p_u8( | |
| 350 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 351 | size_t width, size_t height, bool is_yv12, | ||
| 352 | kleidicv_thread_multithreading mt) { | ||
| 353 | 145 | auto callback = [=](unsigned begin, unsigned end) { | |
| 354 | 150 | return kleidicv_bgr_to_yuv420_p_stripe_u8( | |
| 355 | 75 | src, src_stride, dst, dst_stride, width, height, is_yv12, | |
| 356 | 75 | static_cast<size_t>(begin), static_cast<size_t>(end)); | |
| 357 | }; | ||
| 358 | 140 | return parallel_batches(callback, mt, (height + 1) / 2); | |
| 359 | 70 | } | |
| 360 | |||
| 361 | 70 | kleidicv_error_t kleidicv_thread_bgra_to_yuv420_p_u8( | |
| 362 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 363 | size_t width, size_t height, bool is_yv12, | ||
| 364 | kleidicv_thread_multithreading mt) { | ||
| 365 | 145 | auto callback = [=](unsigned begin, unsigned end) { | |
| 366 | 150 | return kleidicv_bgra_to_yuv420_p_stripe_u8( | |
| 367 | 75 | src, src_stride, dst, dst_stride, width, height, is_yv12, | |
| 368 | 75 | static_cast<size_t>(begin), static_cast<size_t>(end)); | |
| 369 | }; | ||
| 370 | 140 | return parallel_batches(callback, mt, (height + 1) / 2); | |
| 371 | 70 | } | |
| 372 | |||
| 373 | 70 | kleidicv_error_t kleidicv_thread_rgb_to_yuv420_sp_u8( | |
| 374 | const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride, | ||
| 375 | uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height, | ||
| 376 | bool is_nv21, kleidicv_thread_multithreading mt) { | ||
| 377 | 145 | auto callback = [=](unsigned begin, unsigned end) { | |
| 378 | 150 | return kleidicv_rgb_to_yuv420_sp_stripe_u8( | |
| 379 | 75 | src, src_stride, y_dst, y_stride, uv_dst, uv_stride, width, height, | |
| 380 | 75 | is_nv21, static_cast<size_t>(begin), static_cast<size_t>(end)); | |
| 381 | }; | ||
| 382 | 140 | return parallel_batches(callback, mt, (height + 1) / 2); | |
| 383 | 70 | } | |
| 384 | |||
| 385 | 70 | kleidicv_error_t kleidicv_thread_rgba_to_yuv420_sp_u8( | |
| 386 | const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride, | ||
| 387 | uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height, | ||
| 388 | bool is_nv21, kleidicv_thread_multithreading mt) { | ||
| 389 | 145 | auto callback = [=](unsigned begin, unsigned end) { | |
| 390 | 150 | return kleidicv_rgba_to_yuv420_sp_stripe_u8( | |
| 391 | 75 | src, src_stride, y_dst, y_stride, uv_dst, uv_stride, width, height, | |
| 392 | 75 | is_nv21, static_cast<size_t>(begin), static_cast<size_t>(end)); | |
| 393 | }; | ||
| 394 | 140 | return parallel_batches(callback, mt, (height + 1) / 2); | |
| 395 | 70 | } | |
| 396 | |||
| 397 | 70 | kleidicv_error_t kleidicv_thread_bgr_to_yuv420_sp_u8( | |
| 398 | const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride, | ||
| 399 | uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height, | ||
| 400 | bool is_nv21, kleidicv_thread_multithreading mt) { | ||
| 401 | 145 | auto callback = [=](unsigned begin, unsigned end) { | |
| 402 | 150 | return kleidicv_bgr_to_yuv420_sp_stripe_u8( | |
| 403 | 75 | src, src_stride, y_dst, y_stride, uv_dst, uv_stride, width, height, | |
| 404 | 75 | is_nv21, static_cast<size_t>(begin), static_cast<size_t>(end)); | |
| 405 | }; | ||
| 406 | 140 | return parallel_batches(callback, mt, (height + 1) / 2); | |
| 407 | 70 | } | |
| 408 | |||
| 409 | 70 | kleidicv_error_t kleidicv_thread_bgra_to_yuv420_sp_u8( | |
| 410 | const uint8_t *src, size_t src_stride, uint8_t *y_dst, size_t y_stride, | ||
| 411 | uint8_t *uv_dst, size_t uv_stride, size_t width, size_t height, | ||
| 412 | bool is_nv21, kleidicv_thread_multithreading mt) { | ||
| 413 | 145 | auto callback = [=](unsigned begin, unsigned end) { | |
| 414 | 150 | return kleidicv_bgra_to_yuv420_sp_stripe_u8( | |
| 415 | 75 | src, src_stride, y_dst, y_stride, uv_dst, uv_stride, width, height, | |
| 416 | 75 | is_nv21, static_cast<size_t>(begin), static_cast<size_t>(end)); | |
| 417 | }; | ||
| 418 | 140 | return parallel_batches(callback, mt, (height + 1) / 2); | |
| 419 | 70 | } | |
| 420 | |||
| 421 | template <typename F> | ||
| 422 | 760 | inline kleidicv_error_t kleidicv_thread_yuv_sp_to_rgb_u8_impl( | |
| 423 | F f, const uint8_t *src_y, size_t src_y_stride, const uint8_t *src_uv, | ||
| 424 | size_t src_uv_stride, uint8_t *dst, size_t dst_stride, size_t width, | ||
| 425 | size_t height, bool is_nv21, kleidicv_thread_multithreading mt) { | ||
| 426 | 1520 | auto callback = [=](unsigned begin, unsigned end) { | |
| 427 | 760 | size_t row_begin = size_t{begin} * 2; | |
| 428 | 760 | size_t row_end = std::min<size_t>(height, size_t{end} * 2); | |
| 429 | 760 | size_t row_uv = begin; | |
| 430 | 2280 | return f(src_y + row_begin * src_y_stride, src_y_stride, | |
| 431 | 760 | src_uv + row_uv * src_uv_stride, src_uv_stride, | |
| 432 | 760 | dst + row_begin * dst_stride, dst_stride, width, | |
| 433 | 760 | row_end - row_begin, is_nv21); | |
| 434 | 760 | }; | |
| 435 | 1520 | return parallel_batches(callback, mt, (height + 1) / 2); | |
| 436 | 760 | } | |
| 437 | |||
// Defines kleidicv_thread_<suffix>, the multithreaded entry point for the
// semi-planar YUV conversion kleidicv_<suffix>. The generated function
// forwards its arguments to kleidicv_thread_yuv_sp_to_rgb_u8_impl.
#define YUV_SP_TO_RGB(suffix)                                         \
  kleidicv_error_t kleidicv_thread_##suffix(                          \
      const uint8_t *src_y, size_t src_y_stride,                      \
      const uint8_t *src_uv, size_t src_uv_stride, uint8_t *dst,      \
      size_t dst_stride, size_t width, size_t height, bool is_nv21,   \
      kleidicv_thread_multithreading mt) {                            \
    return kleidicv_thread_yuv_sp_to_rgb_u8_impl(                     \
        kleidicv_##suffix, src_y, src_y_stride, src_uv,               \
        src_uv_stride, dst, dst_stride, width, height, is_nv21, mt);  \
  }
| 447 | |||
| 448 | 190 | YUV_SP_TO_RGB(yuv_sp_to_bgr_u8); | |
| 449 | 190 | YUV_SP_TO_RGB(yuv_sp_to_bgra_u8); | |
| 450 | 190 | YUV_SP_TO_RGB(yuv_sp_to_rgb_u8); | |
| 451 | 190 | YUV_SP_TO_RGB(yuv_sp_to_rgba_u8); | |
| 452 | |||
| 453 | template <typename ScalarType, typename FunctionType> | ||
| 454 | 368 | kleidicv_error_t parallel_min_max(FunctionType min_max_func, | |
| 455 | const ScalarType *src, size_t src_stride, | ||
| 456 | size_t width, size_t height, | ||
| 457 | ScalarType *p_min_value, | ||
| 458 | ScalarType *p_max_value, | ||
| 459 | kleidicv_thread_multithreading mt) { | ||
| 460 | 736 | std::vector<ScalarType> min_values(height, | |
| 461 | 368 | std::numeric_limits<ScalarType>::max()); | |
| 462 | 736 | std::vector<ScalarType> max_values(height, | |
| 463 | 368 | std::numeric_limits<ScalarType>::lowest()); | |
| 464 | |||
| 465 | 760 | auto callback = [&](unsigned begin, unsigned end) { | |
| 466 | 784 | return min_max_func(src + begin * (src_stride / sizeof(ScalarType)), | |
| 467 | 392 | src_stride, width, end - begin, | |
| 468 |
12/12✓ Branch 0 taken 56 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 56 times.
✓ Branch 3 taken 8 times.
✓ Branch 4 taken 56 times.
✓ Branch 5 taken 8 times.
✓ Branch 6 taken 56 times.
✓ Branch 7 taken 8 times.
✓ Branch 8 taken 56 times.
✓ Branch 9 taken 8 times.
✓ Branch 10 taken 64 times.
✓ Branch 11 taken 8 times.
|
392 | p_min_value ? min_values.data() + begin : nullptr, |
| 469 |
12/12✓ Branch 0 taken 56 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 56 times.
✓ Branch 3 taken 8 times.
✓ Branch 4 taken 56 times.
✓ Branch 5 taken 8 times.
✓ Branch 6 taken 56 times.
✓ Branch 7 taken 8 times.
✓ Branch 8 taken 56 times.
✓ Branch 9 taken 8 times.
✓ Branch 10 taken 64 times.
✓ Branch 11 taken 8 times.
|
392 | p_max_value ? max_values.data() + begin : nullptr); |
| 470 | }; | ||
| 471 | |||
| 472 | 368 | auto return_val = parallel_batches(callback, mt, height); | |
| 473 | |||
| 474 |
12/12✓ Branch 0 taken 52 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 52 times.
✓ Branch 3 taken 8 times.
✓ Branch 4 taken 52 times.
✓ Branch 5 taken 8 times.
✓ Branch 6 taken 52 times.
✓ Branch 7 taken 8 times.
✓ Branch 8 taken 52 times.
✓ Branch 9 taken 8 times.
✓ Branch 10 taken 60 times.
✓ Branch 11 taken 8 times.
|
368 | if (p_min_value) { |
| 475 | 320 | *p_min_value = std::numeric_limits<ScalarType>::max(); | |
| 476 |
12/12✓ Branch 0 taken 288 times.
✓ Branch 1 taken 52 times.
✓ Branch 2 taken 288 times.
✓ Branch 3 taken 52 times.
✓ Branch 4 taken 288 times.
✓ Branch 5 taken 52 times.
✓ Branch 6 taken 288 times.
✓ Branch 7 taken 52 times.
✓ Branch 8 taken 288 times.
✓ Branch 9 taken 52 times.
✓ Branch 10 taken 368 times.
✓ Branch 11 taken 60 times.
|
2128 | for (ScalarType m : min_values) { |
| 477 |
12/12✓ Branch 0 taken 235 times.
✓ Branch 1 taken 53 times.
✓ Branch 2 taken 235 times.
✓ Branch 3 taken 53 times.
✓ Branch 4 taken 233 times.
✓ Branch 5 taken 55 times.
✓ Branch 6 taken 233 times.
✓ Branch 7 taken 55 times.
✓ Branch 8 taken 233 times.
✓ Branch 9 taken 55 times.
✓ Branch 10 taken 306 times.
✓ Branch 11 taken 62 times.
|
1808 | if (m < *p_min_value) { |
| 478 | 333 | *p_min_value = m; | |
| 479 | 333 | } | |
| 480 | 1808 | } | |
| 481 | 320 | } | |
| 482 |
12/12✓ Branch 0 taken 52 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 52 times.
✓ Branch 3 taken 8 times.
✓ Branch 4 taken 52 times.
✓ Branch 5 taken 8 times.
✓ Branch 6 taken 52 times.
✓ Branch 7 taken 8 times.
✓ Branch 8 taken 52 times.
✓ Branch 9 taken 8 times.
✓ Branch 10 taken 60 times.
✓ Branch 11 taken 8 times.
|
368 | if (p_max_value) { |
| 483 | 320 | *p_max_value = std::numeric_limits<ScalarType>::lowest(); | |
| 484 |
12/12✓ Branch 0 taken 288 times.
✓ Branch 1 taken 52 times.
✓ Branch 2 taken 288 times.
✓ Branch 3 taken 52 times.
✓ Branch 4 taken 288 times.
✓ Branch 5 taken 52 times.
✓ Branch 6 taken 288 times.
✓ Branch 7 taken 52 times.
✓ Branch 8 taken 288 times.
✓ Branch 9 taken 52 times.
✓ Branch 10 taken 368 times.
✓ Branch 11 taken 60 times.
|
2128 | for (ScalarType m : max_values) { |
| 485 |
12/12✓ Branch 0 taken 234 times.
✓ Branch 1 taken 54 times.
✓ Branch 2 taken 234 times.
✓ Branch 3 taken 54 times.
✓ Branch 4 taken 232 times.
✓ Branch 5 taken 56 times.
✓ Branch 6 taken 232 times.
✓ Branch 7 taken 56 times.
✓ Branch 8 taken 232 times.
✓ Branch 9 taken 56 times.
✓ Branch 10 taken 307 times.
✓ Branch 11 taken 61 times.
|
1808 | if (m > *p_max_value) { |
| 486 | 337 | *p_max_value = m; | |
| 487 | 337 | } | |
| 488 | 1808 | } | |
| 489 | 320 | } | |
| 490 | 368 | return return_val; | |
| 491 | 368 | } | |
| 492 | |||
| 493 | #define DEFINE_KLEIDICV_THREAD_MIN_MAX(suffix, type) \ | ||
| 494 | kleidicv_error_t kleidicv_thread_min_max_##suffix( \ | ||
| 495 | const type *src, size_t src_stride, size_t width, size_t height, \ | ||
| 496 | type *p_min_value, type *p_max_value, \ | ||
| 497 | kleidicv_thread_multithreading mt) { \ | ||
| 498 | return parallel_min_max(kleidicv_min_max_##suffix, src, src_stride, width, \ | ||
| 499 | height, p_min_value, p_max_value, mt); \ | ||
| 500 | } | ||
| 501 | |||
| 502 | 60 | DEFINE_KLEIDICV_THREAD_MIN_MAX(u8, uint8_t); | |
| 503 | 60 | DEFINE_KLEIDICV_THREAD_MIN_MAX(s8, int8_t); | |
| 504 | 60 | DEFINE_KLEIDICV_THREAD_MIN_MAX(u16, uint16_t); | |
| 505 | 60 | DEFINE_KLEIDICV_THREAD_MIN_MAX(s16, int16_t); | |
| 506 | 60 | DEFINE_KLEIDICV_THREAD_MIN_MAX(s32, int32_t); | |
| 507 | 68 | DEFINE_KLEIDICV_THREAD_MIN_MAX(f32, float); | |
| 508 | |||
| 509 | template <typename ScalarType, typename FunctionType> | ||
| 510 | 60 | kleidicv_error_t parallel_min_max_loc(FunctionType min_max_loc_func, | |
| 511 | const ScalarType *src, size_t src_stride, | ||
| 512 | size_t width, size_t height, | ||
| 513 | size_t *p_min_offset, | ||
| 514 | size_t *p_max_offset, | ||
| 515 | kleidicv_thread_multithreading mt) { | ||
| 516 | 60 | std::vector<size_t> min_offsets(height, 0); | |
| 517 | 60 | std::vector<size_t> max_offsets(height, 0); | |
| 518 | |||
| 519 | 124 | auto callback = [&](unsigned begin, unsigned end) { | |
| 520 | 128 | return min_max_loc_func( | |
| 521 | 64 | src + begin * (src_stride / sizeof(ScalarType)), src_stride, width, | |
| 522 |
2/2✓ Branch 0 taken 56 times.
✓ Branch 1 taken 8 times.
|
64 | end - begin, p_min_offset ? min_offsets.data() + begin : nullptr, |
| 523 |
2/2✓ Branch 0 taken 56 times.
✓ Branch 1 taken 8 times.
|
64 | p_max_offset ? max_offsets.data() + begin : nullptr); |
| 524 | }; | ||
| 525 | 60 | auto return_val = parallel_batches(callback, mt, height); | |
| 526 | |||
| 527 |
2/2✓ Branch 0 taken 52 times.
✓ Branch 1 taken 8 times.
|
60 | if (p_min_offset) { |
| 528 | 52 | *p_min_offset = 0; | |
| 529 |
2/2✓ Branch 0 taken 288 times.
✓ Branch 1 taken 52 times.
|
340 | for (size_t i = 0; i < min_offsets.size(); ++i) { |
| 530 | 288 | size_t offs = min_offsets[i] + i * src_stride; | |
| 531 |
4/4✓ Branch 0 taken 246 times.
✓ Branch 1 taken 42 times.
✓ Branch 2 taken 246 times.
✓ Branch 3 taken 42 times.
|
576 | if (src[offs / sizeof(ScalarType)] < |
| 532 | 288 | src[*p_min_offset / sizeof(ScalarType)]) { | |
| 533 | 42 | *p_min_offset = offs; | |
| 534 | 42 | } | |
| 535 | 288 | } | |
| 536 | 52 | } | |
| 537 |
2/2✓ Branch 0 taken 52 times.
✓ Branch 1 taken 8 times.
|
60 | if (p_max_offset) { |
| 538 | 52 | *p_max_offset = 0; | |
| 539 |
2/2✓ Branch 0 taken 288 times.
✓ Branch 1 taken 52 times.
|
340 | for (size_t i = 0; i < max_offsets.size(); ++i) { |
| 540 | 288 | size_t offs = max_offsets[i] + i * src_stride; | |
| 541 |
4/4✓ Branch 0 taken 255 times.
✓ Branch 1 taken 33 times.
✓ Branch 2 taken 255 times.
✓ Branch 3 taken 33 times.
|
576 | if (src[offs / sizeof(ScalarType)] > |
| 542 | 288 | src[*p_max_offset / sizeof(ScalarType)]) { | |
| 543 | 33 | *p_max_offset = offs; | |
| 544 | 33 | } | |
| 545 | 288 | } | |
| 546 | 52 | } | |
| 547 | 60 | return return_val; | |
| 548 | 60 | } | |
| 549 | |||
| 550 | #define DEFINE_KLEIDICV_THREAD_MIN_MAX_LOC(suffix, type) \ | ||
| 551 | kleidicv_error_t kleidicv_thread_min_max_loc_##suffix( \ | ||
| 552 | const type *src, size_t src_stride, size_t width, size_t height, \ | ||
| 553 | size_t *p_min_offset, size_t *p_max_offset, \ | ||
| 554 | kleidicv_thread_multithreading mt) { \ | ||
| 555 | return parallel_min_max_loc(kleidicv_min_max_loc_##suffix, src, \ | ||
| 556 | src_stride, width, height, p_min_offset, \ | ||
| 557 | p_max_offset, mt); \ | ||
| 558 | } | ||
| 559 | |||
| 560 | 60 | DEFINE_KLEIDICV_THREAD_MIN_MAX_LOC(u8, uint8_t); | |
| 561 | |||
| 562 | template <typename F> | ||
| 563 | 144 | kleidicv_error_t kleidicv_thread_filter(F filter, size_t width, size_t height, | |
| 564 | size_t channels, size_t kernel_width, | ||
| 565 | size_t kernel_height, | ||
| 566 | kleidicv_filter_context_t *context, | ||
| 567 | kleidicv_thread_multithreading mt) { | ||
| 568 | 372 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
| 569 | // The context contains a buffer that can only fit a single row, so can't be | ||
| 570 | // shared between threads. Since we don't know how many threads there are, | ||
| 571 | // create and destroy a context every time this callback is called. Only use | ||
| 572 | // the context argument for the first thread. | ||
| 573 | 228 | bool create_context = 0 != y_begin; | |
| 574 | 228 | kleidicv_filter_context_t *thread_context = context; | |
| 575 |
12/12✓ Branch 0 taken 28 times.
✓ Branch 1 taken 16 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 4 times.
✓ Branch 4 taken 28 times.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 28 times.
✓ Branch 7 taken 16 times.
✓ Branch 8 taken 28 times.
✓ Branch 9 taken 16 times.
✓ Branch 10 taken 28 times.
✓ Branch 11 taken 16 times.
|
228 | if (create_context) { |
| 576 | 168 | kleidicv_error_t context_create_result = kleidicv_filter_context_create( | |
| 577 | 84 | &thread_context, channels, kernel_width, kernel_height, width, | |
| 578 | 84 | height); | |
| 579 | // Excluded from coverage because it's impractical to test this. | ||
| 580 | // MockMallocToFail can't be used because malloc is used in thread setup. | ||
| 581 | // GCOVR_EXCL_START | ||
| 582 | − | if (KLEIDICV_OK != context_create_result) { | |
| 583 | − | return context_create_result; | |
| 584 | } | ||
| 585 | // GCOVR_EXCL_STOP | ||
| 586 | 84 | } | |
| 587 | |||
| 588 | 228 | kleidicv_error_t result = filter(y_begin, y_end, thread_context); | |
| 589 | |||
| 590 |
12/12✓ Branch 0 taken 28 times.
✓ Branch 1 taken 16 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 4 times.
✓ Branch 4 taken 28 times.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 28 times.
✓ Branch 7 taken 16 times.
✓ Branch 8 taken 28 times.
✓ Branch 9 taken 16 times.
✓ Branch 10 taken 28 times.
✓ Branch 11 taken 16 times.
|
228 | if (create_context) { |
| 591 | 168 | kleidicv_error_t context_release_result = | |
| 592 | 84 | kleidicv_filter_context_release(thread_context); | |
| 593 |
6/12✗ Branch 0 not taken.
✓ Branch 1 taken 16 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 4 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 16 times.
✗ Branch 6 not taken.
✓ Branch 7 taken 16 times.
✗ Branch 8 not taken.
✓ Branch 9 taken 16 times.
✗ Branch 10 not taken.
✓ Branch 11 taken 16 times.
|
84 | if (KLEIDICV_OK == result) { |
| 594 | 84 | result = context_release_result; | |
| 595 | 84 | } | |
| 596 | 84 | } | |
| 597 | 228 | return result; | |
| 598 | 228 | }; | |
| 599 | 288 | return parallel_batches(callback, mt, height); | |
| 600 | 144 | } | |
| 601 | |||
| 602 | 208 | kleidicv_error_t kleidicv_thread_gaussian_blur_u8( | |
| 603 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 604 | size_t width, size_t height, size_t channels, size_t kernel_width, | ||
| 605 | size_t kernel_height, float sigma_x, float sigma_y, | ||
| 606 | kleidicv_border_type_t border_type, kleidicv_filter_context_t *context, | ||
| 607 | kleidicv_thread_multithreading mt) { | ||
| 608 | 208 | auto fixed_border_type = kleidicv::get_fixed_border_type(border_type); | |
| 609 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 204 times.
|
208 | if (!fixed_border_type) { |
| 610 | 4 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 611 | } | ||
| 612 | |||
| 613 |
4/4✓ Branch 0 taken 32 times.
✓ Branch 1 taken 172 times.
✓ Branch 2 taken 32 times.
✓ Branch 3 taken 172 times.
|
408 | if (!kleidicv::gaussian_blur_is_implemented(width, height, kernel_width, |
| 614 | 204 | kernel_height, sigma_x, sigma_y, | |
| 615 | 204 | channels, *fixed_border_type)) { | |
| 616 | 172 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 617 | } | ||
| 618 | |||
| 619 |
4/6✓ Branch 0 taken 4 times.
✓ Branch 1 taken 28 times.
✓ Branch 2 taken 4 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 4 times.
|
32 | if (kernel_width <= 7 || kernel_width == 15 || kernel_width == 21) { |
| 620 | 72 | auto callback = [=](size_t y_begin, size_t y_end, | |
| 621 | kleidicv_filter_context_t *thread_context) { | ||
| 622 | 88 | return kleidicv_gaussian_blur_fixed_stripe_u8( | |
| 623 | 44 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
| 624 | 44 | channels, kernel_width, kernel_height, sigma_x, sigma_y, | |
| 625 | 44 | *fixed_border_type, thread_context); | |
| 626 | }; | ||
| 627 | 56 | return kleidicv_thread_filter(callback, width, height, channels, | |
| 628 | 28 | kernel_width, kernel_height, context, mt); | |
| 629 | 28 | } | |
| 630 | |||
| 631 | 12 | auto callback = [=](size_t y_begin, size_t y_end, | |
| 632 | kleidicv_filter_context_t *thread_context) { | ||
| 633 | 16 | return kleidicv_gaussian_blur_arbitrary_stripe_u8( | |
| 634 | 8 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
| 635 | 8 | channels, kernel_width, kernel_height, sigma_x, sigma_y, | |
| 636 | 8 | *fixed_border_type, thread_context); | |
| 637 | }; | ||
| 638 | 8 | return kleidicv_thread_filter(callback, width, height, channels, kernel_width, | |
| 639 | 4 | kernel_height, context, mt); | |
| 640 | 208 | } | |
| 641 | |||
| 642 | 108 | kleidicv_error_t kleidicv_thread_separable_filter_2d_u8( | |
| 643 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 644 | size_t width, size_t height, size_t channels, const uint8_t *kernel_x, | ||
| 645 | size_t kernel_width, const uint8_t *kernel_y, size_t kernel_height, | ||
| 646 | kleidicv_border_type_t border_type, kleidicv_filter_context_t *context, | ||
| 647 | kleidicv_thread_multithreading mt) { | ||
| 648 |
4/4✓ Branch 0 taken 76 times.
✓ Branch 1 taken 32 times.
✓ Branch 2 taken 76 times.
✓ Branch 3 taken 32 times.
|
216 | if (!kleidicv::separable_filter_2d_is_implemented(width, height, kernel_width, |
| 649 | 108 | kernel_height)) { | |
| 650 | 76 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 651 | } | ||
| 652 | |||
| 653 | 32 | auto fixed_border_type = kleidicv::get_fixed_border_type(border_type); | |
| 654 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 4 times.
|
32 | if (!fixed_border_type) { |
| 655 | 4 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 656 | } | ||
| 657 | |||
| 658 | 72 | auto callback = [=](size_t y_begin, size_t y_end, | |
| 659 | kleidicv_filter_context_t *thread_context) { | ||
| 660 | 88 | return kleidicv_separable_filter_2d_stripe_u8( | |
| 661 | 44 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
| 662 | 44 | channels, kernel_x, kernel_width, kernel_y, kernel_height, | |
| 663 | 44 | *fixed_border_type, thread_context); | |
| 664 | }; | ||
| 665 | 56 | return kleidicv_thread_filter(callback, width, height, channels, kernel_width, | |
| 666 | 28 | kernel_height, context, mt); | |
| 667 | 108 | } | |
| 668 | |||
| 669 | 108 | kleidicv_error_t kleidicv_thread_separable_filter_2d_u16( | |
| 670 | const uint16_t *src, size_t src_stride, uint16_t *dst, size_t dst_stride, | ||
| 671 | size_t width, size_t height, size_t channels, const uint16_t *kernel_x, | ||
| 672 | size_t kernel_width, const uint16_t *kernel_y, size_t kernel_height, | ||
| 673 | kleidicv_border_type_t border_type, kleidicv_filter_context_t *context, | ||
| 674 | kleidicv_thread_multithreading mt) { | ||
| 675 |
4/4✓ Branch 0 taken 76 times.
✓ Branch 1 taken 32 times.
✓ Branch 2 taken 76 times.
✓ Branch 3 taken 32 times.
|
216 | if (!kleidicv::separable_filter_2d_is_implemented(width, height, kernel_width, |
| 676 | 108 | kernel_height)) { | |
| 677 | 76 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 678 | } | ||
| 679 | |||
| 680 | 32 | auto fixed_border_type = kleidicv::get_fixed_border_type(border_type); | |
| 681 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 4 times.
|
32 | if (!fixed_border_type) { |
| 682 | 4 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 683 | } | ||
| 684 | |||
| 685 | 72 | auto callback = [=](size_t y_begin, size_t y_end, | |
| 686 | kleidicv_filter_context_t *thread_context) { | ||
| 687 | 88 | return kleidicv_separable_filter_2d_stripe_u16( | |
| 688 | 44 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
| 689 | 44 | channels, kernel_x, kernel_width, kernel_y, kernel_height, | |
| 690 | 44 | *fixed_border_type, thread_context); | |
| 691 | }; | ||
| 692 | 56 | return kleidicv_thread_filter(callback, width, height, channels, kernel_width, | |
| 693 | 28 | kernel_height, context, mt); | |
| 694 | 108 | } | |
| 695 | |||
| 696 | 108 | kleidicv_error_t kleidicv_thread_separable_filter_2d_s16( | |
| 697 | const int16_t *src, size_t src_stride, int16_t *dst, size_t dst_stride, | ||
| 698 | size_t width, size_t height, size_t channels, const int16_t *kernel_x, | ||
| 699 | size_t kernel_width, const int16_t *kernel_y, size_t kernel_height, | ||
| 700 | kleidicv_border_type_t border_type, kleidicv_filter_context_t *context, | ||
| 701 | kleidicv_thread_multithreading mt) { | ||
| 702 |
4/4✓ Branch 0 taken 76 times.
✓ Branch 1 taken 32 times.
✓ Branch 2 taken 76 times.
✓ Branch 3 taken 32 times.
|
216 | if (!kleidicv::separable_filter_2d_is_implemented(width, height, kernel_width, |
| 703 | 108 | kernel_height)) { | |
| 704 | 76 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 705 | } | ||
| 706 | |||
| 707 | 32 | auto fixed_border_type = kleidicv::get_fixed_border_type(border_type); | |
| 708 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 4 times.
|
32 | if (!fixed_border_type) { |
| 709 | 4 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 710 | } | ||
| 711 | |||
| 712 | 72 | auto callback = [=](size_t y_begin, size_t y_end, | |
| 713 | kleidicv_filter_context_t *thread_context) { | ||
| 714 | 88 | return kleidicv_separable_filter_2d_stripe_s16( | |
| 715 | 44 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
| 716 | 44 | channels, kernel_x, kernel_width, kernel_y, kernel_height, | |
| 717 | 44 | *fixed_border_type, thread_context); | |
| 718 | }; | ||
| 719 | 56 | return kleidicv_thread_filter(callback, width, height, channels, kernel_width, | |
| 720 | 28 | kernel_height, context, mt); | |
| 721 | 108 | } | |
| 722 | |||
| 723 | 108 | kleidicv_error_t kleidicv_thread_blur_and_downsample_u8( | |
| 724 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 725 | uint8_t *dst, size_t dst_stride, size_t channels, | ||
| 726 | kleidicv_border_type_t border_type, kleidicv_filter_context_t *context, | ||
| 727 | kleidicv_thread_multithreading mt) { | ||
| 728 |
4/4✓ Branch 0 taken 76 times.
✓ Branch 1 taken 32 times.
✓ Branch 2 taken 76 times.
✓ Branch 3 taken 32 times.
|
216 | if (!kleidicv::blur_and_downsample_is_implemented(src_width, src_height, |
| 729 | 108 | channels)) { | |
| 730 | 76 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 731 | } | ||
| 732 | |||
| 733 | 32 | auto fixed_border_type = kleidicv::get_fixed_border_type(border_type); | |
| 734 |
2/2✓ Branch 0 taken 28 times.
✓ Branch 1 taken 4 times.
|
32 | if (!fixed_border_type) { |
| 735 | 4 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 736 | } | ||
| 737 | |||
| 738 | 72 | auto callback = [=](unsigned y_begin, unsigned y_end, | |
| 739 | kleidicv_filter_context_t *thread_context) { | ||
| 740 | 88 | return kleidicv_blur_and_downsample_stripe_u8( | |
| 741 | 44 | src, src_stride, src_width, src_height, dst, dst_stride, y_begin, y_end, | |
| 742 | 44 | channels, *fixed_border_type, thread_context); | |
| 743 | }; | ||
| 744 | 56 | return kleidicv_thread_filter(callback, src_width, src_height, channels, 5, 5, | |
| 745 | 28 | context, mt); | |
| 746 | 108 | } | |
| 747 | |||
| 748 | 204 | kleidicv_error_t kleidicv_thread_sobel_3x3_horizontal_s16_u8( | |
| 749 | const uint8_t *src, size_t src_stride, int16_t *dst, size_t dst_stride, | ||
| 750 | size_t width, size_t height, size_t channels, | ||
| 751 | kleidicv_thread_multithreading mt) { | ||
| 752 |
2/2✓ Branch 0 taken 92 times.
✓ Branch 1 taken 112 times.
|
204 | if (!kleidicv::sobel_is_implemented(width, height, 3)) { |
| 753 | 92 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 754 | } | ||
| 755 | |||
| 756 | 272 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
| 757 | 320 | return kleidicv_sobel_3x3_horizontal_stripe_s16_u8( | |
| 758 | 160 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
| 759 | 160 | channels); | |
| 760 | }; | ||
| 761 | 112 | return parallel_batches(callback, mt, height); | |
| 762 | 204 | } | |
| 763 | |||
| 764 | 532 | kleidicv_error_t kleidicv_thread_median_blur_u8( | |
| 765 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
| 766 | size_t width, size_t height, size_t channels, size_t kernel_width, | ||
| 767 | size_t kernel_height, kleidicv_border_type_t border_type, | ||
| 768 | kleidicv_thread_multithreading mt) { | ||
| 769 | 1064 | auto result_pair = kleidicv::median_blur_is_implemented( | |
| 770 | 532 | src, src_stride, dst, dst_stride, width, height, channels, kernel_width, | |
| 771 | 532 | kernel_height, border_type); | |
| 772 | |||
| 773 | 532 | auto checks_result = result_pair.first; | |
| 774 | 532 | auto fixed_border_type = result_pair.second; | |
| 775 |
2/2✓ Branch 0 taken 416 times.
✓ Branch 1 taken 116 times.
|
532 | if (checks_result != KLEIDICV_OK) { |
| 776 | 416 | return checks_result; | |
| 777 | } | ||
| 778 | |||
| 779 |
2/2✓ Branch 0 taken 100 times.
✓ Branch 1 taken 16 times.
|
116 | if (kernel_width <= 7) { |
| 780 | 252 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
| 781 | 304 | return kleidicv_median_blur_sorting_network_stripe_u8( | |
| 782 | 152 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
| 783 | 152 | channels, kernel_width, kernel_height, fixed_border_type); | |
| 784 | }; | ||
| 785 | 100 | return parallel_batches(callback, mt, height); | |
| 786 | 100 | } | |
| 787 | |||
| 788 |
3/4✓ Branch 0 taken 16 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 12 times.
|
16 | if (kernel_width > 7 && kernel_width <= 15) { |
| 789 | 36 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
| 790 | 48 | return kleidicv_median_blur_small_hist_stripe_u8( | |
| 791 | 24 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
| 792 | 24 | channels, kernel_width, kernel_height, fixed_border_type); | |
| 793 | }; | ||
| 794 | 12 | return parallel_batches(callback, mt, height); | |
| 795 | 12 | } | |
| 796 | |||
| 797 | 12 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
| 798 | 16 | return kleidicv_median_blur_large_hist_stripe_u8( | |
| 799 | 8 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
| 800 | 8 | channels, kernel_width, kernel_height, fixed_border_type); | |
| 801 | }; | ||
| 802 | 4 | return parallel_batches(callback, mt, height); | |
| 803 | 532 | } | |
| 804 | |||
| 805 | 532 | kleidicv_error_t kleidicv_thread_median_blur_s16( | |
| 806 | const int16_t *src, size_t src_stride, int16_t *dst, size_t dst_stride, | ||
| 807 | size_t width, size_t height, size_t channels, size_t kernel_width, | ||
| 808 | size_t kernel_height, kleidicv_border_type_t border_type, | ||
| 809 | kleidicv_thread_multithreading mt) { | ||
| 810 | 1064 | auto result_pair = kleidicv::median_blur_is_implemented( | |
| 811 | 532 | src, src_stride, dst, dst_stride, width, height, channels, kernel_width, | |
| 812 | 532 | kernel_height, border_type); | |
| 813 | |||
| 814 | 532 | auto checks_result = result_pair.first; | |
| 815 | 532 | auto fixed_border_type = result_pair.second; | |
| 816 |
2/2✓ Branch 0 taken 432 times.
✓ Branch 1 taken 100 times.
|
532 | if (checks_result != KLEIDICV_OK) { |
| 817 | 432 | return checks_result; | |
| 818 | } | ||
| 819 | |||
| 820 | 252 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
| 821 | 304 | return kleidicv_median_blur_sorting_network_stripe_s16( | |
| 822 | 152 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
| 823 | 152 | channels, kernel_width, kernel_height, fixed_border_type); | |
| 824 | }; | ||
| 825 | 100 | return parallel_batches(callback, mt, height); | |
| 826 | 532 | } | |
| 827 | |||
| 828 | 532 | kleidicv_error_t kleidicv_thread_median_blur_u16( | |
| 829 | const uint16_t *src, size_t src_stride, uint16_t *dst, size_t dst_stride, | ||
| 830 | size_t width, size_t height, size_t channels, size_t kernel_width, | ||
| 831 | size_t kernel_height, kleidicv_border_type_t border_type, | ||
| 832 | kleidicv_thread_multithreading mt) { | ||
| 833 | 1064 | auto result_pair = kleidicv::median_blur_is_implemented( | |
| 834 | 532 | src, src_stride, dst, dst_stride, width, height, channels, kernel_width, | |
| 835 | 532 | kernel_height, border_type); | |
| 836 | |||
| 837 | 532 | auto checks_result = result_pair.first; | |
| 838 | 532 | auto fixed_border_type = result_pair.second; | |
| 839 |
2/2✓ Branch 0 taken 432 times.
✓ Branch 1 taken 100 times.
|
532 | if (checks_result != KLEIDICV_OK) { |
| 840 | 432 | return checks_result; | |
| 841 | } | ||
| 842 | |||
| 843 | 252 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
| 844 | 304 | return kleidicv_median_blur_sorting_network_stripe_u16( | |
| 845 | 152 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
| 846 | 152 | channels, kernel_width, kernel_height, fixed_border_type); | |
| 847 | }; | ||
| 848 | 100 | return parallel_batches(callback, mt, height); | |
| 849 | 532 | } | |
| 850 | |||
| 851 | 532 | kleidicv_error_t kleidicv_thread_median_blur_f32( | |
| 852 | const float *src, size_t src_stride, float *dst, size_t dst_stride, | ||
| 853 | size_t width, size_t height, size_t channels, size_t kernel_width, | ||
| 854 | size_t kernel_height, kleidicv_border_type_t border_type, | ||
| 855 | kleidicv_thread_multithreading mt) { | ||
| 856 | 1064 | auto result_pair = kleidicv::median_blur_is_implemented( | |
| 857 | 532 | src, src_stride, dst, dst_stride, width, height, channels, kernel_width, | |
| 858 | 532 | kernel_height, border_type); | |
| 859 | |||
| 860 | 532 | auto checks_result = result_pair.first; | |
| 861 | 532 | auto fixed_border_type = result_pair.second; | |
| 862 |
2/2✓ Branch 0 taken 432 times.
✓ Branch 1 taken 100 times.
|
532 | if (checks_result != KLEIDICV_OK) { |
| 863 | 432 | return checks_result; | |
| 864 | } | ||
| 865 | |||
| 866 | 252 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
| 867 | 304 | return kleidicv_median_blur_sorting_network_stripe_f32( | |
| 868 | 152 | src, src_stride, dst, dst_stride, width, height, y_begin, y_end, | |
| 869 | 152 | channels, kernel_width, kernel_height, fixed_border_type); | |
| 870 | }; | ||
| 871 | 100 | return parallel_batches(callback, mt, height); | |
| 872 | 532 | } | |
| 873 | |||
| 874 | 204 | kleidicv_error_t kleidicv_thread_sobel_3x3_vertical_s16_u8( | |
| 875 | const uint8_t *src, size_t src_stride, int16_t *dst, size_t dst_stride, | ||
| 876 | size_t width, size_t height, size_t channels, | ||
| 877 | kleidicv_thread_multithreading mt) { | ||
| 878 |
2/2✓ Branch 0 taken 92 times.
✓ Branch 1 taken 112 times.
|
204 | if (!kleidicv::sobel_is_implemented(width, height, 3)) { |
| 879 | 92 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 880 | } | ||
| 881 | |||
| 882 | 272 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
| 883 | 320 | return kleidicv_sobel_3x3_vertical_stripe_s16_u8(src, src_stride, dst, | |
| 884 | 160 | dst_stride, width, height, | |
| 885 | 160 | y_begin, y_end, channels); | |
| 886 | }; | ||
| 887 | 112 | return parallel_batches(callback, mt, height); | |
| 888 | 204 | } | |
| 889 | |||
| 890 | 104 | kleidicv_error_t kleidicv_thread_scharr_interleaved_s16_u8( | |
| 891 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 892 | size_t src_channels, int16_t *dst, size_t dst_stride, | ||
| 893 | kleidicv_thread_multithreading mt) { | ||
| 894 |
4/4✓ Branch 0 taken 4 times.
✓ Branch 1 taken 100 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 100 times.
|
208 | if (!kleidicv::scharr_interleaved_is_implemented(src_width, src_height, |
| 895 | 104 | src_channels)) { | |
| 896 | 4 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 897 | } | ||
| 898 | |||
| 899 | 244 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
| 900 | 288 | return kleidicv_scharr_interleaved_stripe_s16_u8( | |
| 901 | 144 | src, src_stride, src_width, src_height, src_channels, dst, dst_stride, | |
| 902 | 144 | y_begin, y_end); | |
| 903 | }; | ||
| 904 | |||
| 905 | // height is decremented by 2 as the result has less rows. | ||
| 906 | 100 | return parallel_batches(callback, mt, src_height - 2); | |
| 907 | 104 | } | |
| 908 | |||
| 909 | 120 | kleidicv_error_t kleidicv_thread_resize_to_quarter_u8( | |
| 910 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 911 | uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
| 912 | kleidicv_thread_multithreading mt) { | ||
| 913 | 240 | auto callback = [=](unsigned begin, unsigned end) { | |
| 914 | 120 | size_t src_begin = size_t{begin} * 2; | |
| 915 | 120 | size_t src_end = std::min<size_t>(src_height, size_t{end} * 2); | |
| 916 | 120 | size_t dst_begin = begin; | |
| 917 | 120 | size_t dst_end = std::min<size_t>(dst_height, end); | |
| 918 | |||
| 919 | // half of odd height is rounded towards zero? | ||
| 920 |
2/2✓ Branch 0 taken 10 times.
✓ Branch 1 taken 110 times.
|
120 | if (dst_begin == dst_end) { |
| 921 | 10 | return KLEIDICV_OK; | |
| 922 | } | ||
| 923 | |||
| 924 | 220 | return kleidicv_resize_to_quarter_u8( | |
| 925 | 110 | src + src_begin * src_stride, src_stride, src_width, | |
| 926 | 110 | src_end - src_begin, dst + dst_begin * dst_stride, dst_stride, | |
| 927 | 110 | dst_width, dst_end - dst_begin); | |
| 928 | 120 | }; | |
| 929 | 240 | return parallel_batches(callback, mt, (src_height + 1) / 2); | |
| 930 | 120 | } | |
| 931 | |||
| 932 | 125 | kleidicv_error_t kleidicv_thread_resize_linear_u8( | |
| 933 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 934 | uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
| 935 | kleidicv_thread_multithreading mt) { | ||
| 936 |
4/4✓ Branch 0 taken 5 times.
✓ Branch 1 taken 120 times.
✓ Branch 2 taken 5 times.
✓ Branch 3 taken 120 times.
|
250 | if (!kleidicv::resize_linear_u8_is_implemented(src_width, src_height, |
| 937 | 125 | dst_width, dst_height)) { | |
| 938 | 5 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 939 | } | ||
| 940 | 250 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
| 941 | 260 | return kleidicv_resize_linear_stripe_u8( | |
| 942 | 130 | src, src_stride, src_width, src_height, y_begin, | |
| 943 | 130 | std::min<size_t>(src_height, y_end + 1), dst, dst_stride, dst_width, | |
| 944 | 130 | dst_height); | |
| 945 | }; | ||
| 946 | 120 | return parallel_batches(callback, mt, std::max<size_t>(1, src_height - 1)); | |
| 947 | 125 | } | |
| 948 | |||
| 949 | 185 | kleidicv_error_t kleidicv_thread_resize_linear_f32( | |
| 950 | const float *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 951 | float *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
| 952 | kleidicv_thread_multithreading mt) { | ||
| 953 |
4/4✓ Branch 0 taken 5 times.
✓ Branch 1 taken 180 times.
✓ Branch 2 taken 5 times.
✓ Branch 3 taken 180 times.
|
370 | if (!kleidicv::resize_linear_f32_is_implemented(src_width, src_height, |
| 954 | 185 | dst_width, dst_height)) { | |
| 955 | 5 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 956 | } | ||
| 957 | 375 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
| 958 | 390 | return kleidicv_resize_linear_stripe_f32( | |
| 959 | 195 | src, src_stride, src_width, src_height, y_begin, | |
| 960 | 195 | std::min<size_t>(src_height, y_end + 1), dst, dst_stride, dst_width, | |
| 961 | 195 | dst_height); | |
| 962 | }; | ||
| 963 | 180 | return parallel_batches(callback, mt, std::max<size_t>(1, src_height - 1)); | |
| 964 | 185 | } | |
| 965 | |||
| 966 | 208 | kleidicv_error_t kleidicv_thread_remap_s16_u8( | |
| 967 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 968 | uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
| 969 | size_t channels, const int16_t *mapxy, size_t mapxy_stride, | ||
| 970 | kleidicv_border_type_t border_type, const uint8_t *border_value, | ||
| 971 | kleidicv_thread_multithreading mt) { | ||
| 972 |
4/4✓ Branch 0 taken 8 times.
✓ Branch 1 taken 200 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 200 times.
|
416 | if (!kleidicv::remap_s16_is_implemented<uint8_t>(src_stride, src_width, |
| 973 | 208 | src_height, dst_width, | |
| 974 | 208 | border_type, channels)) { | |
| 975 | 8 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 976 | } | ||
| 977 | 488 | auto callback = [=](unsigned begin, unsigned end) { | |
| 978 | 576 | return kleidicv_remap_s16_u8( | |
| 979 | 288 | src, src_stride, src_width, src_height, | |
| 980 | 288 | dst + begin * dst_stride / sizeof(uint8_t), dst_stride, dst_width, | |
| 981 | 288 | end - begin, channels, | |
| 982 | 288 | mapxy + static_cast<ptrdiff_t>(begin * mapxy_stride / sizeof(int16_t)), | |
| 983 | 288 | mapxy_stride, border_type, border_value); | |
| 984 | }; | ||
| 985 | 200 | return parallel_batches(callback, mt, dst_height); | |
| 986 | 208 | } | |
| 987 | |||
| 988 | 208 | kleidicv_error_t kleidicv_thread_remap_s16_u16( | |
| 989 | const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 990 | uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
| 991 | size_t channels, const int16_t *mapxy, size_t mapxy_stride, | ||
| 992 | kleidicv_border_type_t border_type, const uint16_t *border_value, | ||
| 993 | kleidicv_thread_multithreading mt) { | ||
| 994 |
4/4✓ Branch 0 taken 8 times.
✓ Branch 1 taken 200 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 200 times.
|
416 | if (!kleidicv::remap_s16_is_implemented<uint16_t>(src_stride, src_width, |
| 995 | 208 | src_height, dst_width, | |
| 996 | 208 | border_type, channels)) { | |
| 997 | 8 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 998 | } | ||
| 999 | 488 | auto callback = [=](unsigned begin, unsigned end) { | |
| 1000 | 576 | return kleidicv_remap_s16_u16( | |
| 1001 | 288 | src, src_stride, src_width, src_height, | |
| 1002 | 288 | dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint16_t)), | |
| 1003 | 288 | dst_stride, dst_width, end - begin, channels, | |
| 1004 | 288 | mapxy + static_cast<ptrdiff_t>(begin * mapxy_stride / sizeof(int16_t)), | |
| 1005 | 288 | mapxy_stride, border_type, border_value); | |
| 1006 | }; | ||
| 1007 | 200 | return parallel_batches(callback, mt, dst_height); | |
| 1008 | 208 | } | |
| 1009 | |||
| 1010 | 408 | kleidicv_error_t kleidicv_thread_remap_s16point5_u8( | |
| 1011 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 1012 | uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
| 1013 | size_t channels, const int16_t *mapxy, size_t mapxy_stride, | ||
| 1014 | const uint16_t *mapfrac, size_t mapfrac_stride, | ||
| 1015 | kleidicv_border_type_t border_type, const uint8_t *border_value, | ||
| 1016 | kleidicv_thread_multithreading mt) { | ||
| 1017 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 400 times.
|
408 | if (!kleidicv::remap_s16point5_is_implemented<uint8_t>( |
| 1018 | 408 | src_stride, src_width, src_height, dst_width, border_type, | |
| 1019 | 408 | channels)) { | |
| 1020 | 8 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 1021 | } | ||
| 1022 | 976 | auto callback = [=](unsigned begin, unsigned end) { | |
| 1023 | 1152 | return kleidicv_remap_s16point5_u8( | |
| 1024 | 576 | src, src_stride, src_width, src_height, | |
| 1025 | 576 | dst + begin * dst_stride / sizeof(uint8_t), dst_stride, dst_width, | |
| 1026 | 576 | end - begin, channels, | |
| 1027 | 576 | mapxy + static_cast<ptrdiff_t>(begin * mapxy_stride / sizeof(int16_t)), | |
| 1028 | 576 | mapxy_stride, | |
| 1029 | 1152 | mapfrac + | |
| 1030 | 576 | static_cast<ptrdiff_t>(begin * mapfrac_stride / sizeof(uint16_t)), | |
| 1031 | 576 | mapfrac_stride, border_type, border_value); | |
| 1032 | }; | ||
| 1033 | 400 | return parallel_batches(callback, mt, dst_height); | |
| 1034 | 408 | } | |
| 1035 | |||
| 1036 | 408 | kleidicv_error_t kleidicv_thread_remap_s16point5_u16( | |
| 1037 | const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 1038 | uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
| 1039 | size_t channels, const int16_t *mapxy, size_t mapxy_stride, | ||
| 1040 | const uint16_t *mapfrac, size_t mapfrac_stride, | ||
| 1041 | kleidicv_border_type_t border_type, const uint16_t *border_value, | ||
| 1042 | kleidicv_thread_multithreading mt) { | ||
| 1043 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 400 times.
|
408 | if (!kleidicv::remap_s16point5_is_implemented<uint16_t>( |
| 1044 | 408 | src_stride, src_width, src_height, dst_width, border_type, | |
| 1045 | 408 | channels)) { | |
| 1046 | 8 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 1047 | } | ||
| 1048 | 976 | auto callback = [=](unsigned begin, unsigned end) { | |
| 1049 | 1152 | return kleidicv_remap_s16point5_u16( | |
| 1050 | 576 | src, src_stride, src_width, src_height, | |
| 1051 | 576 | dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint16_t)), | |
| 1052 | 576 | dst_stride, dst_width, end - begin, channels, | |
| 1053 | 576 | mapxy + static_cast<ptrdiff_t>(begin * mapxy_stride / sizeof(int16_t)), | |
| 1054 | 576 | mapxy_stride, | |
| 1055 | 1152 | mapfrac + | |
| 1056 | 576 | static_cast<ptrdiff_t>(begin * mapfrac_stride / sizeof(uint16_t)), | |
| 1057 | 576 | mapfrac_stride, border_type, border_value); | |
| 1058 | }; | ||
| 1059 | 400 | return parallel_batches(callback, mt, dst_height); | |
| 1060 | 408 | } | |
| 1061 | |||
| 1062 | 408 | kleidicv_error_t kleidicv_thread_remap_f32_u8( | |
| 1063 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 1064 | uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
| 1065 | size_t channels, const float *mapx, size_t mapx_stride, const float *mapy, | ||
| 1066 | size_t mapy_stride, kleidicv_interpolation_type_t interpolation, | ||
| 1067 | kleidicv_border_type_t border_type, const uint8_t *border_value, | ||
| 1068 | kleidicv_thread_multithreading mt) { | ||
| 1069 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 400 times.
|
408 | if (!kleidicv::remap_f32_is_implemented<uint8_t>( |
| 1070 | 408 | src_stride, src_width, src_height, dst_width, dst_height, border_type, | |
| 1071 | 408 | channels, interpolation)) { | |
| 1072 | 8 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 1073 | } | ||
| 1074 | 976 | auto callback = [=](unsigned begin, unsigned end) { | |
| 1075 | 1152 | return kleidicv_remap_f32_u8( | |
| 1076 | 576 | src, src_stride, src_width, src_height, | |
| 1077 | 576 | dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint8_t)), | |
| 1078 | 576 | dst_stride, dst_width, end - begin, channels, | |
| 1079 | 576 | mapx + static_cast<ptrdiff_t>(begin * mapx_stride / sizeof(float)), | |
| 1080 | 576 | mapx_stride, | |
| 1081 | 576 | mapy + static_cast<ptrdiff_t>(begin * mapy_stride / sizeof(float)), | |
| 1082 | 576 | mapy_stride, interpolation, border_type, border_value); | |
| 1083 | }; | ||
| 1084 | 400 | return parallel_batches(callback, mt, dst_height); | |
| 1085 | 408 | } | |
| 1086 | |||
| 1087 | 408 | kleidicv_error_t kleidicv_thread_remap_f32_u16( | |
| 1088 | const uint16_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 1089 | uint16_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
| 1090 | size_t channels, const float *mapx, size_t mapx_stride, const float *mapy, | ||
| 1091 | size_t mapy_stride, kleidicv_interpolation_type_t interpolation, | ||
| 1092 | kleidicv_border_type_t border_type, const uint16_t *border_value, | ||
| 1093 | kleidicv_thread_multithreading mt) { | ||
| 1094 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 400 times.
|
408 | if (!kleidicv::remap_f32_is_implemented<uint16_t>( |
| 1095 | 408 | src_stride, src_width, src_height, dst_width, dst_height, border_type, | |
| 1096 | 408 | channels, interpolation)) { | |
| 1097 | 8 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 1098 | } | ||
| 1099 | 976 | auto callback = [=](unsigned begin, unsigned end) { | |
| 1100 | 1152 | return kleidicv_remap_f32_u16( | |
| 1101 | 576 | src, src_stride, src_width, src_height, | |
| 1102 | 576 | dst + static_cast<ptrdiff_t>(begin * dst_stride / sizeof(uint16_t)), | |
| 1103 | 576 | dst_stride, dst_width, end - begin, channels, | |
| 1104 | 576 | mapx + static_cast<ptrdiff_t>(begin * mapx_stride / sizeof(float)), | |
| 1105 | 576 | mapx_stride, | |
| 1106 | 576 | mapy + static_cast<ptrdiff_t>(begin * mapy_stride / sizeof(float)), | |
| 1107 | 576 | mapy_stride, interpolation, border_type, border_value); | |
| 1108 | }; | ||
| 1109 | 400 | return parallel_batches(callback, mt, dst_height); | |
| 1110 | 408 | } | |
| 1111 | |||
| 1112 | 216 | kleidicv_error_t kleidicv_thread_warp_perspective_u8( | |
| 1113 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 1114 | uint8_t *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
| 1115 | const float transformation[9], size_t channels, | ||
| 1116 | kleidicv_interpolation_type_t interpolation, | ||
| 1117 | kleidicv_border_type_t border_type, const uint8_t *border_value, | ||
| 1118 | kleidicv_thread_multithreading mt) { | ||
| 1119 |
2/2✓ Branch 0 taken 16 times.
✓ Branch 1 taken 200 times.
|
216 | if (!kleidicv::warp_perspective_is_implemented<uint8_t>( |
| 1120 | 216 | dst_width, channels, interpolation, border_type)) { | |
| 1121 | 16 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 1122 | } | ||
| 1123 | |||
| 1124 | 488 | auto callback = [=](unsigned y_begin, unsigned y_end) { | |
| 1125 | 576 | return kleidicv_warp_perspective_stripe_u8( | |
| 1126 | 288 | src, src_stride, src_width, src_height, dst, dst_stride, dst_width, | |
| 1127 | 288 | dst_height, y_begin, std::min<size_t>(dst_height, y_end + 1), | |
| 1128 | 288 | transformation, channels, interpolation, border_type, border_value); | |
| 1129 | }; | ||
| 1130 | 200 | return parallel_batches(callback, mt, dst_height); | |
| 1131 | 216 | } | |
| 1132 |