| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2024 - 2026 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include <cstddef> | ||
| 6 | #include <cstdint> | ||
| 7 | #include <cstdlib> | ||
| 8 | #include <memory> | ||
| 9 | |||
| 10 | #include "kleidicv/config.h" | ||
| 11 | #include "kleidicv/ctypes.h" | ||
| 12 | #include "kleidicv/filters/scharr.h" | ||
| 13 | #include "kleidicv/neon.h" | ||
| 14 | #include "kleidicv/types.h" | ||
| 15 | #include "kleidicv/utils.h" | ||
| 16 | |||
| 17 | namespace kleidicv::neon { | ||
| 18 | |||
| 19 | // Scharr filtering in both horizontal and vertical directions. The horizontal | ||
| 20 | // and vertical derivative approximations are stored interleaved. | ||
| 21 | // | ||
| 22 | // The weights effectively applied for the horizontal approximation, since the | ||
| 23 | // kernel is mirrored both vertically and horizontally during the convolution: | ||
| 24 | // [ -3 0 3 ] [ 3 ] | ||
| 25 | // F = [ -10 0 10 ] = [ 10 ] * [ -1, 0, 1 ] | ||
| 26 | // [ -3 0 3 ] [ 3 ] | ||
| 27 | // | ||
| 28 | // The weights effectively applied for the vertical approximation, since the | ||
| 29 | // kernel is mirrored both vertically and horizontally during the convolution: | ||
| 30 | // [ -3 -10 -3 ] [ -1 ] | ||
| 31 | // F = [ 0 0 0 ] = [ 0 ] * [ 3, 10, 3 ] | ||
| 32 | // [ 3 10 3 ] [ 1 ] | ||
| 33 | // | ||
| 34 | class ScharrInterleaved { | ||
| 35 | using SourceType = uint8_t; | ||
| 36 | using SourceVecTraits = VecTraits<SourceType>; | ||
| 37 | using SourceVectorType = typename SourceVecTraits::VectorType; | ||
| 38 | using SourceVector2Type = typename SourceVecTraits::Vector2Type; | ||
| 39 | using BufferType = int16_t; | ||
| 40 | using BufferVecTraits = VecTraits<BufferType>; | ||
| 41 | using BufferVectorType = typename BufferVecTraits::VectorType; | ||
| 42 | using BufferVector2Type = typename BufferVecTraits::Vector2Type; | ||
| 43 | using BufferVector4Type = typename BufferVecTraits::Vector4Type; | ||
| 44 | using DestinationType = int16_t; | ||
| 45 | using DestinationVecTraits = VecTraits<DestinationType>; | ||
| 46 | using DestinationVectorType = typename DestinationVecTraits::VectorType; | ||
| 47 | using DestinationVector2Type = typename DestinationVecTraits::Vector2Type; | ||
| 48 | |||
| 49 | public: | ||
| 50 | 330 | ScharrInterleaved(Rows<BufferType> hori_deriv_buffer, | |
| 51 | Rows<BufferType> vert_deriv_buffer, size_t width) | ||
| 52 | 330 | : hori_deriv_buffer_(hori_deriv_buffer), | |
| 53 | 330 | vert_deriv_buffer_(vert_deriv_buffer), | |
| 54 | 330 | width_(width), | |
| 55 | 330 | const_3_s16_(vdupq_n_s16(3)), | |
| 56 | 330 | const_10_u8_(vdupq_n_u8(10)), | |
| 57 | 330 | const_10_s16_(vdupq_n_s16(10)) {} | |
| 58 | |||
| 59 | 330 | KLEIDICV_FORCE_INLINE void process(Rows<const SourceType> src_rows, | |
| 60 | Rows<DestinationType> dst_rows, | ||
| 61 | size_t y_begin, size_t y_end) { | ||
| 62 | 2/2 ✓ Branch 0 taken 330 times. ✓ Branch 1 taken 5760 times. | 6090 | for (size_t i = y_begin; i < y_end; ++i) { |
| 63 | 5760 | process_vertical(src_rows.at(static_cast<ptrdiff_t>(i))); | |
| 64 | 5760 | process_horizontal(dst_rows.at(static_cast<ptrdiff_t>(i))); | |
| 65 | 5760 | } | |
| 66 | 330 | } | |
| 67 | |||
| 68 | private: | ||
| 69 | KLEIDICV_FORCE_INLINE BufferVector4Type | ||
| 70 | 3772 | vertical_vector_path(SourceVectorType src[3]) { | |
| 71 | // Horizontal derivative approximation | ||
| 72 | 3772 | uint16x8_t hori_acc_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[2])); | |
| 73 | 3772 | uint16x8_t hori_acc_h = vaddl_high_u8(src[0], src[2]); | |
| 74 | |||
| 75 | 3772 | hori_acc_l = vmulq_u16(hori_acc_l, const_3_s16_); | |
| 76 | 3772 | hori_acc_h = vmulq_u16(hori_acc_h, const_3_s16_); | |
| 77 | |||
| 78 | 3772 | hori_acc_l = | |
| 79 | 3772 | vmlal_u8(hori_acc_l, vget_low_u8(src[1]), vget_low_u8(const_10_u8_)); | |
| 80 | 3772 | hori_acc_h = vmlal_high_u8(hori_acc_h, src[1], const_10_u8_); | |
| 81 | |||
| 82 | // Vertical derivative approximation | ||
| 83 | 3772 | uint16x8_t vert_acc_l = vsubl_u8(vget_low_u8(src[2]), vget_low_u8(src[0])); | |
| 84 | 3772 | uint16x8_t vert_acc_h = vsubl_high_u8(src[2], src[0]); | |
| 85 | |||
| 86 | 3772 | return { | |
| 87 | 11316 | vreinterpretq_s16_u16(hori_acc_l), vreinterpretq_s16_u16(hori_acc_h), | |
| 88 | 7544 | vreinterpretq_s16_u16(vert_acc_l), vreinterpretq_s16_u16(vert_acc_h)}; | |
| 89 | 3772 | } | |
| 90 | |||
| 91 | 5760 | KLEIDICV_FORCE_INLINE void process_vertical(Rows<const SourceType> src_rows) { | |
| 92 | 5760 | LoopUnroll2 loop{width_ * src_rows.channels(), kSourceVecNumLanes}; | |
| 93 | |||
| 94 | 6930 | loop.unroll_twice([&](ptrdiff_t index) { | |
| 95 | 1170 | SourceVector2Type src[3]; | |
| 96 | 1170 | SourceVecTraits::load(&src_rows.at(0)[index], src[0]); | |
| 97 | 1170 | SourceVecTraits::load(&src_rows.at(1)[index], src[1]); | |
| 98 | 1170 | SourceVecTraits::load(&src_rows.at(2)[index], src[2]); | |
| 99 | |||
| 100 | 1170 | SourceVectorType src_a[3] = {src[0].val[0], src[1].val[0], src[2].val[0]}; | |
| 101 | 1170 | BufferVector4Type res_a = vertical_vector_path(src_a); | |
| 102 | |||
| 103 | 1170 | SourceVectorType src_b[3] = {src[0].val[1], src[1].val[1], src[2].val[1]}; | |
| 104 | 1170 | BufferVector4Type res_b = vertical_vector_path(src_b); | |
| 105 | |||
| 106 | 2340 | BufferVector4Type hori_derivs = {res_a.val[0], res_a.val[1], res_b.val[0], | |
| 107 | 1170 | res_b.val[1]}; | |
| 108 | 2340 | BufferVector4Type vert_derivs = {res_a.val[2], res_a.val[3], res_b.val[2], | |
| 109 | 1170 | res_b.val[3]}; | |
| 110 | |||
| 111 | 1170 | BufferVecTraits::store(hori_derivs, &hori_deriv_buffer_[index]); | |
| 112 | 1170 | BufferVecTraits::store(vert_derivs, &vert_deriv_buffer_[index]); | |
| 113 | 1170 | }); | |
| 114 | |||
| 115 | 7192 | loop.unroll_once([&](ptrdiff_t index) { | |
| 116 | 1432 | SourceVectorType src[3]; | |
| 117 | 1432 | SourceVecTraits::load(&src_rows.at(0)[index], src[0]); | |
| 118 | 1432 | SourceVecTraits::load(&src_rows.at(1)[index], src[1]); | |
| 119 | 1432 | SourceVecTraits::load(&src_rows.at(2)[index], src[2]); | |
| 120 | |||
| 121 | 1432 | BufferVector4Type res = vertical_vector_path(src); | |
| 122 | |||
| 123 | 1432 | BufferVector2Type hori_pair = {res.val[0], res.val[1]}; | |
| 124 | 1432 | BufferVector2Type vert_pair = {res.val[2], res.val[3]}; | |
| 125 | |||
| 126 | 1432 | BufferVecTraits::store(hori_pair, &hori_deriv_buffer_[index]); | |
| 127 | 1432 | BufferVecTraits::store(vert_pair, &vert_deriv_buffer_[index]); | |
| 128 | 1432 | }); | |
| 129 | |||
| 130 | 46496 | loop.tail([&](ptrdiff_t index) { | |
| 131 | 40736 | hori_deriv_buffer_[index] = static_cast<BufferType>( | |
| 132 | 81472 | (src_rows.at(0)[index] + src_rows.at(2)[index]) * 3 + | |
| 133 | 40736 | src_rows.at(1)[index] * 10); | |
| 134 | |||
| 135 | 40736 | vert_deriv_buffer_[index] = static_cast<BufferType>( | |
| 136 | 40736 | src_rows.at(2)[index] - src_rows.at(0)[index]); | |
| 137 | 40736 | }); | |
| 138 | 5760 | } | |
| 139 | |||
| 140 | KLEIDICV_FORCE_INLINE DestinationVectorType | ||
| 141 | 7272 | horizontal_vector_path_hori_approx(BufferVectorType buff[2]) { | |
| 142 | 7272 | return vsubq_s16(buff[1], buff[0]); | |
| 143 | } | ||
| 144 | |||
| 145 | KLEIDICV_FORCE_INLINE DestinationVectorType | ||
| 146 | 7272 | horizontal_vector_path_vert_approx(BufferVectorType buff[3]) { | |
| 147 | 7272 | BufferVectorType a = vaddq_u16(buff[0], buff[2]); | |
| 148 | 7272 | a = vaddq_u16(a, vaddq_u16(a, a)); | |
| 149 | 14544 | return vmlaq_u16(a, buff[1], const_10_s16_); | |
| 150 | 7272 | } | |
| 151 | |||
| 152 | 5760 | KLEIDICV_FORCE_INLINE void process_horizontal( | |
| 153 | Rows<DestinationType> dst_rows) { | ||
| 154 | // The width is reduced by 2 because the result has fewer columns. | ||
| 155 | 5760 | LoopUnroll2 loop{(width_ - 2) * hori_deriv_buffer_.channels(), | |
| 156 | kBufferVecNumLanes}; | ||
| 157 | 11520 | const ptrdiff_t channel = | |
| 158 | 5760 | static_cast<ptrdiff_t>(hori_deriv_buffer_.channels()); | |
| 159 | |||
| 160 | 8678 | loop.unroll_twice([&](ptrdiff_t index) { | |
| 161 | 2918 | BufferVector2Type hori_buff[2]; | |
| 162 | 2918 | BufferVecTraits::load(&hori_deriv_buffer_[index], hori_buff[0]); | |
| 163 | 5836 | BufferVecTraits::load(&hori_deriv_buffer_[index + channel * 2], | |
| 164 | 2918 | hori_buff[1]); | |
| 165 | |||
| 166 | 5836 | BufferVectorType hori_buff_a[2] = {hori_buff[0].val[0], | |
| 167 | 2918 | hori_buff[1].val[0]}; | |
| 168 | 5836 | DestinationVectorType hori_approx_res_a = | |
| 169 | 2918 | horizontal_vector_path_hori_approx(hori_buff_a); | |
| 170 | |||
| 171 | 5836 | BufferVectorType hori_buff_b[2] = {hori_buff[0].val[1], | |
| 172 | 2918 | hori_buff[1].val[1]}; | |
| 173 | 5836 | DestinationVectorType hori_approx_res_b = | |
| 174 | 2918 | horizontal_vector_path_hori_approx(hori_buff_b); | |
| 175 | |||
| 176 | 2918 | BufferVector2Type vert_buff[3]; | |
| 177 | 2918 | BufferVecTraits::load(&vert_deriv_buffer_[index], vert_buff[0]); | |
| 178 | 2918 | BufferVecTraits::load(&vert_deriv_buffer_[index + channel], vert_buff[1]); | |
| 179 | 5836 | BufferVecTraits::load(&vert_deriv_buffer_[index + channel * 2], | |
| 180 | 2918 | vert_buff[2]); | |
| 181 | |||
| 182 | 11672 | BufferVectorType vert_buff_a[3] = { | |
| 183 | 8754 | vert_buff[0].val[0], vert_buff[1].val[0], vert_buff[2].val[0]}; | |
| 184 | 5836 | DestinationVectorType vert_approx_res_a = | |
| 185 | 2918 | horizontal_vector_path_vert_approx(vert_buff_a); | |
| 186 | |||
| 187 | 11672 | BufferVectorType vert_buff_b[3] = { | |
| 188 | 8754 | vert_buff[0].val[1], vert_buff[1].val[1], vert_buff[2].val[1]}; | |
| 189 | 5836 | DestinationVectorType vert_approx_res_b = | |
| 190 | 2918 | horizontal_vector_path_vert_approx(vert_buff_b); | |
| 191 | |||
| 192 | 2918 | vst2q(&dst_rows[index * 2], {hori_approx_res_a, vert_approx_res_a}); | |
| 193 | 5836 | vst2q(&dst_rows[(index + kBufferVecNumLanes) * 2], | |
| 194 | 2918 | {hori_approx_res_b, vert_approx_res_b}); | |
| 195 | 2918 | }); | |
| 196 | |||
| 197 | 7196 | loop.unroll_once([&](ptrdiff_t index) { | |
| 198 | 1436 | BufferVectorType hori_buff[2]; | |
| 199 | 1436 | BufferVecTraits::load(&hori_deriv_buffer_[index], hori_buff[0]); | |
| 200 | 2872 | BufferVecTraits::load(&hori_deriv_buffer_[index + channel * 2], | |
| 201 | 1436 | hori_buff[1]); | |
| 202 | 2872 | DestinationVectorType hori_approx_res = | |
| 203 | 1436 | horizontal_vector_path_hori_approx(hori_buff); | |
| 204 | |||
| 205 | 1436 | BufferVectorType vert_buff[3]; | |
| 206 | 1436 | BufferVecTraits::load(&vert_deriv_buffer_[index], vert_buff[0]); | |
| 207 | 1436 | BufferVecTraits::load(&vert_deriv_buffer_[index + channel], vert_buff[1]); | |
| 208 | 2872 | BufferVecTraits::load(&vert_deriv_buffer_[index + channel * 2], | |
| 209 | 1436 | vert_buff[2]); | |
| 210 | 2872 | DestinationVectorType vert_approx_res = | |
| 211 | 1436 | horizontal_vector_path_vert_approx(vert_buff); | |
| 212 | |||
| 213 | 1436 | vst2q(&dst_rows[index * 2], {hori_approx_res, vert_approx_res}); | |
| 214 | 1436 | }); | |
| 215 | |||
| 216 | 19960 | loop.tail([&](ptrdiff_t index) { | |
| 217 | 14200 | dst_rows[index * 2] = static_cast<DestinationType>( | |
| 218 | // For some reason clang-tidy thinks these accesses are invalid | ||
| 219 | // NOLINTBEGIN(clang-analyzer-core.uninitialized.Assign, | ||
| 220 | // clang-analyzer-core.UndefinedBinaryOperatorResult) | ||
| 221 | 14200 | hori_deriv_buffer_[index + channel * 2] - hori_deriv_buffer_[index]); | |
| 222 | // NOLINTEND(clang-analyzer-core.uninitialized.Assign, | ||
| 223 | // clang-analyzer-core.UndefinedBinaryOperatorResult) | ||
| 224 | |||
| 225 | 14200 | dst_rows[index * 2 + 1] = static_cast<DestinationType>( | |
| 226 | 42600 | (vert_deriv_buffer_[index] + | |
| 227 | 28400 | vert_deriv_buffer_[index + channel * 2]) * | |
| 228 | 14200 | 3 + | |
| 229 | 14200 | vert_deriv_buffer_[index + channel] * 10); | |
| 230 | 14200 | }); | |
| 231 | 5760 | } | |
| 232 | |||
| 233 | Rows<BufferType> hori_deriv_buffer_; | ||
| 234 | Rows<BufferType> vert_deriv_buffer_; | ||
| 235 | size_t width_; | ||
| 236 | int16x8_t const_3_s16_; | ||
| 237 | uint8x16_t const_10_u8_; | ||
| 238 | int16x8_t const_10_s16_; | ||
| 239 | |||
| 240 | static constexpr ptrdiff_t kSourceVecNumLanes = | ||
| 241 | static_cast<ptrdiff_t>(SourceVecTraits::num_lanes()); | ||
| 242 | static constexpr ptrdiff_t kBufferVecNumLanes = | ||
| 243 | static_cast<ptrdiff_t>(BufferVecTraits::num_lanes()); | ||
| 244 | }; // end of class ScharrInterleaved | ||
| 245 | |||
| 246 | class ScharrBufferDeleter { | ||
| 247 | public: | ||
| 248 | 330 | void operator()(void *ptr) const { std::free(ptr); } | |
| 249 | }; | ||
| 250 | |||
| 251 | KLEIDICV_TARGET_FN_ATTRS | ||
| 252 | 336 | kleidicv_error_t kleidicv_scharr_interleaved_stripe_s16_u8( | |
| 253 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 254 | size_t src_channels, int16_t *dst, size_t dst_stride, size_t y_begin, | ||
| 255 | size_t y_end) { | ||
| 256 | // Does not include checks for whether the operation is implemented. | ||
| 257 | // This must be done earlier, by scharr_interleaved_is_implemented. | ||
| 258 | 4/4 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 335 times. ✓ Branch 2 taken 1 times. ✓ Branch 3 taken 335 times. | 336 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
| 259 | 4/4 ✓ Branch 0 taken 2 times. ✓ Branch 1 taken 333 times. ✓ Branch 2 taken 2 times. ✓ Branch 3 taken 333 times. | 335 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, src_height); |
| 260 | 6/6 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 332 times. ✓ Branch 2 taken 1 times. ✓ Branch 3 taken 331 times. ✓ Branch 4 taken 2 times. ✓ Branch 5 taken 331 times. | 333 | CHECK_IMAGE_SIZE(src_width, src_height); |
| 261 | |||
| 262 | 331 | size_t buffer_stride = src_width * src_channels * sizeof(int16_t); | |
| 263 | // Buffer has two rows, one for the horizontal derivative approximation, one | ||
| 264 | // for the vertical one. | ||
| 265 | 331 | size_t buffer_height = 2; | |
| 266 | // Memory is allocated with malloc to avoid its initialization. | ||
| 267 | 331 | void *allocation = std::malloc(buffer_stride * buffer_height); | |
| 268 | |||
| 269 | 2/2 ✓ Branch 0 taken 330 times. ✓ Branch 1 taken 1 times. | 331 | if (!allocation) { |
| 270 | 1 | return KLEIDICV_ERROR_ALLOCATION; | |
| 271 | } | ||
| 272 | |||
| 273 | 660 | std::unique_ptr<int16_t, ScharrBufferDeleter> buffer( | |
| 274 | 330 | reinterpret_cast<int16_t *>(allocation)); | |
| 275 | |||
| 276 | 330 | Rows<const uint8_t> src_rows{src, src_stride, src_channels}; | |
| 277 | |||
| 278 | // The result is treated as if it had twice as many channels as the | ||
| 279 | // input. | ||
| 280 | 330 | Rows<int16_t> dst_rows{dst, dst_stride, src_channels * 2}; | |
| 281 | |||
| 282 | 330 | Rows<int16_t> hori_deriv_buffer{buffer.get(), buffer_stride, src_channels}; | |
| 283 | |||
| 284 | 660 | int16_t *vert_deriv_ptr = reinterpret_cast<int16_t *>( | |
| 285 | 330 | reinterpret_cast<uint8_t *>(buffer.get()) + buffer_stride); | |
| 286 | 330 | Rows<int16_t> vert_deriv_buffer{vert_deriv_ptr, buffer_stride, src_channels}; | |
| 287 | |||
| 288 | 660 | ScharrInterleaved(hori_deriv_buffer, vert_deriv_buffer, src_width) | |
| 289 | 330 | .process(src_rows, dst_rows, y_begin, y_end); | |
| 290 | |||
| 291 | 330 | return KLEIDICV_OK; | |
| 292 | 336 | } | |
| 293 | |||
| 294 | } // namespace kleidicv::neon | ||
| 295 |