| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include <cstddef> | ||
| 6 | #include <cstdint> | ||
| 7 | #include <cstdlib> | ||
| 8 | #include <memory> | ||
| 9 | |||
| 10 | #include "kleidicv/config.h" | ||
| 11 | #include "kleidicv/ctypes.h" | ||
| 12 | #include "kleidicv/filters/scharr.h" | ||
| 13 | #include "kleidicv/neon.h" | ||
| 14 | #include "kleidicv/types.h" | ||
| 15 | #include "kleidicv/utils.h" | ||
| 16 | |||
| 17 | namespace kleidicv::neon { | ||
| 18 | |||
| 19 | // Scharr filtering in both horizontal and vertical directions; the horizontal | ||
| 20 | // and vertical derivative approximations are stored interleaved. | ||
| 21 | // | ||
| 22 | // The applied weights for the horizontal approximation, as the kernel is | ||
| 23 | // mirrored both vertically and horizontally during the convolution: | ||
| 24 | // [ -3 0 3 ] [ 3 ] | ||
| 25 | // F = [ -10 0 10 ] = [ 10 ] * [ -1, 0, 1 ] | ||
| 26 | // [ -3 0 3 ] [ 3 ] | ||
| 27 | // | ||
| 28 | // The applied weights for the vertical approximation, as the kernel is mirrored | ||
| 29 | // both vertically and horizontally during the convolution: | ||
| 30 | // [ -3 -10 -3 ] [ -1 ] | ||
| 31 | // F = [ 0 0 0 ] = [ 0 ] * [ 3, 10, 3 ] | ||
| 32 | // [ 3 10 3 ] [ 1 ] | ||
| 33 | // | ||
| 34 | class ScharrInterleaved { | ||
| 35 | using SourceType = uint8_t; | ||
| 36 | using SourceVecTraits = VecTraits<SourceType>; | ||
| 37 | using SourceVectorType = typename SourceVecTraits::VectorType; | ||
| 38 | using BufferType = int16_t; | ||
| 39 | using BufferVecTraits = VecTraits<BufferType>; | ||
| 40 | using BufferVectorType = typename BufferVecTraits::VectorType; | ||
| 41 | using BufferVector4Type = typename BufferVecTraits::Vector4Type; | ||
| 42 | using DestinationType = int16_t; | ||
| 43 | using DestinationVecTraits = VecTraits<DestinationType>; | ||
| 44 | using DestinationVectorType = typename DestinationVecTraits::VectorType; | ||
| 45 | |||
| 46 | public: | ||
| 47 | 67 | ScharrInterleaved(Rows<int16_t> hori_deriv_buffer, | |
| 48 | Rows<int16_t> vert_deriv_buffer, size_t width) | ||
| 49 | 67 | : hori_deriv_buffer_(hori_deriv_buffer), | |
| 50 | 67 | vert_deriv_buffer_(vert_deriv_buffer), | |
| 51 | 67 | width_(width), | |
| 52 | 67 | const_3_s16_(vdupq_n_s16(3)), | |
| 53 | 67 | const_10_u8_(vdupq_n_u8(10)), | |
| 54 | 67 | const_10_s16_(vdupq_n_s16(10)) {} | |
| 55 | |||
| 56 | 67 | void process(Rows<const uint8_t> src_rows, Rows<int16_t> dst_rows, | |
| 57 | size_t y_begin, size_t y_end) { | ||
| 58 |
2/2✓ Branch 0 taken 67 times.
✓ Branch 1 taken 1462 times.
|
1529 | for (size_t i = y_begin; i < y_end; ++i) { |
| 59 | 1462 | process_vertical(src_rows.at(static_cast<ptrdiff_t>(i))); | |
| 60 | 1462 | process_horizontal(dst_rows.at(static_cast<ptrdiff_t>(i))); | |
| 61 | 1462 | } | |
| 62 | 67 | } | |
| 63 | |||
| 64 | private: | ||
| 65 | 151 | BufferVector4Type vertical_vector_path(SourceVectorType src[3]) { | |
| 66 | // Horizontal derivative approximation | ||
| 67 | 151 | uint16x8_t hori_acc_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[2])); | |
| 68 | 302 | uint16x8_t hori_acc_h = | |
| 69 | 151 | vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[2])); | |
| 70 | |||
| 71 | 151 | hori_acc_l = vmulq_u16(hori_acc_l, const_3_s16_); | |
| 72 | 151 | hori_acc_h = vmulq_u16(hori_acc_h, const_3_s16_); | |
| 73 | |||
| 74 | 151 | hori_acc_l = | |
| 75 | 151 | vmlal_u8(hori_acc_l, vget_low_u8(src[1]), vget_low_u8(const_10_u8_)); | |
| 76 | 151 | hori_acc_h = vmlal_high_u8(hori_acc_h, src[1], const_10_u8_); | |
| 77 | |||
| 78 | // Vertical derivative approximation | ||
| 79 | 151 | uint16x8_t vert_acc_l = vsubl_u8(vget_low_u8(src[2]), vget_low_u8(src[0])); | |
| 80 | 302 | uint16x8_t vert_acc_h = | |
| 81 | 151 | vsubl_u8(vget_high_u8(src[2]), vget_high_u8(src[0])); | |
| 82 | |||
| 83 | 151 | return { | |
| 84 | 453 | vreinterpretq_s16_u16(hori_acc_l), vreinterpretq_s16_u16(hori_acc_h), | |
| 85 | 302 | vreinterpretq_s16_u16(vert_acc_l), vreinterpretq_s16_u16(vert_acc_h)}; | |
| 86 | 151 | } | |
| 87 | |||
| 88 | 1462 | void process_vertical(Rows<const uint8_t> src_rows) { | |
| 89 | 1462 | LoopUnroll2 loop{width_ * src_rows.channels(), kSourceVecNumLanes}; | |
| 90 | |||
| 91 | 1613 | loop.unroll_once([&](ptrdiff_t index) { | |
| 92 | 151 | SourceVectorType src[3]; | |
| 93 | 151 | src[0] = vld1q(&src_rows.at(0)[index]); | |
| 94 | 151 | src[1] = vld1q(&src_rows.at(1)[index]); | |
| 95 | 151 | src[2] = vld1q(&src_rows.at(2)[index]); | |
| 96 | |||
| 97 | 151 | BufferVector4Type res = vertical_vector_path(src); | |
| 98 | |||
| 99 | 151 | vst1q(&hori_deriv_buffer_[index], res.val[0]); | |
| 100 | 151 | vst1q(&hori_deriv_buffer_[index + kBufferVecNumLanes], res.val[1]); | |
| 101 | 151 | vst1q(&vert_deriv_buffer_[index], res.val[2]); | |
| 102 | 151 | vst1q(&vert_deriv_buffer_[index + kBufferVecNumLanes], res.val[3]); | |
| 103 | 151 | }); | |
| 104 | |||
| 105 | 9405 | loop.tail([&](ptrdiff_t index) { | |
| 106 | 7943 | hori_deriv_buffer_[index] = static_cast<BufferType>( | |
| 107 | 15886 | (src_rows.at(0)[index] + src_rows.at(2)[index]) * 3 + | |
| 108 | 7943 | src_rows.at(1)[index] * 10); | |
| 109 | |||
| 110 | 7943 | vert_deriv_buffer_[index] = static_cast<BufferType>( | |
| 111 | 7943 | src_rows.at(2)[index] - src_rows.at(0)[index]); | |
| 112 | 7943 | }); | |
| 113 | 1462 | } | |
| 114 | |||
| 115 | 500 | DestinationVectorType horizontal_vector_path_hori_approx( | |
| 116 | BufferVectorType buff[2]) { | ||
| 117 | 500 | return vsubq_s16(buff[1], buff[0]); | |
| 118 | } | ||
| 119 | |||
| 120 | 500 | DestinationVectorType horizontal_vector_path_vert_approx( | |
| 121 | BufferVectorType buff[3]) { | ||
| 122 | 500 | BufferVectorType a = vaddq_u16(buff[0], buff[2]); | |
| 123 | 500 | a = vaddq_u16(a, vaddq_u16(a, a)); | |
| 124 | 1000 | return vmlaq_u16(a, buff[1], const_10_s16_); | |
| 125 | 500 | } | |
| 126 | |||
| 127 | 1462 | void process_horizontal(Rows<int16_t> dst_rows) { | |
| 128 | // The width is reduced by 2 because the result has two fewer columns. | ||
| 129 | 1462 | LoopUnroll2 loop{(width_ - 2) * hori_deriv_buffer_.channels(), | |
| 130 | kBufferVecNumLanes}; | ||
| 131 | |||
| 132 | 1962 | loop.unroll_once([&](ptrdiff_t index) { | |
| 133 | 500 | BufferVectorType hori_buff[2]; | |
| 134 | 500 | hori_buff[0] = vld1q(&hori_deriv_buffer_[index]); | |
| 135 | 500 | hori_buff[1] = vld1q(&hori_deriv_buffer_[index + 2]); | |
| 136 | 1000 | DestinationVectorType hori_approx_res = | |
| 137 | 500 | horizontal_vector_path_hori_approx(hori_buff); | |
| 138 | |||
| 139 | 500 | BufferVectorType vert_buff[3]; | |
| 140 | 500 | vert_buff[0] = vld1q(&vert_deriv_buffer_[index]); | |
| 141 | 500 | vert_buff[1] = vld1q(&vert_deriv_buffer_[index + 1]); | |
| 142 | 500 | vert_buff[2] = vld1q(&vert_deriv_buffer_[index + 2]); | |
| 143 | 1000 | DestinationVectorType vert_approx_res = | |
| 144 | 500 | horizontal_vector_path_vert_approx(vert_buff); | |
| 145 | |||
| 146 | 1000 | vst1q(&dst_rows.at(0, index)[0], | |
| 147 | 500 | vzip1q_s16(hori_approx_res, vert_approx_res)); | |
| 148 | 1000 | vst1q(&dst_rows.at(0, index)[DestinationVecTraits::num_lanes()], | |
| 149 | 500 | vzip2q_s16(hori_approx_res, vert_approx_res)); | |
| 150 | 500 | }); | |
| 151 | |||
| 152 | 4897 | loop.tail([&](ptrdiff_t index) { | |
| 153 | 3435 | dst_rows.at(0, index)[0] = static_cast<DestinationType>( | |
| 154 | // For some reason clang-tidy thinks these accesses are invalid | ||
| 155 | // NOLINTBEGIN(clang-analyzer-core.uninitialized.Assign, | ||
| 156 | // clang-analyzer-core.UndefinedBinaryOperatorResult) | ||
| 157 | 3435 | hori_deriv_buffer_[index + 2] - hori_deriv_buffer_[index]); | |
| 158 | // NOLINTEND(clang-analyzer-core.uninitialized.Assign, | ||
| 159 | // clang-analyzer-core.UndefinedBinaryOperatorResult) | ||
| 160 | |||
| 161 | 3435 | dst_rows.at(0, index)[1] = static_cast<DestinationType>( | |
| 162 | 6870 | (vert_deriv_buffer_[index] + vert_deriv_buffer_[index + 2]) * 3 + | |
| 163 | 3435 | vert_deriv_buffer_[index + 1] * 10); | |
| 164 | 3435 | }); | |
| 165 | 1462 | } | |
| 166 | |||
| 167 | Rows<int16_t> hori_deriv_buffer_; | ||
| 168 | Rows<int16_t> vert_deriv_buffer_; | ||
| 169 | size_t width_; | ||
| 170 | int16x8_t const_3_s16_; | ||
| 171 | uint8x16_t const_10_u8_; | ||
| 172 | int16x8_t const_10_s16_; | ||
| 173 | |||
| 174 | static constexpr ptrdiff_t kSourceVecNumLanes = | ||
| 175 | static_cast<ptrdiff_t>(SourceVecTraits::num_lanes()); | ||
| 176 | static constexpr ptrdiff_t kBufferVecNumLanes = | ||
| 177 | static_cast<ptrdiff_t>(BufferVecTraits::num_lanes()); | ||
| 178 | }; // end of class ScharrInterleaved | ||
| 179 | |||
| 180 | class ScharrBufferDeleter { | ||
| 181 | public: | ||
| 182 | 67 | void operator()(void *ptr) const { std::free(ptr); } | |
| 183 | }; | ||
| 184 | |||
| 185 | KLEIDICV_TARGET_FN_ATTRS | ||
| 186 | 73 | kleidicv_error_t kleidicv_scharr_interleaved_stripe_s16_u8( | |
| 187 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 188 | size_t src_channels, int16_t *dst, size_t dst_stride, size_t y_begin, | ||
| 189 | size_t y_end) { | ||
| 190 | // Does not include checks for whether the operation is implemented. | ||
| 191 | // This must be done earlier, by scharr_interleaved_is_implemented. | ||
| 192 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 72 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 72 times.
|
73 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
| 193 |
4/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 70 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 70 times.
|
72 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, src_height); |
| 194 |
6/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 69 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 68 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 68 times.
|
70 | CHECK_IMAGE_SIZE(src_width, src_height); |
| 195 | |||
| 196 | 68 | size_t buffer_stride = src_width * src_channels * sizeof(int16_t); | |
| 197 | // Buffer has two rows, one for the horizontal derivative approximation, one | ||
| 198 | // for the vertical one. | ||
| 199 | 68 | size_t buffer_height = 2; | |
| 200 | // Memory is allocated with malloc to avoid its initialization. | ||
| 201 | 68 | void *allocation = std::malloc(buffer_stride * buffer_height); | |
| 202 | |||
| 203 |
2/2✓ Branch 0 taken 67 times.
✓ Branch 1 taken 1 times.
|
68 | if (!allocation) { |
| 204 | 1 | return KLEIDICV_ERROR_ALLOCATION; | |
| 205 | } | ||
| 206 | |||
| 207 | 134 | std::unique_ptr<int16_t, ScharrBufferDeleter> buffer( | |
| 208 | 67 | reinterpret_cast<int16_t *>(allocation)); | |
| 209 | |||
| 210 | 67 | Rows<const uint8_t> src_rows{src, src_stride, src_channels}; | |
| 211 | |||
| 212 | // The result is treated as if it had twice as many channels as the | ||
| 213 | // input. | ||
| 214 | 67 | Rows<int16_t> dst_rows{dst, dst_stride, src_channels * 2}; | |
| 215 | |||
| 216 | 67 | Rows<int16_t> hori_deriv_buffer{buffer.get(), buffer_stride, src_channels}; | |
| 217 | |||
| 218 | 134 | int16_t *vert_deriv_ptr = reinterpret_cast<int16_t *>( | |
| 219 | 67 | reinterpret_cast<uint8_t *>(buffer.get()) + buffer_stride); | |
| 220 | 67 | Rows<int16_t> vert_deriv_buffer{vert_deriv_ptr, buffer_stride, src_channels}; | |
| 221 | |||
| 222 | 134 | ScharrInterleaved(hori_deriv_buffer, vert_deriv_buffer, src_width) | |
| 223 | 67 | .process(src_rows, dst_rows, y_begin, y_end); | |
| 224 | |||
| 225 | 67 | return KLEIDICV_OK; | |
| 226 | 73 | } | |
| 227 | |||
| 228 | } // namespace kleidicv::neon | ||
| 229 |