Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #include <cstddef> | ||
6 | #include <cstdint> | ||
7 | #include <cstdlib> | ||
8 | #include <memory> | ||
9 | |||
10 | #include "kleidicv/config.h" | ||
11 | #include "kleidicv/ctypes.h" | ||
12 | #include "kleidicv/filters/scharr.h" | ||
13 | #include "kleidicv/neon.h" | ||
14 | #include "kleidicv/types.h" | ||
15 | #include "kleidicv/utils.h" | ||
16 | |||
17 | namespace kleidicv::neon { | ||
18 | |||
19 | // Scharr filtering in both horizontal and vertical directions, horizontal and | ||
20 | // vertical derivative approximations are stored interleaved. | ||
21 | // | ||
22 | // The applied weights for the horizontal approximation, as the kernel is | ||
23 | // mirrored both vertically and horizontally during the convolution: | ||
24 | // [ -3 0 3 ] [ 3 ] | ||
25 | // F = [ -10 0 10 ] = [ 10 ] * [ -1, 0, 1 ] | ||
26 | // [ -3 0 3 ] [ 3 ] | ||
27 | // | ||
28 | // The applied weights for the vertical approximation, as the kernel is mirrored | ||
29 | // both vertically and horizontally during the convolution: | ||
30 | // [ -3 -10 -3 ] [ -1 ] | ||
31 | // F = [ 0, 0, 0 ] = [ 0 ] * [ 3, 10, 3 ] | ||
32 | // [ 3 10 3 ] [ 1 ] | ||
33 | // | ||
34 | class ScharrInterleaved { | ||
35 | using SourceType = uint8_t; | ||
36 | using SourceVecTraits = VecTraits<SourceType>; | ||
37 | using SourceVectorType = typename SourceVecTraits::VectorType; | ||
38 | using BufferType = int16_t; | ||
39 | using BufferVecTraits = VecTraits<BufferType>; | ||
40 | using BufferVectorType = typename BufferVecTraits::VectorType; | ||
41 | using BufferVector4Type = typename BufferVecTraits::Vector4Type; | ||
42 | using DestinationType = int16_t; | ||
43 | using DestinationVecTraits = VecTraits<DestinationType>; | ||
44 | using DestinationVectorType = typename DestinationVecTraits::VectorType; | ||
45 | |||
46 | public: | ||
47 | 67 | ScharrInterleaved(Rows<int16_t> hori_deriv_buffer, | |
48 | Rows<int16_t> vert_deriv_buffer, size_t width) | ||
49 | 67 | : hori_deriv_buffer_(hori_deriv_buffer), | |
50 | 67 | vert_deriv_buffer_(vert_deriv_buffer), | |
51 | 67 | width_(width), | |
52 | 67 | const_3_s16_(vdupq_n_s16(3)), | |
53 | 67 | const_10_u8_(vdupq_n_u8(10)), | |
54 | 67 | const_10_s16_(vdupq_n_s16(10)) {} | |
55 | |||
56 | 67 | void process(Rows<const uint8_t> src_rows, Rows<int16_t> dst_rows, | |
57 | size_t y_begin, size_t y_end) { | ||
58 |
2/2✓ Branch 0 taken 67 times.
✓ Branch 1 taken 1462 times.
|
1529 | for (size_t i = y_begin; i < y_end; ++i) { |
59 | 1462 | process_vertical(src_rows.at(static_cast<ptrdiff_t>(i))); | |
60 | 1462 | process_horizontal(dst_rows.at(static_cast<ptrdiff_t>(i))); | |
61 | 1462 | } | |
62 | 67 | } | |
63 | |||
64 | private: | ||
65 | 151 | BufferVector4Type vertical_vector_path(SourceVectorType src[3]) { | |
66 | // Horizontal derivative approximation | ||
67 | 151 | uint16x8_t hori_acc_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[2])); | |
68 | 302 | uint16x8_t hori_acc_h = | |
69 | 151 | vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[2])); | |
70 | |||
71 | 151 | hori_acc_l = vmulq_u16(hori_acc_l, const_3_s16_); | |
72 | 151 | hori_acc_h = vmulq_u16(hori_acc_h, const_3_s16_); | |
73 | |||
74 | 151 | hori_acc_l = | |
75 | 151 | vmlal_u8(hori_acc_l, vget_low_u8(src[1]), vget_low_u8(const_10_u8_)); | |
76 | 151 | hori_acc_h = vmlal_high_u8(hori_acc_h, src[1], const_10_u8_); | |
77 | |||
78 | // Vertical derivative approximation | ||
79 | 151 | uint16x8_t vert_acc_l = vsubl_u8(vget_low_u8(src[2]), vget_low_u8(src[0])); | |
80 | 302 | uint16x8_t vert_acc_h = | |
81 | 151 | vsubl_u8(vget_high_u8(src[2]), vget_high_u8(src[0])); | |
82 | |||
83 | 151 | return { | |
84 | 453 | vreinterpretq_s16_u16(hori_acc_l), vreinterpretq_s16_u16(hori_acc_h), | |
85 | 302 | vreinterpretq_s16_u16(vert_acc_l), vreinterpretq_s16_u16(vert_acc_h)}; | |
86 | 151 | } | |
87 | |||
88 | 1462 | void process_vertical(Rows<const uint8_t> src_rows) { | |
89 | 1462 | LoopUnroll2 loop{width_ * src_rows.channels(), kSourceVecNumLanes}; | |
90 | |||
91 | 1613 | loop.unroll_once([&](ptrdiff_t index) { | |
92 | 151 | SourceVectorType src[3]; | |
93 | 151 | src[0] = vld1q(&src_rows.at(0)[index]); | |
94 | 151 | src[1] = vld1q(&src_rows.at(1)[index]); | |
95 | 151 | src[2] = vld1q(&src_rows.at(2)[index]); | |
96 | |||
97 | 151 | BufferVector4Type res = vertical_vector_path(src); | |
98 | |||
99 | 151 | vst1q(&hori_deriv_buffer_[index], res.val[0]); | |
100 | 151 | vst1q(&hori_deriv_buffer_[index + kBufferVecNumLanes], res.val[1]); | |
101 | 151 | vst1q(&vert_deriv_buffer_[index], res.val[2]); | |
102 | 151 | vst1q(&vert_deriv_buffer_[index + kBufferVecNumLanes], res.val[3]); | |
103 | 151 | }); | |
104 | |||
105 | 9405 | loop.tail([&](ptrdiff_t index) { | |
106 | 7943 | hori_deriv_buffer_[index] = static_cast<BufferType>( | |
107 | 15886 | (src_rows.at(0)[index] + src_rows.at(2)[index]) * 3 + | |
108 | 7943 | src_rows.at(1)[index] * 10); | |
109 | |||
110 | 7943 | vert_deriv_buffer_[index] = static_cast<BufferType>( | |
111 | 7943 | src_rows.at(2)[index] - src_rows.at(0)[index]); | |
112 | 7943 | }); | |
113 | 1462 | } | |
114 | |||
115 | 500 | DestinationVectorType horizontal_vector_path_hori_approx( | |
116 | BufferVectorType buff[2]) { | ||
117 | 500 | return vsubq_s16(buff[1], buff[0]); | |
118 | } | ||
119 | |||
120 | 500 | DestinationVectorType horizontal_vector_path_vert_approx( | |
121 | BufferVectorType buff[3]) { | ||
122 | 500 | BufferVectorType a = vaddq_u16(buff[0], buff[2]); | |
123 | 500 | a = vaddq_u16(a, vaddq_u16(a, a)); | |
124 | 1000 | return vmlaq_u16(a, buff[1], const_10_s16_); | |
125 | 500 | } | |
126 | |||
127 | 1462 | void process_horizontal(Rows<int16_t> dst_rows) { | |
128 | // width is decremented by 2 as the result has less columns. | ||
129 | 1462 | LoopUnroll2 loop{(width_ - 2) * hori_deriv_buffer_.channels(), | |
130 | kBufferVecNumLanes}; | ||
131 | |||
132 | 1962 | loop.unroll_once([&](ptrdiff_t index) { | |
133 | 500 | BufferVectorType hori_buff[2]; | |
134 | 500 | hori_buff[0] = vld1q(&hori_deriv_buffer_[index]); | |
135 | 500 | hori_buff[1] = vld1q(&hori_deriv_buffer_[index + 2]); | |
136 | 1000 | DestinationVectorType hori_approx_res = | |
137 | 500 | horizontal_vector_path_hori_approx(hori_buff); | |
138 | |||
139 | 500 | BufferVectorType vert_buff[3]; | |
140 | 500 | vert_buff[0] = vld1q(&vert_deriv_buffer_[index]); | |
141 | 500 | vert_buff[1] = vld1q(&vert_deriv_buffer_[index + 1]); | |
142 | 500 | vert_buff[2] = vld1q(&vert_deriv_buffer_[index + 2]); | |
143 | 1000 | DestinationVectorType vert_approx_res = | |
144 | 500 | horizontal_vector_path_vert_approx(vert_buff); | |
145 | |||
146 | 1000 | vst1q(&dst_rows.at(0, index)[0], | |
147 | 500 | vzip1q_s16(hori_approx_res, vert_approx_res)); | |
148 | 1000 | vst1q(&dst_rows.at(0, index)[DestinationVecTraits::num_lanes()], | |
149 | 500 | vzip2q_s16(hori_approx_res, vert_approx_res)); | |
150 | 500 | }); | |
151 | |||
152 | 4897 | loop.tail([&](ptrdiff_t index) { | |
153 | 3435 | dst_rows.at(0, index)[0] = static_cast<DestinationType>( | |
154 | // For some reason clang-tidy thinks these accesses are invalid | ||
155 | // NOLINTBEGIN(clang-analyzer-core.uninitialized.Assign, | ||
156 | // clang-analyzer-core.UndefinedBinaryOperatorResult) | ||
157 | 3435 | hori_deriv_buffer_[index + 2] - hori_deriv_buffer_[index]); | |
158 | // NOLINTEND(clang-analyzer-core.uninitialized.Assign, | ||
159 | // clang-analyzer-core.UndefinedBinaryOperatorResult) | ||
160 | |||
161 | 3435 | dst_rows.at(0, index)[1] = static_cast<DestinationType>( | |
162 | 6870 | (vert_deriv_buffer_[index] + vert_deriv_buffer_[index + 2]) * 3 + | |
163 | 3435 | vert_deriv_buffer_[index + 1] * 10); | |
164 | 3435 | }); | |
165 | 1462 | } | |
166 | |||
167 | Rows<int16_t> hori_deriv_buffer_; | ||
168 | Rows<int16_t> vert_deriv_buffer_; | ||
169 | size_t width_; | ||
170 | int16x8_t const_3_s16_; | ||
171 | uint8x16_t const_10_u8_; | ||
172 | int16x8_t const_10_s16_; | ||
173 | |||
174 | static constexpr ptrdiff_t kSourceVecNumLanes = | ||
175 | static_cast<ptrdiff_t>(SourceVecTraits::num_lanes()); | ||
176 | static constexpr ptrdiff_t kBufferVecNumLanes = | ||
177 | static_cast<ptrdiff_t>(BufferVecTraits::num_lanes()); | ||
178 | }; // end of class ScharrInterleaved | ||
179 | |||
// Deleter functor that releases a buffer with std::free, so that memory
// obtained from std::malloc can be owned by a std::unique_ptr.
class ScharrBufferDeleter {
 public:
  // Frees `ptr`; passing a null pointer is a no-op for std::free.
  void operator()(void *ptr) const {
    std::free(ptr);
  }
};
184 | |||
185 | KLEIDICV_TARGET_FN_ATTRS | ||
186 | 73 | kleidicv_error_t kleidicv_scharr_interleaved_stripe_s16_u8( | |
187 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
188 | size_t src_channels, int16_t *dst, size_t dst_stride, size_t y_begin, | ||
189 | size_t y_end) { | ||
190 | // Does not include checks for whether the operation is implemented. | ||
191 | // This must be done earlier, by scharr_interleaved_is_implemented. | ||
192 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 72 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 72 times.
|
73 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
193 |
4/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 70 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 70 times.
|
72 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, src_height); |
194 |
6/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 69 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 68 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 68 times.
|
70 | CHECK_IMAGE_SIZE(src_width, src_height); |
195 | |||
196 | 68 | size_t buffer_stride = src_width * src_channels * sizeof(int16_t); | |
197 | // Buffer has two rows, one for the horizontal derivative approximation, one | ||
198 | // for the vertical one. | ||
199 | 68 | size_t buffer_height = 2; | |
200 | // Memory is allocated with malloc to avoid its initialization. | ||
201 | 68 | void *allocation = std::malloc(buffer_stride * buffer_height); | |
202 | |||
203 |
2/2✓ Branch 0 taken 67 times.
✓ Branch 1 taken 1 times.
|
68 | if (!allocation) { |
204 | 1 | return KLEIDICV_ERROR_ALLOCATION; | |
205 | } | ||
206 | |||
207 | 134 | std::unique_ptr<int16_t, ScharrBufferDeleter> buffer( | |
208 | 67 | reinterpret_cast<int16_t *>(allocation)); | |
209 | |||
210 | 67 | Rows<const uint8_t> src_rows{src, src_stride, src_channels}; | |
211 | |||
212 | // Result is treated as it has double the channel number compared to the | ||
213 | // input. | ||
214 | 67 | Rows<int16_t> dst_rows{dst, dst_stride, src_channels * 2}; | |
215 | |||
216 | 67 | Rows<int16_t> hori_deriv_buffer{buffer.get(), buffer_stride, src_channels}; | |
217 | |||
218 | 134 | int16_t *vert_deriv_ptr = reinterpret_cast<int16_t *>( | |
219 | 67 | reinterpret_cast<uint8_t *>(buffer.get()) + buffer_stride); | |
220 | 67 | Rows<int16_t> vert_deriv_buffer{vert_deriv_ptr, buffer_stride, src_channels}; | |
221 | |||
222 | 134 | ScharrInterleaved(hori_deriv_buffer, vert_deriv_buffer, src_width) | |
223 | 67 | .process(src_rows, dst_rows, y_begin, y_end); | |
224 | |||
225 | 67 | return KLEIDICV_OK; | |
226 | 73 | } | |
227 | |||
228 | } // namespace kleidicv::neon | ||
229 |