KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/scharr_neon.cpp
Date: 2026-03-05 15:57:40
Exec Total Coverage
Lines: 153 153 100.0%
Functions: 15 15 100.0%
Branches: 18 18 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2026 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cstddef>
6 #include <cstdint>
7 #include <cstdlib>
8 #include <memory>
9
10 #include "kleidicv/config.h"
11 #include "kleidicv/ctypes.h"
12 #include "kleidicv/filters/scharr.h"
13 #include "kleidicv/neon.h"
14 #include "kleidicv/types.h"
15 #include "kleidicv/utils.h"
16
17 namespace kleidicv::neon {
18
19 // Scharr filtering in both horizontal and vertical directions, horizontal and
20 // vertical derivative approximations are stored interleaved.
21 //
22 // The applied weights for the horizontal approximation, as the kernel is
23 // mirrored both vertically and horizontally during the convolution:
24 //     [  -3  0   3 ]   [  3 ]
25 // F = [ -10  0  10 ] = [ 10 ] * [ -1, 0, 1 ]
26 //     [  -3  0   3 ]   [  3 ]
27 //
28 // The applied weights for the vertical approximation, as the kernel is mirrored
29 // both vertically and horizontally during the convolution:
30 //     [ -3 -10 -3 ]   [ -1 ]
31 // F = [  0   0  0 ] = [  0 ] * [ 3, 10, 3 ]
32 //     [  3  10  3 ]   [  1 ]
33 //
34 class ScharrInterleaved {
35 using SourceType = uint8_t;
36 using SourceVecTraits = VecTraits<SourceType>;
37 using SourceVectorType = typename SourceVecTraits::VectorType;
38 using SourceVector2Type = typename SourceVecTraits::Vector2Type;
39 using BufferType = int16_t;
40 using BufferVecTraits = VecTraits<BufferType>;
41 using BufferVectorType = typename BufferVecTraits::VectorType;
42 using BufferVector2Type = typename BufferVecTraits::Vector2Type;
43 using BufferVector4Type = typename BufferVecTraits::Vector4Type;
44 using DestinationType = int16_t;
45 using DestinationVecTraits = VecTraits<DestinationType>;
46 using DestinationVectorType = typename DestinationVecTraits::VectorType;
47 using DestinationVector2Type = typename DestinationVecTraits::Vector2Type;
48
49 public:
50 330 ScharrInterleaved(Rows<BufferType> hori_deriv_buffer,
51 Rows<BufferType> vert_deriv_buffer, size_t width)
52 330 : hori_deriv_buffer_(hori_deriv_buffer),
53 330 vert_deriv_buffer_(vert_deriv_buffer),
54 330 width_(width),
55 330 const_3_s16_(vdupq_n_s16(3)),
56 330 const_10_u8_(vdupq_n_u8(10)),
57 330 const_10_s16_(vdupq_n_s16(10)) {}
58
59 330 KLEIDICV_FORCE_INLINE void process(Rows<const SourceType> src_rows,
60 Rows<DestinationType> dst_rows,
61 size_t y_begin, size_t y_end) {
62
2/2
✓ Branch 0 taken 330 times.
✓ Branch 1 taken 5760 times.
6090 for (size_t i = y_begin; i < y_end; ++i) {
63 5760 process_vertical(src_rows.at(static_cast<ptrdiff_t>(i)));
64 5760 process_horizontal(dst_rows.at(static_cast<ptrdiff_t>(i)));
65 5760 }
66 330 }
67
68 private:
69 KLEIDICV_FORCE_INLINE BufferVector4Type
70 3772 vertical_vector_path(SourceVectorType src[3]) {
71 // Horizontal derivative approximation
72 3772 uint16x8_t hori_acc_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[2]));
73 3772 uint16x8_t hori_acc_h = vaddl_high_u8(src[0], src[2]);
74
75 3772 hori_acc_l = vmulq_u16(hori_acc_l, const_3_s16_);
76 3772 hori_acc_h = vmulq_u16(hori_acc_h, const_3_s16_);
77
78 3772 hori_acc_l =
79 3772 vmlal_u8(hori_acc_l, vget_low_u8(src[1]), vget_low_u8(const_10_u8_));
80 3772 hori_acc_h = vmlal_high_u8(hori_acc_h, src[1], const_10_u8_);
81
82 // Vertical derivative approximation
83 3772 uint16x8_t vert_acc_l = vsubl_u8(vget_low_u8(src[2]), vget_low_u8(src[0]));
84 3772 uint16x8_t vert_acc_h = vsubl_high_u8(src[2], src[0]);
85
86 3772 return {
87 11316 vreinterpretq_s16_u16(hori_acc_l), vreinterpretq_s16_u16(hori_acc_h),
88 7544 vreinterpretq_s16_u16(vert_acc_l), vreinterpretq_s16_u16(vert_acc_h)};
89 3772 }
90
91 5760 KLEIDICV_FORCE_INLINE void process_vertical(Rows<const SourceType> src_rows) {
92 5760 LoopUnroll2 loop{width_ * src_rows.channels(), kSourceVecNumLanes};
93
94 6930 loop.unroll_twice([&](ptrdiff_t index) {
95 1170 SourceVector2Type src[3];
96 1170 SourceVecTraits::load(&src_rows.at(0)[index], src[0]);
97 1170 SourceVecTraits::load(&src_rows.at(1)[index], src[1]);
98 1170 SourceVecTraits::load(&src_rows.at(2)[index], src[2]);
99
100 1170 SourceVectorType src_a[3] = {src[0].val[0], src[1].val[0], src[2].val[0]};
101 1170 BufferVector4Type res_a = vertical_vector_path(src_a);
102
103 1170 SourceVectorType src_b[3] = {src[0].val[1], src[1].val[1], src[2].val[1]};
104 1170 BufferVector4Type res_b = vertical_vector_path(src_b);
105
106 2340 BufferVector4Type hori_derivs = {res_a.val[0], res_a.val[1], res_b.val[0],
107 1170 res_b.val[1]};
108 2340 BufferVector4Type vert_derivs = {res_a.val[2], res_a.val[3], res_b.val[2],
109 1170 res_b.val[3]};
110
111 1170 BufferVecTraits::store(hori_derivs, &hori_deriv_buffer_[index]);
112 1170 BufferVecTraits::store(vert_derivs, &vert_deriv_buffer_[index]);
113 1170 });
114
115 7192 loop.unroll_once([&](ptrdiff_t index) {
116 1432 SourceVectorType src[3];
117 1432 SourceVecTraits::load(&src_rows.at(0)[index], src[0]);
118 1432 SourceVecTraits::load(&src_rows.at(1)[index], src[1]);
119 1432 SourceVecTraits::load(&src_rows.at(2)[index], src[2]);
120
121 1432 BufferVector4Type res = vertical_vector_path(src);
122
123 1432 BufferVector2Type hori_pair = {res.val[0], res.val[1]};
124 1432 BufferVector2Type vert_pair = {res.val[2], res.val[3]};
125
126 1432 BufferVecTraits::store(hori_pair, &hori_deriv_buffer_[index]);
127 1432 BufferVecTraits::store(vert_pair, &vert_deriv_buffer_[index]);
128 1432 });
129
130 46496 loop.tail([&](ptrdiff_t index) {
131 40736 hori_deriv_buffer_[index] = static_cast<BufferType>(
132 81472 (src_rows.at(0)[index] + src_rows.at(2)[index]) * 3 +
133 40736 src_rows.at(1)[index] * 10);
134
135 40736 vert_deriv_buffer_[index] = static_cast<BufferType>(
136 40736 src_rows.at(2)[index] - src_rows.at(0)[index]);
137 40736 });
138 5760 }
139
140 KLEIDICV_FORCE_INLINE DestinationVectorType
141 7272 horizontal_vector_path_hori_approx(BufferVectorType buff[2]) {
142 7272 return vsubq_s16(buff[1], buff[0]);
143 }
144
145 KLEIDICV_FORCE_INLINE DestinationVectorType
146 7272 horizontal_vector_path_vert_approx(BufferVectorType buff[3]) {
147 7272 BufferVectorType a = vaddq_u16(buff[0], buff[2]);
148 7272 a = vaddq_u16(a, vaddq_u16(a, a));
149 14544 return vmlaq_u16(a, buff[1], const_10_s16_);
150 7272 }
151
152 5760 KLEIDICV_FORCE_INLINE void process_horizontal(
153 Rows<DestinationType> dst_rows) {
154 // width is decremented by 2 as the result has fewer columns.
155 5760 LoopUnroll2 loop{(width_ - 2) * hori_deriv_buffer_.channels(),
156 kBufferVecNumLanes};
157 11520 const ptrdiff_t channel =
158 5760 static_cast<ptrdiff_t>(hori_deriv_buffer_.channels());
159
160 8678 loop.unroll_twice([&](ptrdiff_t index) {
161 2918 BufferVector2Type hori_buff[2];
162 2918 BufferVecTraits::load(&hori_deriv_buffer_[index], hori_buff[0]);
163 5836 BufferVecTraits::load(&hori_deriv_buffer_[index + channel * 2],
164 2918 hori_buff[1]);
165
166 5836 BufferVectorType hori_buff_a[2] = {hori_buff[0].val[0],
167 2918 hori_buff[1].val[0]};
168 5836 DestinationVectorType hori_approx_res_a =
169 2918 horizontal_vector_path_hori_approx(hori_buff_a);
170
171 5836 BufferVectorType hori_buff_b[2] = {hori_buff[0].val[1],
172 2918 hori_buff[1].val[1]};
173 5836 DestinationVectorType hori_approx_res_b =
174 2918 horizontal_vector_path_hori_approx(hori_buff_b);
175
176 2918 BufferVector2Type vert_buff[3];
177 2918 BufferVecTraits::load(&vert_deriv_buffer_[index], vert_buff[0]);
178 2918 BufferVecTraits::load(&vert_deriv_buffer_[index + channel], vert_buff[1]);
179 5836 BufferVecTraits::load(&vert_deriv_buffer_[index + channel * 2],
180 2918 vert_buff[2]);
181
182 11672 BufferVectorType vert_buff_a[3] = {
183 8754 vert_buff[0].val[0], vert_buff[1].val[0], vert_buff[2].val[0]};
184 5836 DestinationVectorType vert_approx_res_a =
185 2918 horizontal_vector_path_vert_approx(vert_buff_a);
186
187 11672 BufferVectorType vert_buff_b[3] = {
188 8754 vert_buff[0].val[1], vert_buff[1].val[1], vert_buff[2].val[1]};
189 5836 DestinationVectorType vert_approx_res_b =
190 2918 horizontal_vector_path_vert_approx(vert_buff_b);
191
192 2918 vst2q(&dst_rows[index * 2], {hori_approx_res_a, vert_approx_res_a});
193 5836 vst2q(&dst_rows[(index + kBufferVecNumLanes) * 2],
194 2918 {hori_approx_res_b, vert_approx_res_b});
195 2918 });
196
197 7196 loop.unroll_once([&](ptrdiff_t index) {
198 1436 BufferVectorType hori_buff[2];
199 1436 BufferVecTraits::load(&hori_deriv_buffer_[index], hori_buff[0]);
200 2872 BufferVecTraits::load(&hori_deriv_buffer_[index + channel * 2],
201 1436 hori_buff[1]);
202 2872 DestinationVectorType hori_approx_res =
203 1436 horizontal_vector_path_hori_approx(hori_buff);
204
205 1436 BufferVectorType vert_buff[3];
206 1436 BufferVecTraits::load(&vert_deriv_buffer_[index], vert_buff[0]);
207 1436 BufferVecTraits::load(&vert_deriv_buffer_[index + channel], vert_buff[1]);
208 2872 BufferVecTraits::load(&vert_deriv_buffer_[index + channel * 2],
209 1436 vert_buff[2]);
210 2872 DestinationVectorType vert_approx_res =
211 1436 horizontal_vector_path_vert_approx(vert_buff);
212
213 1436 vst2q(&dst_rows[index * 2], {hori_approx_res, vert_approx_res});
214 1436 });
215
216 19960 loop.tail([&](ptrdiff_t index) {
217 14200 dst_rows[index * 2] = static_cast<DestinationType>(
218 // For some reason clang-tidy thinks these accesses are invalid
219 // NOLINTBEGIN(clang-analyzer-core.uninitialized.Assign,
220 // clang-analyzer-core.UndefinedBinaryOperatorResult)
221 14200 hori_deriv_buffer_[index + channel * 2] - hori_deriv_buffer_[index]);
222 // NOLINTEND(clang-analyzer-core.uninitialized.Assign,
223 // clang-analyzer-core.UndefinedBinaryOperatorResult)
224
225 14200 dst_rows[index * 2 + 1] = static_cast<DestinationType>(
226 42600 (vert_deriv_buffer_[index] +
227 28400 vert_deriv_buffer_[index + channel * 2]) *
228 14200 3 +
229 14200 vert_deriv_buffer_[index + channel] * 10);
230 14200 });
231 5760 }
232
233 Rows<BufferType> hori_deriv_buffer_;
234 Rows<BufferType> vert_deriv_buffer_;
235 size_t width_;
236 int16x8_t const_3_s16_;
237 uint8x16_t const_10_u8_;
238 int16x8_t const_10_s16_;
239
240 static constexpr ptrdiff_t kSourceVecNumLanes =
241 static_cast<ptrdiff_t>(SourceVecTraits::num_lanes());
242 static constexpr ptrdiff_t kBufferVecNumLanes =
243 static_cast<ptrdiff_t>(BufferVecTraits::num_lanes());
244 }; // end of class ScharrInterleaved
245
246 class ScharrBufferDeleter {
247 public:
248 330 void operator()(void *ptr) const { std::free(ptr); }
249 };
250
251 KLEIDICV_TARGET_FN_ATTRS
252 336 kleidicv_error_t kleidicv_scharr_interleaved_stripe_s16_u8(
253 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
254 size_t src_channels, int16_t *dst, size_t dst_stride, size_t y_begin,
255 size_t y_end) {
256 // Does not include checks for whether the operation is implemented.
257 // This must be done earlier, by scharr_interleaved_is_implemented.
258
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 335 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 335 times.
336 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
259
4/4
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 333 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 333 times.
335 CHECK_POINTER_AND_STRIDE(dst, dst_stride, src_height);
260
6/6
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 332 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 331 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 331 times.
333 CHECK_IMAGE_SIZE(src_width, src_height);
261
262 331 size_t buffer_stride = src_width * src_channels * sizeof(int16_t);
263 // Buffer has two rows, one for the horizontal derivative approximation, one
264 // for the vertical one.
265 331 size_t buffer_height = 2;
266 // Memory is allocated with malloc to avoid its initialization.
267 331 void *allocation = std::malloc(buffer_stride * buffer_height);
268
269
2/2
✓ Branch 0 taken 330 times.
✓ Branch 1 taken 1 times.
331 if (!allocation) {
270 1 return KLEIDICV_ERROR_ALLOCATION;
271 }
272
273 660 std::unique_ptr<int16_t, ScharrBufferDeleter> buffer(
274 330 reinterpret_cast<int16_t *>(allocation));
275
276 330 Rows<const uint8_t> src_rows{src, src_stride, src_channels};
277
278 // The result is treated as if it had twice as many channels as the
279 // input.
280 330 Rows<int16_t> dst_rows{dst, dst_stride, src_channels * 2};
281
282 330 Rows<int16_t> hori_deriv_buffer{buffer.get(), buffer_stride, src_channels};
283
284 660 int16_t *vert_deriv_ptr = reinterpret_cast<int16_t *>(
285 330 reinterpret_cast<uint8_t *>(buffer.get()) + buffer_stride);
286 330 Rows<int16_t> vert_deriv_buffer{vert_deriv_ptr, buffer_stride, src_channels};
287
288 660 ScharrInterleaved(hori_deriv_buffer, vert_deriv_buffer, src_width)
289 330 .process(src_rows, dst_rows, y_begin, y_end);
290
291 330 return KLEIDICV_OK;
292 336 }
293
294 } // namespace kleidicv::neon
295