KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/scharr_neon.cpp
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 106 106 100.0%
Functions: 13 13 100.0%
Branches: 18 18 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cstddef>
6 #include <cstdint>
7 #include <cstdlib>
8 #include <memory>
9
10 #include "kleidicv/config.h"
11 #include "kleidicv/ctypes.h"
12 #include "kleidicv/filters/scharr.h"
13 #include "kleidicv/neon.h"
14 #include "kleidicv/types.h"
15 #include "kleidicv/utils.h"
16
17 namespace kleidicv::neon {
18
19 // Scharr filtering in both horizontal and vertical directions. The horizontal
20 // and vertical derivative approximations are stored interleaved.
21 //
22 // The applied weights for the horizontal approximation, as the kernel is
23 // mirrored both vertically and horizontally during the convolution:
24 //     [  -3  0   3 ]   [  3 ]
25 // F = [ -10  0  10 ] = [ 10 ] * [ -1, 0, 1 ]
26 //     [  -3  0   3 ]   [  3 ]
27 //
28 // The applied weights for the vertical approximation, as the kernel is mirrored
29 // both vertically and horizontally during the convolution:
30 //     [ -3 -10 -3 ]   [ -1 ]
31 // F = [  0   0  0 ] = [  0 ] * [ 3, 10, 3 ]
32 //     [  3  10  3 ]   [  1 ]
33 //
34 class ScharrInterleaved {
// Element types used by the filter: 8-bit unsigned source pixels, 16-bit
// signed intermediate buffer values and 16-bit signed destination values.
35 using SourceType = uint8_t;
36 using SourceVecTraits = VecTraits<SourceType>;
37 using SourceVectorType = typename SourceVecTraits::VectorType;
38 using BufferType = int16_t;
39 using BufferVecTraits = VecTraits<BufferType>;
40 using BufferVectorType = typename BufferVecTraits::VectorType;
41 using BufferVector4Type = typename BufferVecTraits::Vector4Type;
42 using DestinationType = int16_t;
43 using DestinationVecTraits = VecTraits<DestinationType>;
44 using DestinationVectorType = typename DestinationVecTraits::VectorType;
45
46 public:
// Binds the filter to its two intermediate row buffers and the source width.
//
// hori_deriv_buffer: written by process_vertical with
//                    3 * (row0 + row2) + 10 * row1; read by process_horizontal
//                    to form the horizontal derivative.
// vert_deriv_buffer: written by process_vertical with row2 - row0; read by
//                    process_horizontal to form the vertical derivative.
// width:             source width; multiplied by the channel count to obtain
//                    the number of elements handled per row.
//
// The constants 3 and 10 are broadcast into vectors once here so that the
// vector paths can reuse them.
47 67 ScharrInterleaved(Rows<int16_t> hori_deriv_buffer,
48 Rows<int16_t> vert_deriv_buffer, size_t width)
49 67 : hori_deriv_buffer_(hori_deriv_buffer),
50 67 vert_deriv_buffer_(vert_deriv_buffer),
51 67 width_(width),
52 67 const_3_s16_(vdupq_n_s16(3)),
53 67 const_10_u8_(vdupq_n_u8(10)),
54 67 const_10_s16_(vdupq_n_s16(10)) {}
55
// Filters the rows with index y_begin (inclusive) up to y_end (exclusive).
// For every index i the vertical pass fills the intermediate buffers from
// src_rows.at(i) (which in turn reads its rows 0, 1 and 2), and the horizontal
// pass then writes the interleaved result through dst_rows.at(i).
// NOTE(review): no bounds are checked here; the caller presumably guarantees
// that rows i .. i + 2 of the source exist for every processed i - confirm.
56 67 void process(Rows<const uint8_t> src_rows, Rows<int16_t> dst_rows,
57 size_t y_begin, size_t y_end) {
58
2/2
✓ Branch 0 taken 67 times.
✓ Branch 1 taken 1462 times.
1529 for (size_t i = y_begin; i < y_end; ++i) {
59 1462 process_vertical(src_rows.at(static_cast<ptrdiff_t>(i)));
60 1462 process_horizontal(dst_rows.at(static_cast<ptrdiff_t>(i)));
61 1462 }
62 67 }
63
64 private:
// Vertical pass for one vector of source pixels.
//
// src[0], src[1] and src[2] hold the same columns taken from three rows.
// Returns four 16-bit vectors:
//   val[0], val[1]: low and high halves of 3 * (src[0] + src[2]) + 10 * src[1]
//   val[2], val[3]: low and high halves of src[2] - src[0]
65 151 BufferVector4Type vertical_vector_path(SourceVectorType src[3]) {
66 // Horizontal derivative approximation
// Widening add: src[0] + src[2] as 16-bit lanes.
67 151 uint16x8_t hori_acc_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[2]));
68 302 uint16x8_t hori_acc_h =
69 151 vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[2]));
70
// Multiply the sum by 3.
// NOTE(review): const_3_s16_ is declared as int16x8_t but is passed to the
// unsigned vmulq_u16; this appears to rely on the compiler permitting implicit
// conversions between integer vector types - confirm against the build flags.
71 151 hori_acc_l = vmulq_u16(hori_acc_l, const_3_s16_);
72 151 hori_acc_h = vmulq_u16(hori_acc_h, const_3_s16_);
73
// Widening multiply-accumulate: add 10 * src[1]. The largest possible value is
// 3 * (255 + 255) + 10 * 255 = 4080, so the result also fits in int16_t.
74 151 hori_acc_l =
75 151 vmlal_u8(hori_acc_l, vget_low_u8(src[1]), vget_low_u8(const_10_u8_));
76 151 hori_acc_h = vmlal_high_u8(hori_acc_h, src[1], const_10_u8_);
77
78 // Vertical derivative approximation
// Widening subtract: src[2] - src[0]. Negative differences wrap in the
// unsigned 16-bit lanes; the reinterpret to signed below recovers them.
79 151 uint16x8_t vert_acc_l = vsubl_u8(vget_low_u8(src[2]), vget_low_u8(src[0]));
80 302 uint16x8_t vert_acc_h =
81 151 vsubl_u8(vget_high_u8(src[2]), vget_high_u8(src[0]));
82
83 151 return {
84 453 vreinterpretq_s16_u16(hori_acc_l), vreinterpretq_s16_u16(hori_acc_h),
85 302 vreinterpretq_s16_u16(vert_acc_l), vreinterpretq_s16_u16(vert_acc_h)};
86 151 }
87
// Vertical pass over one output row: reads rows 0, 1 and 2 of src_rows and
// fills hori_deriv_buffer_ and vert_deriv_buffer_ with
// width_ * src_rows.channels() elements each.
88 1462 void process_vertical(Rows<const uint8_t> src_rows) {
89 1462 LoopUnroll2 loop{width_ * src_rows.channels(), kSourceVecNumLanes};
90
// Vector path: handles kSourceVecNumLanes source elements starting at index.
// NOTE(review): presumably LoopUnroll2 invokes this once per full vector step
// and hands the leftover elements to tail() - see LoopUnroll2.
91 1613 loop.unroll_once([&](ptrdiff_t index) {
92 151 SourceVectorType src[3];
93 151 src[0] = vld1q(&src_rows.at(0)[index]);
94 151 src[1] = vld1q(&src_rows.at(1)[index]);
95 151 src[2] = vld1q(&src_rows.at(2)[index]);
96
97 151 BufferVector4Type res = vertical_vector_path(src);
98
// One source vector widens into two buffer vectors per derivative, so the
// high half is stored kBufferVecNumLanes elements after the low half.
99 151 vst1q(&hori_deriv_buffer_[index], res.val[0]);
100 151 vst1q(&hori_deriv_buffer_[index + kBufferVecNumLanes], res.val[1]);
101 151 vst1q(&vert_deriv_buffer_[index], res.val[2]);
102 151 vst1q(&vert_deriv_buffer_[index + kBufferVecNumLanes], res.val[3]);
103 151 });
104
// Scalar path: the same arithmetic as vertical_vector_path for the single
// element at index.
105 9405 loop.tail([&](ptrdiff_t index) {
106 7943 hori_deriv_buffer_[index] = static_cast<BufferType>(
107 15886 (src_rows.at(0)[index] + src_rows.at(2)[index]) * 3 +
108 7943 src_rows.at(1)[index] * 10);
109
110 7943 vert_deriv_buffer_[index] = static_cast<BufferType>(
111 7943 src_rows.at(2)[index] - src_rows.at(0)[index]);
112 7943 });
113 1462 }
114
// Horizontal pass, horizontal derivative: returns buff[1] - buff[0]. The caller
// loads buff[1] two elements to the right of buff[0], which applies the
// [ -1, 0, 1 ] row weights.
115 500 DestinationVectorType horizontal_vector_path_hori_approx(
116 BufferVectorType buff[2]) {
117 500 return vsubq_s16(buff[1], buff[0]);
118 }
119
// Horizontal pass, vertical derivative: returns
// 3 * (buff[0] + buff[2]) + 10 * buff[1], i.e. the [ 3, 10, 3 ] row weights.
// The multiplication by 3 is computed as a + (a + a).
// NOTE(review): unsigned 16-bit intrinsics are applied to signed 16-bit
// vectors here (and const_10_s16_ is signed); this appears to rely on the
// compiler permitting implicit conversions between integer vector types -
// confirm against the build flags.
120 500 DestinationVectorType horizontal_vector_path_vert_approx(
121 BufferVectorType buff[3]) {
122 500 BufferVectorType a = vaddq_u16(buff[0], buff[2]);
123 500 a = vaddq_u16(a, vaddq_u16(a, a));
124 1000 return vmlaq_u16(a, buff[1], const_10_s16_);
125 500 }
126
// Horizontal pass over one output row: combines neighbouring elements of the
// two intermediate buffers and writes the horizontal and vertical derivative
// approximations interleaved through dst_rows.
127 1462 void process_horizontal(Rows<int16_t> dst_rows) {
128 // The width is decremented by 2 because the result has two fewer columns.
129 1462 LoopUnroll2 loop{(width_ - 2) * hori_deriv_buffer_.channels(),
130 kBufferVecNumLanes};
131
// Vector path: produces kBufferVecNumLanes results of each derivative starting
// at index.
132 1962 loop.unroll_once([&](ptrdiff_t index) {
133 500 BufferVectorType hori_buff[2];
134 500 hori_buff[0] = vld1q(&hori_deriv_buffer_[index]);
135 500 hori_buff[1] = vld1q(&hori_deriv_buffer_[index + 2]);
136 1000 DestinationVectorType hori_approx_res =
137 500 horizontal_vector_path_hori_approx(hori_buff);
138
139 500 BufferVectorType vert_buff[3];
140 500 vert_buff[0] = vld1q(&vert_deriv_buffer_[index]);
141 500 vert_buff[1] = vld1q(&vert_deriv_buffer_[index + 1]);
142 500 vert_buff[2] = vld1q(&vert_deriv_buffer_[index + 2]);
143 1000 DestinationVectorType vert_approx_res =
144 500 horizontal_vector_path_vert_approx(vert_buff);
145
// Zip the two result vectors so that each horizontal value is immediately
// followed by the matching vertical value; the zipped low halves are stored
// first and the zipped high halves one destination vector later.
146 1000 vst1q(&dst_rows.at(0, index)[0],
147 500 vzip1q_s16(hori_approx_res, vert_approx_res));
148 1000 vst1q(&dst_rows.at(0, index)[DestinationVecTraits::num_lanes()],
149 500 vzip2q_s16(hori_approx_res, vert_approx_res));
150 500 });
151
// Scalar path: the same arithmetic for the single element at index; slot [0]
// receives the horizontal value and slot [1] the vertical value.
152 4897 loop.tail([&](ptrdiff_t index) {
153 3435 dst_rows.at(0, index)[0] = static_cast<DestinationType>(
154 // For some reason clang-tidy thinks these accesses are invalid
155 // NOLINTBEGIN(clang-analyzer-core.uninitialized.Assign,
156 // clang-analyzer-core.UndefinedBinaryOperatorResult)
157 3435 hori_deriv_buffer_[index + 2] - hori_deriv_buffer_[index]);
158 // NOLINTEND(clang-analyzer-core.uninitialized.Assign,
159 // clang-analyzer-core.UndefinedBinaryOperatorResult)
160
161 3435 dst_rows.at(0, index)[1] = static_cast<DestinationType>(
162 6870 (vert_deriv_buffer_[index] + vert_deriv_buffer_[index + 2]) * 3 +
163 3435 vert_deriv_buffer_[index + 1] * 10);
164 3435 });
165 1462 }
166
// Intermediate rows written by process_vertical and read by
// process_horizontal.
167 Rows<int16_t> hori_deriv_buffer_;
168 Rows<int16_t> vert_deriv_buffer_;
// Source width as passed to the constructor.
169 size_t width_;
// Broadcast constants used by the vector paths.
170 int16x8_t const_3_s16_;
171 uint8x16_t const_10_u8_;
172 int16x8_t const_10_s16_;
173
// Lane counts of the source and buffer vector types, as ptrdiff_t so they can
// be added to loop indices directly.
174 static constexpr ptrdiff_t kSourceVecNumLanes =
175 static_cast<ptrdiff_t>(SourceVecTraits::num_lanes());
176 static constexpr ptrdiff_t kBufferVecNumLanes =
177 static_cast<ptrdiff_t>(BufferVecTraits::num_lanes());
178 }; // end of class ScharrInterleaved
179
// Deleter functor that releases a pointer with std::free. It is used as the
// deleter of the std::unique_ptr that owns the malloc-allocated intermediate
// buffer.
180 class ScharrBufferDeleter {
181 public:
182 67 void operator()(void *ptr) const { std::free(ptr); }
183 };
184
// Applies the interleaved Scharr filter to the rows [y_begin, y_end) of an
// 8-bit source image and writes 16-bit signed results.
//
// src, src_stride:       source pixels and row stride (presumably in bytes -
//                        confirm against Rows).
// src_width, src_height: source dimensions; validated by CHECK_IMAGE_SIZE.
// src_channels:          channel count of the source.
// dst, dst_stride:       destination and its row stride; the destination is
//                        treated as having twice the source channel count.
// y_begin, y_end:        row range forwarded to ScharrInterleaved::process.
//
// Returns KLEIDICV_ERROR_ALLOCATION if the intermediate buffer cannot be
// allocated and KLEIDICV_OK once the rows have been processed.
// NOTE(review): the CHECK_* macros presumably return an error code early when
// validation fails - see their definitions.
185 KLEIDICV_TARGET_FN_ATTRS
186 73 kleidicv_error_t kleidicv_scharr_interleaved_stripe_s16_u8(
187 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
188 size_t src_channels, int16_t *dst, size_t dst_stride, size_t y_begin,
189 size_t y_end) {
190 // Does not include checks for whether the operation is implemented.
191 // This must be done earlier, by scharr_interleaved_is_implemented.
192
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 72 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 72 times.
73 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
193
4/4
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 70 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 70 times.
72 CHECK_POINTER_AND_STRIDE(dst, dst_stride, src_height);
194
6/6
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 69 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 68 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 68 times.
70 CHECK_IMAGE_SIZE(src_width, src_height);
195
// Size in bytes of one intermediate row: one int16_t per source element.
196 68 size_t buffer_stride = src_width * src_channels * sizeof(int16_t);
197 // Buffer has two rows, one for the horizontal derivative approximation, one
198 // for the vertical one.
199 68 size_t buffer_height = 2;
200 // Memory is allocated with malloc to avoid its initialization.
201 68 void *allocation = std::malloc(buffer_stride * buffer_height);
202
203
2/2
✓ Branch 0 taken 67 times.
✓ Branch 1 taken 1 times.
68 if (!allocation) {
204 1 return KLEIDICV_ERROR_ALLOCATION;
205 }
206
// Hand ownership to a unique_ptr so the buffer is freed on every return path.
207 134 std::unique_ptr<int16_t, ScharrBufferDeleter> buffer(
208 67 reinterpret_cast<int16_t *>(allocation));
209
210 67 Rows<const uint8_t> src_rows{src, src_stride, src_channels};
211
212 // The result is treated as if it had double the channel count of the
213 // input.
214 67 Rows<int16_t> dst_rows{dst, dst_stride, src_channels * 2};
215
// The first row of the allocation holds the horizontal-derivative data.
216 67 Rows<int16_t> hori_deriv_buffer{buffer.get(), buffer_stride, src_channels};
217
// The second row starts buffer_stride bytes into the allocation; the offset is
// applied through a byte pointer because buffer_stride is a byte count.
218 134 int16_t *vert_deriv_ptr = reinterpret_cast<int16_t *>(
219 67 reinterpret_cast<uint8_t *>(buffer.get()) + buffer_stride);
220 67 Rows<int16_t> vert_deriv_buffer{vert_deriv_ptr, buffer_stride, src_channels};
221
222 134 ScharrInterleaved(hori_deriv_buffer, vert_deriv_buffer, src_width)
223 67 .process(src_rows, dst_rows, y_begin, y_end);
224
225 67 return KLEIDICV_OK;
226 73 }
227
228 } // namespace kleidicv::neon
229