KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/separable_filter_2d_neon.cpp
Date: 2026-03-05 15:57:40
Exec Total Coverage
Lines: 142 142 100.0%
Functions: 12 12 100.0%
Branches: 88 88 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2026 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <limits>
6
7 #include "kleidicv/ctypes.h"
8 #include "kleidicv/filters/separable_filter_2d.h"
9 #include "kleidicv/filters/separable_filter_5x5_neon.h"
10 #include "kleidicv/kleidicv.h"
11 #include "kleidicv/neon.h"
12 #include "kleidicv/workspace/separable.h"
13
14 namespace kleidicv::neon {
15
16 template <typename ScalarType, size_t KernelSize>
17 class SeparableFilter2D;
18
19 template <>
20 class SeparableFilter2D<uint8_t, 5> {
21 public:
22 using SourceType = uint8_t;
23 using SourceVectorType = typename VecTraits<SourceType>::VectorType;
24 using BufferType = uint16_t;
25 using BufferVectorType = typename VecTraits<BufferType>::VectorType;
26 using DestinationType = uint8_t;
27
28 // Ignored because vectors are initialized in the constructor body.
29 // NOLINTNEXTLINE - hicpp-member-init
30 65 SeparableFilter2D(const SourceType *kernel_x, const SourceType *kernel_y)
31 65 : kernel_x_(kernel_x), kernel_y_(kernel_y) {
32
2/2
✓ Branch 0 taken 65 times.
✓ Branch 1 taken 325 times.
390 for (size_t i = 0; i < 5; i++) {
33 325 kernel_x_u16_[i] = vdupq_n_u16(kernel_x[i]);
34 325 kernel_y_u8_[i] = vdupq_n_u8(kernel_y[i]);
35 325 }
36 65 }
37
38 300 void vertical_vector_path(SourceVectorType src[5], BufferType *dst) const {
39 600 BufferVectorType acc_l =
40 300 vmull_u8(vget_low_u8(src[0]), vget_low_u8(kernel_y_u8_[0]));
41 300 BufferVectorType acc_h = vmull_high_u8(src[0], kernel_y_u8_[0]);
42
43 // Optimization to avoid unnecessary branching in vector code.
44 KLEIDICV_FORCE_LOOP_UNROLL
45
2/2
✓ Branch 0 taken 300 times.
✓ Branch 1 taken 1200 times.
1500 for (size_t i = 1; i < 5; i++) {
46 2400 BufferVectorType vec_l =
47 1200 vmull_u8(vget_low_u8(src[i]), vget_low_u8(kernel_y_u8_[i]));
48 1200 BufferVectorType vec_h = vmull_high_u8(src[i], kernel_y_u8_[i]);
49
50 1200 acc_l = vqaddq_u16(acc_l, vec_l);
51 1200 acc_h = vqaddq_u16(acc_h, vec_h);
52 1200 }
53
54 300 vst1q_u16(&dst[0], acc_l);
55 300 vst1q_u16(&dst[8], acc_h);
56 300 }
57
58 4026 void vertical_scalar_path(const SourceType src[5], BufferType *dst) const {
59 4026 BufferType acc = static_cast<BufferType>(src[0]) * kernel_y_[0];
60
4/4
✓ Branch 0 taken 15875 times.
✓ Branch 1 taken 3914 times.
✓ Branch 2 taken 112 times.
✓ Branch 3 taken 3914 times.
19901 for (size_t i = 1; i < 5; i++) {
61 15875 BufferType temp = static_cast<BufferType>(src[i]) * kernel_y_[i];
62
2/2
✓ Branch 0 taken 112 times.
✓ Branch 1 taken 15763 times.
15875 if (__builtin_add_overflow(acc, temp, &acc)) {
63 112 dst[0] = std::numeric_limits<SourceType>::max();
64 112 return;
65 }
66 15875 }
67
68 3914 dst[0] = acc;
69 4026 }
70
71 628 void horizontal_vector_path(BufferVectorType src[5],
72 DestinationType *dst) const {
73 1256 uint32x4_t acc_l =
74 628 vmull_u16(vget_low_u16(src[0]), vget_low_u16(kernel_x_u16_[0]));
75 628 uint32x4_t acc_h = vmull_high_u16(src[0], kernel_x_u16_[0]);
76
77 // Optimization to avoid unnecessary branching in vector code.
78 KLEIDICV_FORCE_LOOP_UNROLL
79
2/2
✓ Branch 0 taken 628 times.
✓ Branch 1 taken 2512 times.
3140 for (size_t i = 1; i < 5; i++) {
80 5024 acc_l = vmlal_u16(acc_l, vget_low_u16(src[i]),
81 2512 vget_low_u16(kernel_x_u16_[i]));
82 2512 acc_h = vmlal_high_u16(acc_h, src[i], kernel_x_u16_[i]);
83 2512 }
84
85 628 uint16x8_t acc_u16 = vcombine_u16(vqmovn_u32(acc_l), vqmovn_u32(acc_h));
86 628 uint8x8_t result = vqmovn_u16(acc_u16);
87 628 vst1_u8(&dst[0], result);
88 628 }
89
90 3166 void horizontal_scalar_path(const BufferType src[5],
91 DestinationType *dst) const {
92 3166 SourceType acc; // NOLINT
93
2/2
✓ Branch 0 taken 2385 times.
✓ Branch 1 taken 781 times.
3166 if (__builtin_mul_overflow(src[0], kernel_x_[0], &acc)) {
94 2385 dst[0] = std::numeric_limits<SourceType>::max();
95 2385 return;
96 }
97
98
4/4
✓ Branch 0 taken 2934 times.
✓ Branch 1 taken 375 times.
✓ Branch 2 taken 406 times.
✓ Branch 3 taken 375 times.
3715 for (size_t i = 1; i < 5; i++) {
99 2934 SourceType temp; // NOLINT
100
2/2
✓ Branch 0 taken 20 times.
✓ Branch 1 taken 2914 times.
2934 if (__builtin_mul_overflow(src[i], kernel_x_[i], &temp)) {
101 20 dst[0] = std::numeric_limits<SourceType>::max();
102 20 return;
103 }
104
2/2
✓ Branch 0 taken 386 times.
✓ Branch 1 taken 2528 times.
2914 if (__builtin_add_overflow(acc, temp, &acc)) {
105 386 dst[0] = std::numeric_limits<SourceType>::max();
106 386 return;
107 }
108 2934 }
109
110 375 dst[0] = acc;
111 3166 }
112
113 private:
114 const SourceType *kernel_x_;
115 const SourceType *kernel_y_;
116
117 BufferVectorType kernel_x_u16_[5];
118 SourceVectorType kernel_y_u8_[5];
119 }; // end of class SeparableFilter2D<uint8_t, 5>
120
121 template <>
122 class SeparableFilter2D<uint16_t, 5> {
123 public:
124 using SourceType = uint16_t;
125 using SourceVectorType = typename VecTraits<SourceType>::VectorType;
126 using BufferType = uint32_t;
127 using BufferVectorType = typename VecTraits<BufferType>::VectorType;
128 using DestinationType = uint16_t;
129
130 // Ignored because vectors are initialized in the constructor body.
131 // NOLINTNEXTLINE - hicpp-member-init
132 65 SeparableFilter2D(const SourceType *kernel_x, const SourceType *kernel_y)
133 65 : kernel_x_(kernel_x), kernel_y_(kernel_y) {
134
2/2
✓ Branch 0 taken 65 times.
✓ Branch 1 taken 325 times.
390 for (size_t i = 0; i < 5; i++) {
135 325 kernel_x_u32_[i] = vdupq_n_u32(kernel_x[i]);
136 325 kernel_y_u16_[i] = vdupq_n_u16(kernel_y[i]);
137 325 }
138 65 }
139
140 857 void vertical_vector_path(SourceVectorType src[5], BufferType *dst) const {
141 1714 BufferVectorType acc_l =
142 857 vmull_u16(vget_low_u16(src[0]), vget_low_u16(kernel_y_u16_[0]));
143 857 BufferVectorType acc_h = vmull_high_u16(src[0], kernel_y_u16_[0]);
144
145 // Optimization to avoid unnecessary branching in vector code.
146 KLEIDICV_FORCE_LOOP_UNROLL
147
2/2
✓ Branch 0 taken 857 times.
✓ Branch 1 taken 3428 times.
4285 for (size_t i = 1; i < 5; i++) {
148 6856 BufferVectorType vec_l =
149 3428 vmull_u16(vget_low_u16(src[i]), vget_low_u16(kernel_y_u16_[i]));
150 3428 BufferVectorType vec_h = vmull_high_u16(src[i], kernel_y_u16_[i]);
151
152 3428 acc_l = vqaddq_u32(acc_l, vec_l);
153 3428 acc_h = vqaddq_u32(acc_h, vec_h);
154 3428 }
155
156 857 vst1q_u32(&dst[0], acc_l);
157 857 vst1q_u32(&dst[4], acc_h);
158 857 }
159
160 1844 void vertical_scalar_path(const SourceType src[5], BufferType *dst) const {
161 1844 BufferType acc = static_cast<BufferType>(src[0]) * kernel_y_[0];
162
4/4
✓ Branch 0 taken 7208 times.
✓ Branch 1 taken 1788 times.
✓ Branch 2 taken 56 times.
✓ Branch 3 taken 1788 times.
9052 for (size_t i = 1; i < 5; i++) {
163 7208 BufferType temp = static_cast<BufferType>(src[i]) * kernel_y_[i];
164
2/2
✓ Branch 0 taken 56 times.
✓ Branch 1 taken 7152 times.
7208 if (__builtin_add_overflow(acc, temp, &acc)) {
165 56 dst[0] = std::numeric_limits<SourceType>::max();
166 56 return;
167 }
168 7208 }
169
170 1788 dst[0] = acc;
171 1844 }
172
173 1152 void horizontal_vector_path(BufferVectorType src[5],
174 DestinationType *dst) const {
175 2304 uint64x2_t acc_l =
176 1152 vmull_u32(vget_low_u32(src[0]), vget_low_u32(kernel_x_u32_[0]));
177 1152 uint64x2_t acc_h = vmull_high_u32(src[0], kernel_x_u32_[0]);
178
179 // Optimization to avoid unnecessary branching in vector code.
180 KLEIDICV_FORCE_LOOP_UNROLL
181
2/2
✓ Branch 0 taken 1152 times.
✓ Branch 1 taken 4608 times.
5760 for (size_t i = 1; i < 5; i++) {
182 9216 acc_l = vmlal_u32(acc_l, vget_low_u32(src[i]),
183 4608 vget_low_u32(kernel_x_u32_[i]));
184 4608 acc_h = vmlal_high_u32(acc_h, src[i], kernel_x_u32_[i]);
185 4608 }
186
187 1152 uint32x4_t acc_u32 = vcombine_u32(vqmovn_u64(acc_l), vqmovn_u64(acc_h));
188 1152 uint16x4_t result = vqmovn_u32(acc_u32);
189 1152 vst1_u16(&dst[0], result);
190 1152 }
191
192 3264 void horizontal_scalar_path(const BufferType src[5],
193 DestinationType *dst) const {
194 3264 SourceType acc; // Avoid cppcoreguidelines-init-variables. NOLINT
195
2/2
✓ Branch 0 taken 2449 times.
✓ Branch 1 taken 815 times.
3264 if (__builtin_mul_overflow(src[0], kernel_x_[0], &acc)) {
196 2449 dst[0] = std::numeric_limits<SourceType>::max();
197 2449 return;
198 }
199
200
4/4
✓ Branch 0 taken 3249 times.
✓ Branch 1 taken 785 times.
✓ Branch 2 taken 30 times.
✓ Branch 3 taken 785 times.
4064 for (size_t i = 1; i < 5; i++) {
201 3249 SourceType temp; // Avoid cppcoreguidelines-init-variables. NOLINT
202
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 3246 times.
3249 if (__builtin_mul_overflow(src[i], kernel_x_[i], &temp)) {
203 3 dst[0] = std::numeric_limits<SourceType>::max();
204 3 return;
205 }
206
2/2
✓ Branch 0 taken 27 times.
✓ Branch 1 taken 3219 times.
3246 if (__builtin_add_overflow(acc, temp, &acc)) {
207 27 dst[0] = std::numeric_limits<SourceType>::max();
208 27 return;
209 }
210 3249 }
211
212 785 dst[0] = acc;
213 3264 }
214
215 private:
216 const SourceType *kernel_x_;
217 const SourceType *kernel_y_;
218
219 BufferVectorType kernel_x_u32_[5];
220 SourceVectorType kernel_y_u16_[5];
221 }; // end of class SeparableFilter2D<uint16_t, 5>
222
223 template <typename T>
224 146 kleidicv_error_t separable_filter_2d_stripe(
225 const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width,
226 size_t height, size_t y_begin, size_t y_end, size_t channels,
227 const T *kernel_x, size_t /*kernel_width*/, const T *kernel_y,
228 size_t /*kernel_height*/, FixedBorderType fixed_border_type) {
229
8/8
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 72 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 72 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 72 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 72 times.
146 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
230
8/8
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 71 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 71 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 71 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 71 times.
144 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
231
12/12
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 70 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 69 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 69 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 70 times.
✓ Branch 8 taken 1 times.
✓ Branch 9 taken 69 times.
✓ Branch 10 taken 2 times.
✓ Branch 11 taken 69 times.
142 CHECK_IMAGE_SIZE(width, height);
232
4/4
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 67 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 67 times.
138 CHECK_POINTERS(kernel_x, kernel_y);
233
234
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 66 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 66 times.
134 if (channels > KLEIDICV_MAXIMUM_CHANNEL_COUNT) {
235 2 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
236 }
237
238 132 Rectangle rect{width, height};
239
240 using SeparableFilterClass = SeparableFilter2D<T, 5>;
241 132 constexpr size_t intermediate_size{
242 sizeof(typename SeparableFilterClass::BufferType)};
243
244 132 auto workspace_variant =
245 132 SeparableFilterWorkspace::create(rect, channels, intermediate_size);
246
8/8
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 65 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 65 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 65 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 65 times.
134 if (auto *err = std::get_if<kleidicv_error_t>(&workspace_variant)) {
247 2 return *err;
248 }
249 130 auto &workspace = *std::get_if<SeparableFilterWorkspace>(&workspace_variant);
250
251 130 SeparableFilterClass filterClass{kernel_x, kernel_y};
252 130 SeparableFilter<SeparableFilterClass, 5> filter{filterClass};
253
254 130 Rows<const T> src_rows{src, src_stride, channels};
255 130 Rows<T> dst_rows{dst, dst_stride, channels};
256 260 workspace.process(y_begin, y_end, src_rows, dst_rows, fixed_border_type,
257 130 filter);
258
259 130 return KLEIDICV_OK;
260 146 }
261
262 #define KLEIDICV_INSTANTIATE_TEMPLATE(type) \
263 template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t \
264 separable_filter_2d_stripe<type>( \
265 const type *src, size_t src_stride, type *dst, size_t dst_stride, \
266 size_t width, size_t height, size_t y_begin, size_t y_end, \
267 size_t channels, const type *kernel_x, size_t kernel_width, \
268 const type *kernel_y, size_t kernel_height, FixedBorderType border_type)
269
270 KLEIDICV_INSTANTIATE_TEMPLATE(uint8_t);
271 KLEIDICV_INSTANTIATE_TEMPLATE(uint16_t);
272
273 } // namespace kleidicv::neon
274