KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/separable_filter_2d_sc.h
Date: 2026-03-05 15:57:40
Exec Total Coverage
Lines: 171 171 100.0%
Functions: 20 20 100.0%
Branches: 64 64 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2026 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_SEPARABLE_FILTER_2D_SC_H
6 #define KLEIDICV_SEPARABLE_FILTER_2D_SC_H
7
8 #include <limits>
9
10 #include "kleidicv/filters/separable_filter_5x5_sc.h"
11 #include "kleidicv/kleidicv.h"
12 #include "kleidicv/sve2.h"
13 #include "kleidicv/workspace/separable.h"
14
15 namespace KLEIDICV_TARGET_NAMESPACE {
16
17 template <typename ScalarType, size_t KernelSize>
18 class SeparableFilter2D;
19
20 template <>
21 class SeparableFilter2D<uint8_t, 5> {
22 public:
23 using SourceType = uint8_t;
24 using SourceVectorType = typename VecTraits<SourceType>::VectorType;
25 using BufferType = uint16_t;
26 using BufferVectorType = typename VecTraits<BufferType>::VectorType;
27 using BufferDoubleVectorType = typename VecTraits<BufferType>::Vector2Type;
28 using DestinationType = uint8_t;
29
30 195 SeparableFilter2D(
31 const SourceType *kernel_x, BufferVectorType &kernel_x_0_u16,
32 BufferVectorType &kernel_x_1_u16, BufferVectorType &kernel_x_2_u16,
33 BufferVectorType &kernel_x_3_u16, BufferVectorType &kernel_x_4_u16,
34 SourceVectorType &kernel_y_0_u8, SourceVectorType &kernel_y_1_u8,
35 SourceVectorType &kernel_y_2_u8, SourceVectorType &kernel_y_3_u8,
36 SourceVectorType &kernel_y_4_u8)
37 195 : kernel_x_(kernel_x),
38 195 kernel_x_0_u16_(kernel_x_0_u16),
39 195 kernel_x_1_u16_(kernel_x_1_u16),
40 195 kernel_x_2_u16_(kernel_x_2_u16),
41 195 kernel_x_3_u16_(kernel_x_3_u16),
42 195 kernel_x_4_u16_(kernel_x_4_u16),
43
44 195 kernel_y_0_u8_(kernel_y_0_u8),
45 195 kernel_y_1_u8_(kernel_y_1_u8),
46 195 kernel_y_2_u8_(kernel_y_2_u8),
47 195 kernel_y_3_u8_(kernel_y_3_u8),
48 195 kernel_y_4_u8_(kernel_y_4_u8) {}
49
50 2284 void vertical_vector_path(svbool_t pg,
51 std::reference_wrapper<SourceVectorType> src[5],
52 BufferType *dst) const KLEIDICV_STREAMING {
53 // 0
54 2284 BufferVectorType acc_b = svmullb_u16(src[0], kernel_y_0_u8_);
55 2284 BufferVectorType acc_t = svmullt_u16(src[0], kernel_y_0_u8_);
56
57 // 1
58 2284 BufferVectorType vec_b = svmullb_u16(src[1], kernel_y_1_u8_);
59 2284 BufferVectorType vec_t = svmullt_u16(src[1], kernel_y_1_u8_);
60 2284 acc_b = svqadd_u16_x(pg, acc_b, vec_b);
61 2284 acc_t = svqadd_u16_x(pg, acc_t, vec_t);
62
63 // 2
64 2284 vec_b = svmullb_u16(src[2], kernel_y_2_u8_);
65 2284 vec_t = svmullt_u16(src[2], kernel_y_2_u8_);
66 2284 acc_b = svqadd_u16_x(pg, acc_b, vec_b);
67 2284 acc_t = svqadd_u16_x(pg, acc_t, vec_t);
68
69 // 3
70 2284 vec_b = svmullb_u16(src[3], kernel_y_3_u8_);
71 2284 vec_t = svmullt_u16(src[3], kernel_y_3_u8_);
72 2284 acc_b = svqadd_u16_x(pg, acc_b, vec_b);
73 2284 acc_t = svqadd_u16_x(pg, acc_t, vec_t);
74
75 // 4
76 2284 vec_b = svmullb_u16(src[4], kernel_y_4_u8_);
77 2284 vec_t = svmullt_u16(src[4], kernel_y_4_u8_);
78 2284 acc_b = svqadd_u16_x(pg, acc_b, vec_b);
79 2284 acc_t = svqadd_u16_x(pg, acc_t, vec_t);
80
81 2284 BufferDoubleVectorType interleaved = svcreate2_u16(acc_b, acc_t);
82 2284 svst2(pg, &dst[0], interleaved);
83 2284 }
84
85 1950 void horizontal_vector_path(svbool_t pg,
86 std::reference_wrapper<BufferVectorType> src[5],
87 DestinationType *dst) const KLEIDICV_STREAMING {
88 // 0
89 1950 svuint32_t acc_b = svmullb_u32(src[0], kernel_x_0_u16_);
90 1950 svuint32_t acc_t = svmullt_u32(src[0], kernel_x_0_u16_);
91
92 // 1
93 1950 acc_b = svmlalb_u32(acc_b, src[1], kernel_x_1_u16_);
94 1950 acc_t = svmlalt_u32(acc_t, src[1], kernel_x_1_u16_);
95
96 // 2
97 1950 acc_b = svmlalb_u32(acc_b, src[2], kernel_x_2_u16_);
98 1950 acc_t = svmlalt_u32(acc_t, src[2], kernel_x_2_u16_);
99
100 // 3
101 1950 acc_b = svmlalb_u32(acc_b, src[3], kernel_x_3_u16_);
102 1950 acc_t = svmlalt_u32(acc_t, src[3], kernel_x_3_u16_);
103
104 // 4
105 1950 acc_b = svmlalb_u32(acc_b, src[4], kernel_x_4_u16_);
106 1950 acc_t = svmlalt_u32(acc_t, src[4], kernel_x_4_u16_);
107
108 1950 svuint16_t acc_u16_b = svqxtnb_u32(acc_b);
109 1950 svuint16_t acc_u16 = svqxtnt_u32(acc_u16_b, acc_t);
110
111 3900 svbool_t greater =
112 1950 svcmpgt_n_u16(pg, acc_u16, std::numeric_limits<SourceType>::max());
113 1950 acc_u16 =
114 1950 svdup_n_u16_m(acc_u16, greater, std::numeric_limits<SourceType>::max());
115
116 1950 svst1b_u16(pg, &dst[0], acc_u16);
117 1950 }
118
119 8592 void horizontal_scalar_path(const BufferType src[5],
120 DestinationType *dst) const KLEIDICV_STREAMING {
121 8592 SourceType acc; // NOLINT
122
2/2
✓ Branch 0 taken 6324 times.
✓ Branch 1 taken 2268 times.
8592 if (__builtin_mul_overflow(src[0], kernel_x_[0], &acc)) {
123 6324 dst[0] = std::numeric_limits<SourceType>::max();
124 6324 return;
125 }
126
127
4/4
✓ Branch 0 taken 8627 times.
✓ Branch 1 taken 1469 times.
✓ Branch 2 taken 799 times.
✓ Branch 3 taken 1469 times.
10895 for (size_t i = 1; i < 5; i++) {
128 8627 SourceType temp; // NOLINT
129
2/2
✓ Branch 0 taken 48 times.
✓ Branch 1 taken 8579 times.
8627 if (__builtin_mul_overflow(src[i], kernel_x_[i], &temp)) {
130 48 dst[0] = std::numeric_limits<SourceType>::max();
131 48 return;
132 }
133
2/2
✓ Branch 0 taken 751 times.
✓ Branch 1 taken 7828 times.
8579 if (__builtin_add_overflow(acc, temp, &acc)) {
134 751 dst[0] = std::numeric_limits<SourceType>::max();
135 751 return;
136 }
137 8627 }
138
139 1469 dst[0] = acc;
140 8592 }
141
142 private:
143 const SourceType *kernel_x_;
144
145 BufferVectorType &kernel_x_0_u16_;
146 BufferVectorType &kernel_x_1_u16_;
147 BufferVectorType &kernel_x_2_u16_;
148 BufferVectorType &kernel_x_3_u16_;
149 BufferVectorType &kernel_x_4_u16_;
150
151 SourceVectorType &kernel_y_0_u8_;
152 SourceVectorType &kernel_y_1_u8_;
153 SourceVectorType &kernel_y_2_u8_;
154 SourceVectorType &kernel_y_3_u8_;
155 SourceVectorType &kernel_y_4_u8_;
156 }; // end of class SeparableFilter2D<uint8_t, 5>
157
158 template <>
159 class SeparableFilter2D<uint16_t, 5> {
160 public:
161 using SourceType = uint16_t;
162 using SourceVectorType = typename VecTraits<SourceType>::VectorType;
163 using BufferType = uint32_t;
164 using BufferVectorType = typename VecTraits<BufferType>::VectorType;
165 using BufferDoubleVectorType = typename VecTraits<BufferType>::Vector2Type;
166 using DestinationType = uint16_t;
167
168 195 SeparableFilter2D(
169 const SourceType *kernel_x, BufferVectorType &kernel_x_0_u32,
170 BufferVectorType &kernel_x_1_u32, BufferVectorType &kernel_x_2_u32,
171 BufferVectorType &kernel_x_3_u32, BufferVectorType &kernel_x_4_u32,
172 SourceVectorType &kernel_y_0_u16, SourceVectorType &kernel_y_1_u16,
173 SourceVectorType &kernel_y_2_u16, SourceVectorType &kernel_y_3_u16,
174 SourceVectorType &kernel_y_4_u16)
175 195 : kernel_x_(kernel_x),
176 195 kernel_x_0_u32_(kernel_x_0_u32),
177 195 kernel_x_1_u32_(kernel_x_1_u32),
178 195 kernel_x_2_u32_(kernel_x_2_u32),
179 195 kernel_x_3_u32_(kernel_x_3_u32),
180 195 kernel_x_4_u32_(kernel_x_4_u32),
181
182 195 kernel_y_0_u16_(kernel_y_0_u16),
183 195 kernel_y_1_u16_(kernel_y_1_u16),
184 195 kernel_y_2_u16_(kernel_y_2_u16),
185 195 kernel_y_3_u16_(kernel_y_3_u16),
186 195 kernel_y_4_u16_(kernel_y_4_u16) {}
187
188 2909 void vertical_vector_path(svbool_t pg,
189 std::reference_wrapper<SourceVectorType> src[5],
190 BufferType *dst) const KLEIDICV_STREAMING {
191 // 0
192 2909 BufferVectorType acc_b = svmullb_u32(src[0], kernel_y_0_u16_);
193 2909 BufferVectorType acc_t = svmullt_u32(src[0], kernel_y_0_u16_);
194
195 // 1
196 2909 BufferVectorType vec_b = svmullb_u32(src[1], kernel_y_1_u16_);
197 2909 BufferVectorType vec_t = svmullt_u32(src[1], kernel_y_1_u16_);
198 2909 acc_b = svqadd_u32_x(pg, acc_b, vec_b);
199 2909 acc_t = svqadd_u32_x(pg, acc_t, vec_t);
200
201 // 2
202 2909 vec_b = svmullb_u32(src[2], kernel_y_2_u16_);
203 2909 vec_t = svmullt_u32(src[2], kernel_y_2_u16_);
204 2909 acc_b = svqadd_u32_x(pg, acc_b, vec_b);
205 2909 acc_t = svqadd_u32_x(pg, acc_t, vec_t);
206
207 // 3
208 2909 vec_b = svmullb_u32(src[3], kernel_y_3_u16_);
209 2909 vec_t = svmullt_u32(src[3], kernel_y_3_u16_);
210 2909 acc_b = svqadd_u32_x(pg, acc_b, vec_b);
211 2909 acc_t = svqadd_u32_x(pg, acc_t, vec_t);
212
213 // 4
214 2909 vec_b = svmullb_u32(src[4], kernel_y_4_u16_);
215 2909 vec_t = svmullt_u32(src[4], kernel_y_4_u16_);
216 2909 acc_b = svqadd_u32_x(pg, acc_b, vec_b);
217 2909 acc_t = svqadd_u32_x(pg, acc_t, vec_t);
218
219 2909 BufferDoubleVectorType interleaved = svcreate2_u32(acc_b, acc_t);
220 2909 svst2(pg, &dst[0], interleaved);
221 2909 }
222
223 2834 void horizontal_vector_path(svbool_t pg,
224 std::reference_wrapper<BufferVectorType> src[5],
225 DestinationType *dst) const KLEIDICV_STREAMING {
226 // 0
227 2834 svuint64_t acc_b = svmullb_u64(src[0], kernel_x_0_u32_);
228 2834 svuint64_t acc_t = svmullt_u64(src[0], kernel_x_0_u32_);
229
230 // 1
231 2834 acc_b = svmlalb_u64(acc_b, src[1], kernel_x_1_u32_);
232 2834 acc_t = svmlalt_u64(acc_t, src[1], kernel_x_1_u32_);
233
234 // 2
235 2834 acc_b = svmlalb_u64(acc_b, src[2], kernel_x_2_u32_);
236 2834 acc_t = svmlalt_u64(acc_t, src[2], kernel_x_2_u32_);
237
238 // 3
239 2834 acc_b = svmlalb_u64(acc_b, src[3], kernel_x_3_u32_);
240 2834 acc_t = svmlalt_u64(acc_t, src[3], kernel_x_3_u32_);
241
242 // 4
243 2834 acc_b = svmlalb_u64(acc_b, src[4], kernel_x_4_u32_);
244 2834 acc_t = svmlalt_u64(acc_t, src[4], kernel_x_4_u32_);
245
246 2834 svuint32_t acc_u32_b = svqxtnb_u64(acc_b);
247 2834 svuint32_t acc_u32 = svqxtnt_u64(acc_u32_b, acc_t);
248
249 5668 svbool_t greater =
250 2834 svcmpgt_n_u32(pg, acc_u32, std::numeric_limits<SourceType>::max());
251 2834 acc_u32 =
252 2834 svdup_n_u32_m(acc_u32, greater, std::numeric_limits<SourceType>::max());
253
254 2834 svst1h_u32(pg, &dst[0], acc_u32);
255 2834 }
256
257 8700 void horizontal_scalar_path(const BufferType src[5],
258 DestinationType *dst) const KLEIDICV_STREAMING {
259 8700 SourceType acc; // Avoid cppcoreguidelines-init-variables. NOLINT
260
2/2
✓ Branch 0 taken 6390 times.
✓ Branch 1 taken 2310 times.
8700 if (__builtin_mul_overflow(src[0], kernel_x_[0], &acc)) {
261 6390 dst[0] = std::numeric_limits<SourceType>::max();
262 6390 return;
263 }
264
265
4/4
✓ Branch 0 taken 9216 times.
✓ Branch 1 taken 2253 times.
✓ Branch 2 taken 57 times.
✓ Branch 3 taken 2253 times.
11526 for (size_t i = 1; i < 5; i++) {
266 9216 SourceType temp; // Avoid cppcoreguidelines-init-variables. NOLINT
267
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 9210 times.
9216 if (__builtin_mul_overflow(src[i], kernel_x_[i], &temp)) {
268 6 dst[0] = std::numeric_limits<SourceType>::max();
269 6 return;
270 }
271
2/2
✓ Branch 0 taken 51 times.
✓ Branch 1 taken 9159 times.
9210 if (__builtin_add_overflow(acc, temp, &acc)) {
272 51 dst[0] = std::numeric_limits<SourceType>::max();
273 51 return;
274 }
275 9216 }
276
277 2253 dst[0] = acc;
278 8700 }
279
280 private:
281 const SourceType *kernel_x_;
282
283 BufferVectorType &kernel_x_0_u32_;
284 BufferVectorType &kernel_x_1_u32_;
285 BufferVectorType &kernel_x_2_u32_;
286 BufferVectorType &kernel_x_3_u32_;
287 BufferVectorType &kernel_x_4_u32_;
288
289 SourceVectorType &kernel_y_0_u16_;
290 SourceVectorType &kernel_y_1_u16_;
291 SourceVectorType &kernel_y_2_u16_;
292 SourceVectorType &kernel_y_3_u16_;
293 SourceVectorType &kernel_y_4_u16_;
294 }; // end of class SeparableFilter2D<uint16_t, 5>
295
296 template <typename T>
297 438 kleidicv_error_t separable_filter_2d_stripe_sc(
298 const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width,
299 size_t height, size_t y_begin, size_t y_end, size_t channels,
300 const T *kernel_x, size_t /*kernel_width*/, const T *kernel_y,
301 size_t /*kernel_height*/,
302 FixedBorderType fixed_border_type) KLEIDICV_STREAMING {
303
8/8
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 216 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 216 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 216 times.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 216 times.
438 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
304
8/8
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 213 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 213 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 213 times.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 213 times.
432 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
305
12/12
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 210 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 207 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 207 times.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 210 times.
✓ Branch 8 taken 3 times.
✓ Branch 9 taken 207 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 207 times.
426 CHECK_IMAGE_SIZE(width, height);
306
4/4
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 201 times.
✓ Branch 2 taken 6 times.
✓ Branch 3 taken 201 times.
414 CHECK_POINTERS(kernel_x, kernel_y);
307
308
4/4
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 198 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 198 times.
402 if (channels > KLEIDICV_MAXIMUM_CHANNEL_COUNT) {
309 6 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
310 }
311
312 396 Rectangle rect{width, height};
313
314 using SeparableFilterClass = SeparableFilter2D<T, 5>;
315 396 constexpr size_t intermediate_size{
316 sizeof(typename SeparableFilterClass::BufferType)};
317
318 396 auto workspace_variant =
319 396 SeparableFilterWorkspace::create(rect, channels, intermediate_size);
320
8/8
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 195 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 195 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 195 times.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 195 times.
402 if (auto *err = std::get_if<kleidicv_error_t>(&workspace_variant)) {
321 6 return *err;
322 }
323 390 auto &workspace = *std::get_if<SeparableFilterWorkspace>(&workspace_variant);
324
325 using WiderT = typename double_element_width<T>::type;
326 using KernelXVectorTraits = VecTraits<WiderT>;
327 using KernelXVectorT = typename KernelXVectorTraits::VectorType;
328 using KernelYVectorTraits = VecTraits<T>;
329 using KernelYVectorT = typename KernelYVectorTraits::VectorType;
330
331 390 KernelXVectorT kernel_x_0 = KernelXVectorTraits::svdup(kernel_x[0]);
332 390 KernelXVectorT kernel_x_1 = KernelXVectorTraits::svdup(kernel_x[1]);
333 390 KernelXVectorT kernel_x_2 = KernelXVectorTraits::svdup(kernel_x[2]);
334 390 KernelXVectorT kernel_x_3 = KernelXVectorTraits::svdup(kernel_x[3]);
335 390 KernelXVectorT kernel_x_4 = KernelXVectorTraits::svdup(kernel_x[4]);
336
337 390 KernelYVectorT kernel_y_0 = KernelYVectorTraits::svdup(kernel_y[0]);
338 390 KernelYVectorT kernel_y_1 = KernelYVectorTraits::svdup(kernel_y[1]);
339 390 KernelYVectorT kernel_y_2 = KernelYVectorTraits::svdup(kernel_y[2]);
340 390 KernelYVectorT kernel_y_3 = KernelYVectorTraits::svdup(kernel_y[3]);
341 390 KernelYVectorT kernel_y_4 = KernelYVectorTraits::svdup(kernel_y[4]);
342
343 780 SeparableFilterClass filterClass{
344 390 kernel_x, kernel_x_0, kernel_x_1, kernel_x_2, kernel_x_3, kernel_x_4,
345 kernel_y_0, kernel_y_1, kernel_y_2, kernel_y_3, kernel_y_4};
346 390 SeparableFilter<SeparableFilterClass, 5> filter{filterClass};
347
348 390 Rows<const T> src_rows{src, src_stride, channels};
349 390 Rows<T> dst_rows{dst, dst_stride, channels};
350 780 workspace.process(y_begin, y_end, src_rows, dst_rows, fixed_border_type,
351 390 filter);
352
353 390 return KLEIDICV_OK;
354 438 }
355
356 } // namespace KLEIDICV_TARGET_NAMESPACE
357
358 #endif // KLEIDICV_SEPARABLE_FILTER_2D_SC_H
359