KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/separable_filter_2d_neon.cpp
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 206 206 100.0%
Functions: 21 21 100.0%
Branches: 140 140 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <limits>
6
7 #include "kleidicv/ctypes.h"
8 #include "kleidicv/filters/separable_filter_2d.h"
9 #include "kleidicv/filters/separable_filter_5x5_neon.h"
10 #include "kleidicv/kleidicv.h"
11 #include "kleidicv/neon.h"
12 #include "kleidicv/workspace/separable.h"
13
14 namespace kleidicv::neon {
15
// Primary template of the separable 2D filter operation. It is declared but
// never defined; only explicit specializations for a concrete element type
// and kernel size can be instantiated.
template <typename ScalarType, size_t KernelSize>
class SeparableFilter2D;
18
19 template <>
20 class SeparableFilter2D<uint8_t, 5> {
21 public:
22 using SourceType = uint8_t;
23 using SourceVectorType = typename VecTraits<SourceType>::VectorType;
24 using BufferType = uint16_t;
25 using BufferVectorType = typename VecTraits<BufferType>::VectorType;
26 using DestinationType = uint8_t;
27
28 // Ignored because vectors are initialized in the constructor body.
29 // NOLINTNEXTLINE - hicpp-member-init
30 55 SeparableFilter2D(const SourceType *kernel_x, const SourceType *kernel_y)
31 55 : kernel_x_(kernel_x), kernel_y_(kernel_y) {
32
2/2
✓ Branch 0 taken 55 times.
✓ Branch 1 taken 275 times.
330 for (size_t i = 0; i < 5; i++) {
33 275 kernel_x_u16_[i] = vdupq_n_u16(kernel_x[i]);
34 275 kernel_y_u8_[i] = vdupq_n_u8(kernel_y[i]);
35 275 }
36 55 }
37
38 300 void vertical_vector_path(SourceVectorType src[5], BufferType *dst) const {
39 600 BufferVectorType acc_l =
40 300 vmull_u8(vget_low_u8(src[0]), vget_low_u8(kernel_y_u8_[0]));
41 300 BufferVectorType acc_h = vmull_high_u8(src[0], kernel_y_u8_[0]);
42
43 // Optimization to avoid unnecessary branching in vector code.
44 KLEIDICV_FORCE_LOOP_UNROLL
45
2/2
✓ Branch 0 taken 300 times.
✓ Branch 1 taken 1200 times.
1500 for (size_t i = 1; i < 5; i++) {
46 2400 BufferVectorType vec_l =
47 1200 vmull_u8(vget_low_u8(src[i]), vget_low_u8(kernel_y_u8_[i]));
48 1200 BufferVectorType vec_h = vmull_high_u8(src[i], kernel_y_u8_[i]);
49
50 1200 acc_l = vqaddq_u16(acc_l, vec_l);
51 1200 acc_h = vqaddq_u16(acc_h, vec_h);
52 1200 }
53
54 300 vst1q_u16(&dst[0], acc_l);
55 300 vst1q_u16(&dst[8], acc_h);
56 300 }
57
58 4026 void vertical_scalar_path(const SourceType src[5], BufferType *dst) const {
59 4026 BufferType acc = static_cast<BufferType>(src[0]) * kernel_y_[0];
60
4/4
✓ Branch 0 taken 15875 times.
✓ Branch 1 taken 3914 times.
✓ Branch 2 taken 112 times.
✓ Branch 3 taken 3914 times.
19901 for (size_t i = 1; i < 5; i++) {
61 15875 BufferType temp = static_cast<BufferType>(src[i]) * kernel_y_[i];
62
2/2
✓ Branch 0 taken 112 times.
✓ Branch 1 taken 15763 times.
15875 if (__builtin_add_overflow(acc, temp, &acc)) {
63 112 dst[0] = std::numeric_limits<SourceType>::max();
64 112 return;
65 }
66 15875 }
67
68 3914 dst[0] = acc;
69 4026 }
70
71 628 void horizontal_vector_path(BufferVectorType src[5],
72 DestinationType *dst) const {
73 1256 uint32x4_t acc_l =
74 628 vmull_u16(vget_low_u16(src[0]), vget_low_u16(kernel_x_u16_[0]));
75 628 uint32x4_t acc_h = vmull_high_u16(src[0], kernel_x_u16_[0]);
76
77 // Optimization to avoid unnecessary branching in vector code.
78 KLEIDICV_FORCE_LOOP_UNROLL
79
2/2
✓ Branch 0 taken 628 times.
✓ Branch 1 taken 2512 times.
3140 for (size_t i = 1; i < 5; i++) {
80 5024 acc_l = vmlal_u16(acc_l, vget_low_u16(src[i]),
81 2512 vget_low_u16(kernel_x_u16_[i]));
82 2512 acc_h = vmlal_high_u16(acc_h, src[i], kernel_x_u16_[i]);
83 2512 }
84
85 628 uint16x8_t acc_u16 = vcombine_u16(vqmovn_u32(acc_l), vqmovn_u32(acc_h));
86 628 uint8x8_t result = vqmovn_u16(acc_u16);
87 628 vst1_u8(&dst[0], result);
88 628 }
89
90 3166 void horizontal_scalar_path(const BufferType src[5],
91 DestinationType *dst) const {
92 3166 SourceType acc; // NOLINT
93
2/2
✓ Branch 0 taken 2385 times.
✓ Branch 1 taken 781 times.
3166 if (__builtin_mul_overflow(src[0], kernel_x_[0], &acc)) {
94 2385 dst[0] = std::numeric_limits<SourceType>::max();
95 2385 return;
96 }
97
98
4/4
✓ Branch 0 taken 2996 times.
✓ Branch 1 taken 503 times.
✓ Branch 2 taken 278 times.
✓ Branch 3 taken 503 times.
3777 for (size_t i = 1; i < 5; i++) {
99 2996 SourceType temp; // NOLINT
100
2/2
✓ Branch 0 taken 20 times.
✓ Branch 1 taken 2976 times.
2996 if (__builtin_mul_overflow(src[i], kernel_x_[i], &temp)) {
101 20 dst[0] = std::numeric_limits<SourceType>::max();
102 20 return;
103 }
104
2/2
✓ Branch 0 taken 258 times.
✓ Branch 1 taken 2718 times.
2976 if (__builtin_add_overflow(acc, temp, &acc)) {
105 258 dst[0] = std::numeric_limits<SourceType>::max();
106 258 return;
107 }
108 2996 }
109
110 503 dst[0] = acc;
111 3166 }
112
113 private:
114 const SourceType *kernel_x_;
115 const SourceType *kernel_y_;
116
117 BufferVectorType kernel_x_u16_[5];
118 SourceVectorType kernel_y_u8_[5];
119 }; // end of class SeparableFilter2D<uint8_t, 5>
120
121 template <>
122 class SeparableFilter2D<uint16_t, 5> {
123 public:
124 using SourceType = uint16_t;
125 using SourceVectorType = typename VecTraits<SourceType>::VectorType;
126 using BufferType = uint32_t;
127 using BufferVectorType = typename VecTraits<BufferType>::VectorType;
128 using DestinationType = uint16_t;
129
130 // Ignored because vectors are initialized in the constructor body.
131 // NOLINTNEXTLINE - hicpp-member-init
132 55 SeparableFilter2D(const SourceType *kernel_x, const SourceType *kernel_y)
133 55 : kernel_x_(kernel_x), kernel_y_(kernel_y) {
134
2/2
✓ Branch 0 taken 55 times.
✓ Branch 1 taken 275 times.
330 for (size_t i = 0; i < 5; i++) {
135 275 kernel_x_u32_[i] = vdupq_n_u32(kernel_x[i]);
136 275 kernel_y_u16_[i] = vdupq_n_u16(kernel_y[i]);
137 275 }
138 55 }
139
140 857 void vertical_vector_path(SourceVectorType src[5], BufferType *dst) const {
141 1714 BufferVectorType acc_l =
142 857 vmull_u16(vget_low_u16(src[0]), vget_low_u16(kernel_y_u16_[0]));
143 857 BufferVectorType acc_h = vmull_high_u16(src[0], kernel_y_u16_[0]);
144
145 // Optimization to avoid unnecessary branching in vector code.
146 KLEIDICV_FORCE_LOOP_UNROLL
147
2/2
✓ Branch 0 taken 857 times.
✓ Branch 1 taken 3428 times.
4285 for (size_t i = 1; i < 5; i++) {
148 6856 BufferVectorType vec_l =
149 3428 vmull_u16(vget_low_u16(src[i]), vget_low_u16(kernel_y_u16_[i]));
150 3428 BufferVectorType vec_h = vmull_high_u16(src[i], kernel_y_u16_[i]);
151
152 3428 acc_l = vqaddq_u32(acc_l, vec_l);
153 3428 acc_h = vqaddq_u32(acc_h, vec_h);
154 3428 }
155
156 857 vst1q_u32(&dst[0], acc_l);
157 857 vst1q_u32(&dst[4], acc_h);
158 857 }
159
160 1844 void vertical_scalar_path(const SourceType src[5], BufferType *dst) const {
161 1844 BufferType acc = static_cast<BufferType>(src[0]) * kernel_y_[0];
162
4/4
✓ Branch 0 taken 7208 times.
✓ Branch 1 taken 1788 times.
✓ Branch 2 taken 56 times.
✓ Branch 3 taken 1788 times.
9052 for (size_t i = 1; i < 5; i++) {
163 7208 BufferType temp = static_cast<BufferType>(src[i]) * kernel_y_[i];
164
2/2
✓ Branch 0 taken 56 times.
✓ Branch 1 taken 7152 times.
7208 if (__builtin_add_overflow(acc, temp, &acc)) {
165 56 dst[0] = std::numeric_limits<SourceType>::max();
166 56 return;
167 }
168 7208 }
169
170 1788 dst[0] = acc;
171 1844 }
172
173 1152 void horizontal_vector_path(BufferVectorType src[5],
174 DestinationType *dst) const {
175 2304 uint64x2_t acc_l =
176 1152 vmull_u32(vget_low_u32(src[0]), vget_low_u32(kernel_x_u32_[0]));
177 1152 uint64x2_t acc_h = vmull_high_u32(src[0], kernel_x_u32_[0]);
178
179 // Optimization to avoid unnecessary branching in vector code.
180 KLEIDICV_FORCE_LOOP_UNROLL
181
2/2
✓ Branch 0 taken 1152 times.
✓ Branch 1 taken 4608 times.
5760 for (size_t i = 1; i < 5; i++) {
182 9216 acc_l = vmlal_u32(acc_l, vget_low_u32(src[i]),
183 4608 vget_low_u32(kernel_x_u32_[i]));
184 4608 acc_h = vmlal_high_u32(acc_h, src[i], kernel_x_u32_[i]);
185 4608 }
186
187 1152 uint32x4_t acc_u32 = vcombine_u32(vqmovn_u64(acc_l), vqmovn_u64(acc_h));
188 1152 uint16x4_t result = vqmovn_u32(acc_u32);
189 1152 vst1_u16(&dst[0], result);
190 1152 }
191
192 3264 void horizontal_scalar_path(const BufferType src[5],
193 DestinationType *dst) const {
194 3264 SourceType acc; // Avoid cppcoreguidelines-init-variables. NOLINT
195
2/2
✓ Branch 0 taken 2449 times.
✓ Branch 1 taken 815 times.
3264 if (__builtin_mul_overflow(src[0], kernel_x_[0], &acc)) {
196 2449 dst[0] = std::numeric_limits<SourceType>::max();
197 2449 return;
198 }
199
200
4/4
✓ Branch 0 taken 3249 times.
✓ Branch 1 taken 785 times.
✓ Branch 2 taken 30 times.
✓ Branch 3 taken 785 times.
4064 for (size_t i = 1; i < 5; i++) {
201 3249 SourceType temp; // Avoid cppcoreguidelines-init-variables. NOLINT
202
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 3246 times.
3249 if (__builtin_mul_overflow(src[i], kernel_x_[i], &temp)) {
203 3 dst[0] = std::numeric_limits<SourceType>::max();
204 3 return;
205 }
206
2/2
✓ Branch 0 taken 27 times.
✓ Branch 1 taken 3219 times.
3246 if (__builtin_add_overflow(acc, temp, &acc)) {
207 27 dst[0] = std::numeric_limits<SourceType>::max();
208 27 return;
209 }
210 3249 }
211
212 785 dst[0] = acc;
213 3264 }
214
215 private:
216 const SourceType *kernel_x_;
217 const SourceType *kernel_y_;
218
219 BufferVectorType kernel_x_u32_[5];
220 SourceVectorType kernel_y_u16_[5];
221 }; // end of class SeparableFilter2D<uint16_t, 5>
222
223 template <>
224 class SeparableFilter2D<int16_t, 5> {
225 public:
226 using SourceType = int16_t;
227 using SourceVectorType = typename VecTraits<SourceType>::VectorType;
228 using BufferType = int32_t;
229 using BufferVectorType = typename VecTraits<BufferType>::VectorType;
230 using DestinationType = int16_t;
231
232 // Ignored because vectors are initialized in the constructor body.
233 // NOLINTNEXTLINE - hicpp-member-init
234 54 SeparableFilter2D(const SourceType *kernel_x, const SourceType *kernel_y)
235 54 : kernel_x_(kernel_x), kernel_y_(kernel_y) {
236
2/2
✓ Branch 0 taken 54 times.
✓ Branch 1 taken 270 times.
324 for (size_t i = 0; i < 5; i++) {
237 270 kernel_x_s32_[i] = vdupq_n_s32(kernel_x[i]);
238 270 kernel_y_s16_[i] = vdupq_n_s16(kernel_y[i]);
239 270 }
240 54 }
241
242 873 void vertical_vector_path(SourceVectorType src[5], BufferType *dst) const {
243 1746 BufferVectorType acc_l =
244 873 vmull_s16(vget_low_s16(src[0]), vget_low_s16(kernel_y_s16_[0]));
245 873 BufferVectorType acc_h = vmull_high_s16(src[0], kernel_y_s16_[0]);
246
247 // Optimization to avoid unnecessary branching in vector code.
248 KLEIDICV_FORCE_LOOP_UNROLL
249
2/2
✓ Branch 0 taken 873 times.
✓ Branch 1 taken 3492 times.
4365 for (size_t i = 1; i < 5; i++) {
250 6984 BufferVectorType vec_l =
251 3492 vmull_s16(vget_low_s16(src[i]), vget_low_s16(kernel_y_s16_[i]));
252 3492 BufferVectorType vec_h = vmull_high_s16(src[i], kernel_y_s16_[i]);
253
254 3492 acc_l = vqaddq_s32(acc_l, vec_l);
255 3492 acc_h = vqaddq_s32(acc_h, vec_h);
256 3492 }
257
258 873 vst1q_s32(&dst[0], acc_l);
259 873 vst1q_s32(&dst[4], acc_h);
260 873 }
261
262 1701 void vertical_scalar_path(const SourceType src[5], BufferType *dst) const {
263 1701 BufferType acc = static_cast<BufferType>(src[0]) * kernel_y_[0];
264
4/4
✓ Branch 0 taken 6754 times.
✓ Branch 1 taken 1676 times.
✓ Branch 2 taken 25 times.
✓ Branch 3 taken 1676 times.
8455 for (size_t i = 1; i < 5; i++) {
265 6754 BufferType temp = static_cast<BufferType>(src[i]) * kernel_y_[i];
266
2/2
✓ Branch 0 taken 25 times.
✓ Branch 1 taken 6729 times.
6754 if (__builtin_add_overflow(acc, temp, &acc)) {
267 25 dst[0] = std::numeric_limits<SourceType>::max();
268 25 return;
269 }
270 6754 }
271
272 1676 dst[0] = acc;
273 1701 }
274
275 1177 void horizontal_vector_path(BufferVectorType src[5],
276 DestinationType *dst) const {
277 2354 int64x2_t acc_l =
278 1177 vmull_s32(vget_low_s32(src[0]), vget_low_s32(kernel_x_s32_[0]));
279 1177 int64x2_t acc_h = vmull_high_s32(src[0], kernel_x_s32_[0]);
280
281 // Optimization to avoid unnecessary branching in vector code.
282 KLEIDICV_FORCE_LOOP_UNROLL
283
2/2
✓ Branch 0 taken 1177 times.
✓ Branch 1 taken 4708 times.
5885 for (size_t i = 1; i < 5; i++) {
284 9416 acc_l = vmlal_s32(acc_l, vget_low_s32(src[i]),
285 4708 vget_low_s32(kernel_x_s32_[i]));
286 4708 acc_h = vmlal_high_s32(acc_h, src[i], kernel_x_s32_[i]);
287 4708 }
288
289 1177 int32x4_t acc_s32 = vcombine_s32(vqmovn_s64(acc_l), vqmovn_s64(acc_h));
290 1177 int16x4_t result = vqmovn_s32(acc_s32);
291 1177 vst1_s16(&dst[0], result);
292 1177 }
293
294 3177 void horizontal_scalar_path(const BufferType src[5],
295 DestinationType *dst) const {
296 3177 int64_t acc = static_cast<int64_t>(src[0]) * kernel_x_[0];
297
2/2
✓ Branch 0 taken 12708 times.
✓ Branch 1 taken 3177 times.
15885 for (size_t i = 1; i < 5; i++) {
298 12708 acc += static_cast<int64_t>(src[i]) * kernel_x_[i];
299 12708 }
300
301
2/2
✓ Branch 0 taken 1016 times.
✓ Branch 1 taken 2161 times.
3177 if (acc < std::numeric_limits<DestinationType>::min()) {
302 1016 acc = std::numeric_limits<DestinationType>::min();
303
2/2
✓ Branch 0 taken 823 times.
✓ Branch 1 taken 1338 times.
3177 } else if (acc > std::numeric_limits<DestinationType>::max()) {
304 1338 acc = std::numeric_limits<DestinationType>::max();
305 1338 }
306
307 3177 dst[0] = static_cast<DestinationType>(acc);
308 3177 }
309
310 private:
311 const SourceType *kernel_x_;
312 const SourceType *kernel_y_;
313
314 BufferVectorType kernel_x_s32_[5];
315 SourceVectorType kernel_y_s16_[5];
316 }; // end of class SeparableFilter2D<int16_t, 5>
317
318 template <typename T>
319 200 static kleidicv_error_t separable_filter_2d_checks(
320 const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width,
321 size_t height, size_t channels, const T *kernel_x, const T *kernel_y,
322 SeparableFilterWorkspace *workspace) {
323
6/6
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 64 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 64 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 63 times.
200 CHECK_POINTERS(workspace, kernel_x, kernel_y);
324
325
12/12
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 63 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 63 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 63 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 63 times.
✓ Branch 8 taken 1 times.
✓ Branch 9 taken 62 times.
✓ Branch 10 taken 1 times.
✓ Branch 11 taken 62 times.
191 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
326
12/12
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 62 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 62 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 62 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 62 times.
✓ Branch 8 taken 1 times.
✓ Branch 9 taken 61 times.
✓ Branch 10 taken 1 times.
✓ Branch 11 taken 61 times.
188 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
327
18/18
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 61 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 60 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 60 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 61 times.
✓ Branch 8 taken 1 times.
✓ Branch 9 taken 60 times.
✓ Branch 10 taken 2 times.
✓ Branch 11 taken 60 times.
✓ Branch 12 taken 1 times.
✓ Branch 13 taken 60 times.
✓ Branch 14 taken 1 times.
✓ Branch 15 taken 59 times.
✓ Branch 16 taken 2 times.
✓ Branch 17 taken 59 times.
185 CHECK_IMAGE_SIZE(width, height);
328
329
6/6
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 59 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 59 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 58 times.
179 if (channels > KLEIDICV_MAXIMUM_CHANNEL_COUNT) {
330 3 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
331 }
332
333
6/6
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 58 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 58 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 57 times.
176 if (workspace->channels() < channels) {
334 3 return KLEIDICV_ERROR_CONTEXT_MISMATCH;
335 }
336
337 173 const Rectangle &context_rect = workspace->image_size();
338
12/12
✓ Branch 0 taken 56 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 55 times.
✓ Branch 4 taken 56 times.
✓ Branch 5 taken 2 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 55 times.
✓ Branch 8 taken 55 times.
✓ Branch 9 taken 2 times.
✓ Branch 10 taken 1 times.
✓ Branch 11 taken 54 times.
173 if (context_rect.width() < width || context_rect.height() < height) {
339 9 return KLEIDICV_ERROR_CONTEXT_MISMATCH;
340 }
341
342 164 return KLEIDICV_OK;
343 200 }
344
345 template <typename T>
346 200 kleidicv_error_t separable_filter_2d_stripe(
347 const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width,
348 size_t height, size_t y_begin, size_t y_end, size_t channels,
349 const T *kernel_x, size_t /*kernel_width*/, const T *kernel_y,
350 size_t /*kernel_height*/, FixedBorderType fixed_border_type,
351 kleidicv_filter_context_t *context) {
352 200 auto *workspace = reinterpret_cast<SeparableFilterWorkspace *>(context);
353 400 kleidicv_error_t checks_result = separable_filter_2d_checks(
354 200 src, src_stride, dst, dst_stride, width, height, channels, kernel_x,
355 200 kernel_y, workspace);
356
357
6/6
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 55 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 55 times.
✓ Branch 4 taken 12 times.
✓ Branch 5 taken 54 times.
200 if (checks_result != KLEIDICV_OK) {
358 36 return checks_result;
359 }
360
361 164 Rectangle rect{width, height};
362
363 using SeparableFilterClass = SeparableFilter2D<T, 5>;
364
365 164 SeparableFilterClass filterClass{kernel_x, kernel_y};
366 164 SeparableFilter<SeparableFilterClass, 5> filter{filterClass};
367
368 164 Rows<const T> src_rows{src, src_stride, channels};
369 164 Rows<T> dst_rows{dst, dst_stride, channels};
370 328 workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels,
371 164 fixed_border_type, filter);
372
373 164 return KLEIDICV_OK;
374 200 }
375
376 #define KLEIDICV_INSTANTIATE_TEMPLATE(type) \
377 template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t \
378 separable_filter_2d_stripe<type>( \
379 const type *src, size_t src_stride, type *dst, size_t dst_stride, \
380 size_t width, size_t height, size_t y_begin, size_t y_end, \
381 size_t channels, const type *kernel_x, size_t kernel_width, \
382 const type *kernel_y, size_t kernel_height, FixedBorderType border_type, \
383 kleidicv_filter_context_t *context)
384
385 KLEIDICV_INSTANTIATE_TEMPLATE(uint8_t);
386 KLEIDICV_INSTANTIATE_TEMPLATE(uint16_t);
387 KLEIDICV_INSTANTIATE_TEMPLATE(int16_t);
388
389 } // namespace kleidicv::neon
390