KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/arithmetics/scale_neon.cpp
Date: 2025-11-25 17:23:32
Exec Total Coverage
Lines: 202 202 100.0%
Functions: 27 27 100.0%
Branches: 52 52 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <climits>
6 #include <cmath>
7 #include <cstdint>
8 #include <limits>
9
10 #include "kleidicv/arithmetics/scale.h"
11 #include "kleidicv/neon.h"
12 #include "kleidicv/traits.h"
13
14 namespace kleidicv::neon {
15
16 // Scale algorithm: for each value in the source,
17 // dst[i] = src[i] * scale + shift (floating point operation)
18 //
19 // Unsigned 8-bit implementation
20 //
21 // Since converting from uint8 to float32 and back takes more steps,
22 // 'ScaleTbx' saves time by pre-calculating all 256 values and uses TBLs
23 // and TBXs to map the values directly from uint8 to uint8:
24 // i: 0 to 255: tbl[i] = i * scale + shift
25 //
26 // Since a single TBL instruction can map only 16 values, more TBX instructions
27 // are needed for the remaining 240 values. After the first TBL (that replaces
28 // 0-15 values with indexed values from the table) 16 is subtracted from all
29 // lanes in the source vector before the next TBX is done, so when indexing 0
30 // to 15, actually 16 to 31 values are replaced from the original source vector.
31 //
32 // Example:
33 // scale = 1
34 // shift = 100
35 // Initialization: (it also takes time, so for short inputs it's not used)
36 // tbl = [ 100, 101, 102, ..., 255, <100 times 255, it's saturated>]
37 // Copy table to vector registers:
38 // t0 = [ 100, ..., 115 ]
39 // t1 = [ 116, ..., 131 ]
40 // t2 = [ 132, ..., 147 ]
41 // ...
42 // t15 = [ 255, ..., 255 ]
43 //
44 // input: v = [ 21, 3, 39, 6 ]
45 // TBL(t0): d = [ 0, 103, 0, 106 ] // indices >= 16 result in 0
46 // SUB: v = [ 5, 243, 23, 246 ] // subtracted 16 --> next table
47 // TBX(t1): d = [ 121, 103, 0, 106 ] // indices >= 16 are ignored
48 // SUB: v = [ 245, 227, 7, 230 ] // subtracted 16 --> next table
49 // TBX(t2): d = [ 121, 103, 139, 106 ] // indices >= 16 are ignored
50 // ... etc.
51 //
52 // Bigger index tables (32, 48 or 64 values) can be used by TBX2 - TBX3 - TBX4.
53 // In this case, instead of 16, 2/3/4 * 16 have to be subtracted from source.
54 // The below solution (combining TBX2-TBX3) gives a good compromise between code
55 // size and speed.
56
57 template <typename ScalarType>
58 class ScaleIntBase : public UnrollTwice {
59 public:
60 340 ScaleIntBase(float scale, float shift) : scale_{scale}, shift_{shift} {}
61
62 protected:
63 static constexpr ScalarType ScalarMax =
64 std::numeric_limits<ScalarType>::max();
65
66 float scale_, shift_;
67 };
68
69 template <typename T, typename U>
70 kleidicv_error_t scale(const T *src, size_t src_stride, U *dst,
71 size_t dst_stride, size_t width, size_t height,
72 double scale, double shift);
73
74 template <typename T>
75 2570 T scale_value(T value, double scale, double shift) {
76 static constexpr T ScalarMax = std::numeric_limits<T>::max();
77 2570 int64_t v = lrintf(static_cast<float>(value) * scale + shift);
78
2/2
✓ Branch 0 taken 2166 times.
✓ Branch 1 taken 404 times.
2570 if (static_cast<uint64_t>(v) <= ScalarMax) {
79 2166 return static_cast<T>(v);
80 }
81 404 return static_cast<T>(v > 0 ? ScalarMax : 0);
82 2570 }
83
84 class ScaleUint8Tbx final : public ScaleIntBase<uint8_t> {
85 public:
86 using ScalarType = uint8_t;
87 using VecTraits = neon::VecTraits<ScalarType>;
88 using VectorType = typename VecTraits::VectorType;
89 using Vector2Type = typename VecTraits::Vector2Type;
90 using Vector3Type = typename VecTraits::Vector3Type;
91
92 168 ScaleUint8Tbx(float scale, float shift, const ScalarType *precalculated_table)
93 168 : ScaleIntBase<uint8_t>(scale, shift),
94 168 table_pointer_(precalculated_table),
95 168 v_step3_(vdupq_n_u8(3 * VecTraits::num_lanes())),
96 168 v_step2_(vdupq_n_u8(2 * VecTraits::num_lanes())) {
97 168 VecTraits::load(precalculated_table, t0_3_);
98 168 VecTraits::load(precalculated_table + 3 * VecTraits::num_lanes(), t1_3_);
99 336 VecTraits::load(precalculated_table + (3 + 3) * VecTraits::num_lanes(),
100 168 t2_2_);
101 336 VecTraits::load(precalculated_table + (3 + 3 + 2) * VecTraits::num_lanes(),
102 168 t3_3_);
103 168 VecTraits::load(
104 168 precalculated_table + (3 + 3 + 2 + 3) * VecTraits::num_lanes(), t4_2_);
105 168 VecTraits::load(
106 168 precalculated_table + (3 + 3 + 2 + 3 + 2) * VecTraits::num_lanes(),
107 168 t5_3_);
108 168 }
109 1992 VectorType vector_path(VectorType src) {
110 1992 VectorType dst = vqtbl3q_u8(t0_3_, src);
111 1992 src = vsubq_u8(src, v_step3_);
112 1992 dst = vqtbx3q_u8(dst, t1_3_, src);
113 1992 src = vsubq_u8(src, v_step3_);
114 1992 dst = vqtbx2q_u8(dst, t2_2_, src);
115 1992 src = vsubq_u8(src, v_step2_);
116 1992 dst = vqtbx3q_u8(dst, t3_3_, src);
117 1992 src = vsubq_u8(src, v_step3_);
118 1992 dst = vqtbx2q_u8(dst, t4_2_, src);
119 1992 src = vsubq_u8(src, v_step2_);
120 1992 dst = vqtbx3q_u8(dst, t5_3_, src);
121 3984 return dst;
122 1992 }
123
124 1326 ScalarType scalar_path(ScalarType src) { return table_pointer_[src]; }
125
126 private:
127 const ScalarType *table_pointer_;
128 168 Vector3Type t0_3_{}, t1_3_{}, t3_3_{}, t5_3_{};
129 168 Vector2Type t2_2_{}, t4_2_{};
130 VectorType v_step3_, v_step2_;
131 }; // end of class ScaleUint8Tbx
132
133 // In contrast to ScaleUint8Tbx, ScaleUint8Calc is the direct approach:
134 // - calculate dst[i] = src[i] * scale + shift using vector instructions
135 class ScaleUint8Calc final : public ScaleIntBase<uint8_t> {
136 public:
137 using ScalarType = uint8_t;
138 using VecTraits = neon::VecTraits<ScalarType>;
139 using VectorType = typename VecTraits::VectorType;
140
141 172 ScaleUint8Calc(float scale, float shift)
142 172 : ScaleIntBase<ScalarType>(scale, shift),
143 172 vscale_{vdupq_n_f32(scale)},
144 172 vshift_{vdupq_n_f32(shift)} {}
145
146 1320 VectorType vector_path(VectorType src) {
147 // For scaling, uint8 values have to be converted to uint32
148 // i.e. create four vectors from one
149 1320 uint32x4_t res11 = scale_shift(vqtbl1q_u8(src, w0));
150 1320 uint32x4_t res12 = scale_shift(vqtbl1q_u8(src, w1));
151 1320 uint32x4_t res21 = scale_shift(vqtbl1q_u8(src, w2));
152 1320 uint32x4_t res22 = scale_shift(vqtbl1q_u8(src, w3));
153 // Convert back from 32-bit: top two bytes are 0 for sure, unzip them
154 2640 uint16x8_t res1 =
155 1320 vuzp1q_u16(vreinterpretq_u16_u32(res11), vreinterpretq_u16_u32(res12));
156 2640 uint16x8_t res2 =
157 1320 vuzp1q_u16(vreinterpretq_u16_u32(res21), vreinterpretq_u16_u32(res22));
158
159 // Saturating narrowing from 16 to 8 bits
160 2640 return vqmovn_high_u16(vqmovn_u16(res1), res2);
161 1320 }
162
163 2570 ScalarType scalar_path(ScalarType src) {
164 2570 return scale_value(src, scale_, shift_);
165 }
166
167 private:
168 static constexpr ScalarType FF = std::numeric_limits<uint8_t>::max();
169 // clang-format off
170 static constexpr uint8x16_t w0 = { 0, FF, FF, FF, 1, FF, FF, FF, 2, FF, FF, FF, 3, FF, FF, FF};
171 static constexpr uint8x16_t w1 = { 4, FF, FF, FF, 5, FF, FF, FF, 6, FF, FF, FF, 7, FF, FF, FF};
172 static constexpr uint8x16_t w2 = { 8, FF, FF, FF, 9, FF, FF, FF, 10, FF, FF, FF, 11, FF, FF, FF};
173 static constexpr uint8x16_t w3 = {12, FF, FF, FF, 13, FF, FF, FF, 14, FF, FF, FF, 15, FF, FF, FF};
174 // clang-format on
175
176 // Convert from uint32 to float32, scale and convert back with rounding
177 5280 inline uint32x4_t scale_shift(VectorType src) {
178 5280 float32x4_t fx = vcvtq_f32_u32(vreinterpretq_u32_u8(src));
179 // scale + shift is done by MLA
180 10560 return vcvtnq_u32_f32(vmlaq_f32(vshift_, fx, vscale_));
181 5280 }
182
183 float32x4_t vscale_, vshift_;
184 }; // end of class ScaleUint8Calc
185
186 168 kleidicv_error_t scale_with_precalculated_table_u8(
187 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
188 size_t width, size_t height, double scale, double shift,
189 const std::array<uint8_t, 256> &precalculated_table) {
190 168 Rectangle rect{width, height};
191 168 Rows<const uint8_t> src_rows{src, src_stride};
192 168 Rows<uint8_t> dst_rows{dst, dst_stride};
193 336 ScaleUint8Tbx operation(static_cast<float>(scale), static_cast<float>(shift),
194 168 precalculated_table.data());
195 168 apply_operation_by_rows(operation, rect, src_rows, dst_rows);
196
197 168 return KLEIDICV_OK;
198 168 }
199
200 // Specialization for uint8_t to uint8_t
201 template <>
202 200 kleidicv_error_t scale(const uint8_t *src, size_t src_stride, uint8_t *dst,
203 size_t dst_stride, size_t width, size_t height,
204 double scale, double shift) {
205
4/4
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 196 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 196 times.
200 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
206
4/4
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 192 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 192 times.
196 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
207
6/6
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 188 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 184 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 184 times.
192 CHECK_IMAGE_SIZE(width, height);
208 // For smaller inputs, the full calculation is faster
209
2/2
✓ Branch 0 taken 172 times.
✓ Branch 1 taken 12 times.
184 if (width * height < 675) { // empirical value
210 172 Rectangle rect{width, height};
211 172 Rows<const uint8_t> src_rows{src, src_stride};
212 172 Rows<uint8_t> dst_rows{dst, dst_stride};
213 344 ScaleUint8Calc operation(static_cast<float>(scale),
214 172 static_cast<float>(shift));
215 172 apply_operation_by_rows(operation, rect, src_rows, dst_rows);
216 172 } else {
217 // For bigger inputs, it's faster to pre-calculate the table
218 // and map those values during the run
219 12 auto precalculated_table = precalculate_scale_table_u8(scale, shift);
220 24 return scale_with_precalculated_table_u8(src, src_stride, dst, dst_stride,
221 12 width, height, scale, shift,
222 precalculated_table);
223 12 }
224 172 return KLEIDICV_OK;
225 200 }
226
227 7936 static uint32x4_t scale_shift(uint32x4_t src, float scale, float shift) {
228 7936 float32x4_t fx = vcvtq_f32_u32(src);
229 7936 float32x4_t max = vdupq_n_f32(255.0F);
230 7936 float32x4_t min = vdupq_n_f32(0.0F);
231 7936 float32x4_t val = vmlaq_f32(vdupq_n_f32(shift), fx, vdupq_n_f32(scale));
232 15872 return vcvtnq_u32_f32(vmaxq_f32(min, vminq_f32(val, max)));
233 7936 }
234
235 124 std::array<uint8_t, 256> precalculate_scale_table_u8(double dscale,
236 double dshift) {
237 124 float scale = static_cast<float>(dscale);
238 124 float shift = static_cast<float>(dshift);
239 static constexpr size_t TableLength = 256;
240 124 std::array<uint8_t, TableLength> precalculated_table{};
241
242 124 uint32x4_t counter = {0, 1, 2, 3};
243 124 uint32x4_t four = vdupq_n_u32(4);
244
245
2/2
✓ Branch 0 taken 124 times.
✓ Branch 1 taken 1984 times.
2108 for (size_t i = 0; i < TableLength; i += 16) {
246 1984 uint32x4_t res11 = scale_shift(counter, scale, shift);
247 1984 counter = vaddq(counter, four);
248 1984 uint32x4_t res12 = scale_shift(counter, scale, shift);
249 1984 counter = vaddq(counter, four);
250 1984 uint32x4_t res21 = scale_shift(counter, scale, shift);
251 1984 counter = vaddq(counter, four);
252 1984 uint32x4_t res22 = scale_shift(counter, scale, shift);
253 1984 counter = vaddq(counter, four);
254
255 3968 uint16x8_t res1 =
256 1984 vuzp1q_u16(vreinterpretq_u16_u32(res11), vreinterpretq_u16_u32(res12));
257 3968 uint16x8_t res2 =
258 1984 vuzp1q_u16(vreinterpretq_u16_u32(res21), vreinterpretq_u16_u32(res22));
259 // Saturating narrowing from 16 to 8 bits
260 1984 uint8x16_t res = vqmovn_high_u16(vqmovn_u16(res1), res2);
261
262 1984 vst1q_u8(&precalculated_table[i], res);
263 1984 }
264 return precalculated_table;
265 124 }
266
267 // -----------------------------------------------------------------------
268 // Float implementation
269 // -----------------------------------------------------------------------
270
271 class AddFloat final : public UnrollTwice, public UnrollOnce {
272 public:
273 using ScalarType = float;
274 using VecTraits = neon::VecTraits<ScalarType>;
275 using VectorType = typename VecTraits::VectorType;
276
277 6 explicit AddFloat(float shift) : shift_{shift}, vshift_{vdupq_n_f32(shift)} {}
278
279 5031 VectorType vector_path(VectorType src) { return vaddq_f32(vshift_, src); }
280
281 // NOLINTBEGIN(readability-make-member-function-const)
282 13 ScalarType scalar_path(ScalarType src) { return src + shift_; }
283 // NOLINTEND(readability-make-member-function-const)
284
285 private:
286 float shift_;
287 float32x4_t vshift_;
288 }; // end of class AddFloat
289
290 class ScaleFloat final : public UnrollTwice, public UnrollOnce {
291 public:
292 using ScalarType = float;
293 using VecTraits = neon::VecTraits<ScalarType>;
294 using VectorType = typename VecTraits::VectorType;
295
296 85 ScaleFloat(float scale, float shift)
297 85 : scale_{scale},
298 85 shift_{shift},
299 85 vscale_{vdupq_n_f32(scale)},
300 85 vshift_{vdupq_n_f32(shift)} {}
301
302 6854 VectorType vector_path(VectorType src) {
303 6854 return vmlaq_f32(vshift_, src, vscale_);
304 }
305
306 // NOLINTBEGIN(readability-make-member-function-const)
307 76 ScalarType scalar_path(ScalarType src) { return src * scale_ + shift_; }
308 // NOLINTEND(readability-make-member-function-const)
309
310 private:
311 float scale_, shift_;
312 float32x4_t vscale_, vshift_;
313 }; // end of class ScaleFloat
314
315 // Specialization for float to float
316 template <>
317 97 kleidicv_error_t scale(const float *src, size_t src_stride, float *dst,
318 size_t dst_stride, size_t width, size_t height,
319 double scale, double shift) {
320
4/4
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 95 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 95 times.
97 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
321
4/4
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 93 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 93 times.
95 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
322
6/6
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 92 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 91 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 91 times.
93 CHECK_IMAGE_SIZE(width, height);
323
324 91 Rectangle rect{width, height};
325 91 Rows<const float> src_rows{src, src_stride};
326 91 Rows<float> dst_rows{dst, dst_stride};
327
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 85 times.
91 if (scale == 1.0) {
328 6 AddFloat operation(static_cast<float>(shift));
329 6 apply_operation_by_rows(operation, rect, src_rows, dst_rows);
330 6 } else {
331 85 ScaleFloat operation(static_cast<float>(scale), static_cast<float>(shift));
332 85 apply_operation_by_rows(operation, rect, src_rows, dst_rows);
333 85 }
334 91 return KLEIDICV_OK;
335 97 }
336
337 // -----------------------------------------------------------------------
338 // Scale uint8 to float16
339 // -----------------------------------------------------------------------
340
341 class ScaleUint8ToFloat16 {
342 public:
343 using SrcType = uint8_t;
344 using SrcVecTraits = neon::VecTraits<SrcType>;
345 using SrcVectorType = typename SrcVecTraits::VectorType;
346 using SrcVector2Type = typename SrcVecTraits::Vector2Type;
347 using DstType = float16_t;
348 using DstVecTraits = neon::VecTraits<DstType>;
349 using DstVectorType = typename DstVecTraits::VectorType;
350 using DstVector2Type = typename DstVecTraits::Vector2Type;
351 using DstVector4Type = typename DstVecTraits::Vector4Type;
352
353 96 ScaleUint8ToFloat16(float scale, float shift)
354 96 : scale_{scale},
355 96 shift_{shift},
356 96 vscale_{vdupq_n_f32(scale)},
357 96 vshift_{vdupq_n_f32(shift)} {}
358
359 191 void process_row(size_t width, Columns<const SrcType> src,
360 Columns<DstType> dst) {
361 382 LoopUnroll{width, SrcVecTraits::num_lanes()}
362 671 .unroll_twice([&](size_t step) {
363 480 SrcVector2Type src_2vec;
364 480 SrcVecTraits::load(&src[0], src_2vec);
365 480 DstVector2Type dst_2vec1 = vector_path(src_2vec.val[0]);
366 480 DstVector2Type dst_2vec2 = vector_path(src_2vec.val[1]);
367 480 DstVector4Type dst_4vec = {
368 1920 dst_2vec1.val[0],
369 480 dst_2vec1.val[1],
370 480 dst_2vec2.val[0],
371 480 dst_2vec2.val[1],
372 };
373 480 DstVecTraits::store(dst_4vec, &dst[0]);
374 480 src += ptrdiff_t(step);
375 480 dst += ptrdiff_t(step);
376 480 })
377 317 .remaining([&](size_t length, size_t) {
378
2/2
✓ Branch 0 taken 126 times.
✓ Branch 1 taken 1807 times.
1933 for (ptrdiff_t index = 0; index < static_cast<ptrdiff_t>(length);
379 1807 ++index) {
380 1807 disable_loop_vectorization();
381 1807 dst[index] = static_cast<float16_t>(
382 1807 static_cast<float>(src[index]) * scale_ + shift_);
383 1807 }
384 126 });
385 191 }
386
387 private:
388 960 DstVector2Type vector_path(SrcVectorType src) {
389 // For scaling, uint8 values have to be converted to uint32
390 // i.e. create four vectors from one
391 960 float32x4_t res0 = scale_shift(vqtbl1q_u8(src, kW0));
392 960 float32x4_t res1 = scale_shift(vqtbl1q_u8(src, kW1));
393 960 float32x4_t res2 = scale_shift(vqtbl1q_u8(src, kW2));
394 960 float32x4_t res3 = scale_shift(vqtbl1q_u8(src, kW3));
395 // Convert from 32-bit to 16-bit
396 960 float16x4_t res16_0 = vcvt_f16_f32(res0);
397 960 float16x4_t res16_2 = vcvt_f16_f32(res2);
398 DstVector2Type res;
399 960 res.val[0] = vcvt_high_f16_f32(res16_0, res1);
400 960 res.val[1] = vcvt_high_f16_f32(res16_2, res3);
401 return res;
402 960 }
403
404 // Convert from uint32 to float32 and scale it
405 3840 inline float32x4_t scale_shift(SrcVectorType src) {
406 3840 float32x4_t fx = vcvtq_f32_u32(vreinterpretq_u32_u8(src));
407 7680 return vmlaq_f32(vshift_, fx, vscale_);
408 3840 }
409
410 static constexpr SrcType kFF = std::numeric_limits<SrcType>::max();
411 // clang-format off
412 static constexpr uint8x16_t kW0 = { 0, kFF, kFF, kFF, 1, kFF, kFF, kFF, 2, kFF, kFF, kFF, 3, kFF, kFF, kFF};
413 static constexpr uint8x16_t kW1 = { 4, kFF, kFF, kFF, 5, kFF, kFF, kFF, 6, kFF, kFF, kFF, 7, kFF, kFF, kFF};
414 static constexpr uint8x16_t kW2 = { 8, kFF, kFF, kFF, 9, kFF, kFF, kFF, 10, kFF, kFF, kFF, 11, kFF, kFF, kFF};
415 static constexpr uint8x16_t kW3 = {12, kFF, kFF, kFF, 13, kFF, kFF, kFF, 14, kFF, kFF, kFF, 15, kFF, kFF, kFF};
416 // clang-format on
417
418 float scale_, shift_;
419 float32x4_t vscale_, vshift_;
420 }; // end of class ScaleUint8ToFloat16
421
422 // Specialization for uint8_t to float16_t
423 template <>
424 100 kleidicv_error_t scale(const uint8_t *src, size_t src_stride, float16_t *dst,
425 size_t dst_stride, size_t width, size_t height,
426 double scale, double shift) {
427
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 99 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 99 times.
100 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
428
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 98 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 98 times.
99 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
429
6/6
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 97 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 96 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 96 times.
98 CHECK_IMAGE_SIZE(width, height);
430
431 96 Rectangle rect{width, height};
432 96 Rows<const uint8_t> src_rows{src, src_stride};
433 96 Rows<float16_t> dst_rows{dst, dst_stride};
434 192 ScaleUint8ToFloat16 operation(static_cast<float>(scale),
435 96 static_cast<float>(shift));
436 96 zip_rows(operation, rect, src_rows, dst_rows);
437 96 return KLEIDICV_OK;
438 100 }
439
440 } // namespace kleidicv::neon
441