KleidiCV Coverage Report


Directory: ./
File: kleidicv/include/kleidicv/neon_intrinsics.h
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 129 129 100.0%
Functions: 161 161 100.0%
Branches: 0 0 -%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_NEON_INTRINSICS_H
6 #define KLEIDICV_NEON_INTRINSICS_H
7
8 #ifndef KLEIDICV_NEON_H
9 #error "Please include neon.h instead."
10 #endif
11
12 #include <arm_neon.h>
13
14 #include <cinttypes>
15
16 namespace kleidicv::neon {
17
18 // -----------------------------------------------------------------------------
19 // NEON binary operations
20 // -----------------------------------------------------------------------------
21
22 #define NEON_BINARY_OP_Q_B8_B16_B32_B64(name) \
23 static inline int8x16_t name(int8x16_t lhs, int8x16_t rhs) { \
24 return name##_s8(lhs, rhs); \
25 } \
26 \
27 static inline uint8x16_t name(uint8x16_t lhs, uint8x16_t rhs) { \
28 return name##_u8(lhs, rhs); \
29 } \
30 \
31 static inline int16x8_t name(int16x8_t lhs, int16x8_t rhs) { \
32 return name##_s16(lhs, rhs); \
33 } \
34 \
35 static inline uint16x8_t name(uint16x8_t lhs, uint16x8_t rhs) { \
36 return name##_u16(lhs, rhs); \
37 } \
38 \
39 static inline int32x4_t name(int32x4_t lhs, int32x4_t rhs) { \
40 return name##_s32(lhs, rhs); \
41 } \
42 \
43 static inline uint32x4_t name(uint32x4_t lhs, uint32x4_t rhs) { \
44 return name##_u32(lhs, rhs); \
45 } \
46 \
47 static inline int64x2_t name(int64x2_t lhs, int64x2_t rhs) { \
48 return name##_s64(lhs, rhs); \
49 } \
50 \
51 static inline uint64x2_t name(uint64x2_t lhs, uint64x2_t rhs) { \
52 return name##_u64(lhs, rhs); \
53 }
54
55 // Alphabetical order
56 1329567 NEON_BINARY_OP_Q_B8_B16_B32_B64(vaddq);
57 2319570 NEON_BINARY_OP_Q_B8_B16_B32_B64(vceqq);
58 903 NEON_BINARY_OP_Q_B8_B16_B32_B64(vcleq);
59 607 NEON_BINARY_OP_Q_B8_B16_B32_B64(vcgeq);
60 14492 NEON_BINARY_OP_Q_B8_B16_B32_B64(vqaddq);
61 25494 NEON_BINARY_OP_Q_B8_B16_B32_B64(vqsubq);
62 1542528 NEON_BINARY_OP_Q_B8_B16_B32_B64(vsubq);
63 6050012 NEON_BINARY_OP_Q_B8_B16_B32_B64(vtrn1q);
64 6050012 NEON_BINARY_OP_Q_B8_B16_B32_B64(vtrn2q);
65 7008 NEON_BINARY_OP_Q_B8_B16_B32_B64(vuzp1q);
66 7008 NEON_BINARY_OP_Q_B8_B16_B32_B64(vuzp2q);
67 109424 NEON_BINARY_OP_Q_B8_B16_B32_B64(vzip1q);
68 109424 NEON_BINARY_OP_Q_B8_B16_B32_B64(vzip2q);
69
70 #undef NEON_BINARY_OP_Q_B8_B16_B32_B64
71
72 #define NEON_BINARY_OP_Q_F32_F64(name) \
73 static inline float32x4_t name(float32x4_t lhs, float32x4_t rhs) { \
74 return name##_f32(lhs, rhs); \
75 } \
76 \
77 static inline float64x2_t name(float64x2_t lhs, float64x2_t rhs) { \
78 return name##_f64(lhs, rhs); \
79 }
80
81 552 NEON_BINARY_OP_Q_F32_F64(vaddq);
82
83 #undef NEON_BINARY_OP_Q_F32_F64
84
85 // clang-format off
86
87 // -----------------------------------------------------------------------------
88 // vaddv*
89 // -----------------------------------------------------------------------------
90
91 static inline int8_t vaddvq(int8x16_t vec) { return vaddvq_s8(vec); }
92 static inline uint8_t vaddvq(uint8x16_t vec) { return vaddvq_u8(vec); }
93 static inline int16_t vaddvq(int16x8_t vec) { return vaddvq_s16(vec); }
94 static inline uint16_t vaddvq(uint16x8_t vec) { return vaddvq_u16(vec); }
95 static inline int32_t vaddvq(int32x4_t vec) { return vaddvq_s32(vec); }
96 static inline uint32_t vaddvq(uint32x4_t vec) { return vaddvq_u32(vec); }
97 static inline int64_t vaddvq(int64x2_t vec) { return vaddvq_s64(vec); }
98 static inline uint64_t vaddvq(uint64x2_t vec) { return vaddvq_u64(vec); }
99 static inline float32_t vaddvq(float32x4_t vec) { return vaddvq_f32(vec); }
100 14 static inline float64_t vaddvq(float64x2_t vec) { return vaddvq_f64(vec); }
101
102 // -----------------------------------------------------------------------------
103 // vabd*
104 // -----------------------------------------------------------------------------
105
106 static inline int8x16_t vabdq(int8x16_t lhs, int8x16_t rhs) { return vabdq_s8(lhs, rhs); }
107 460 static inline uint8x16_t vabdq(uint8x16_t lhs, uint8x16_t rhs) { return vabdq_u8(lhs, rhs); }
108 static inline int16x8_t vabdq(int16x8_t lhs, int16x8_t rhs) { return vabdq_s16(lhs, rhs); }
109 918 static inline uint16x8_t vabdq(uint16x8_t lhs, uint16x8_t rhs) { return vabdq_u16(lhs, rhs); }
110 static inline int32x4_t vabdq(int32x4_t lhs, int32x4_t rhs) { return vabdq_s32(lhs, rhs); }
111 static inline uint32x4_t vabdq(uint32x4_t lhs, uint32x4_t rhs) { return vabdq_u32(lhs, rhs); }
112
113 // -----------------------------------------------------------------------------
114 // vand*
115 // -----------------------------------------------------------------------------
116
117 920 static inline uint8x16_t vandq(uint8x16_t lhs, uint8x16_t rhs) { return vandq_u8(lhs, rhs); }
118 19408 static inline uint16x8_t vandq(uint16x8_t lhs, uint16x8_t rhs) { return vandq_u16(lhs, rhs); }
119 237316 static inline uint32x4_t vandq(uint32x4_t lhs, uint32x4_t rhs) { return vandq_u32(lhs, rhs); }
120
121 // -----------------------------------------------------------------------------
122 // vqabs*
123 // -----------------------------------------------------------------------------
124
125 500 static inline int8x16_t vqabsq(int8x16_t vec) { return vqabsq_s8(vec); }
126 958 static inline int16x8_t vqabsq(int16x8_t vec) { return vqabsq_s16(vec); }
127 1852 static inline int32x4_t vqabsq(int32x4_t vec) { return vqabsq_s32(vec); }
128 static inline int64x2_t vqabsq(int64x2_t vec) { return vqabsq_s64(vec); }
129
130 // -----------------------------------------------------------------------------
131 // vaddl*
132 // -----------------------------------------------------------------------------
133
134 static inline int16x8_t vaddl(int8x8_t lhs, int8x8_t rhs) { return vaddl_s8(lhs, rhs); }
135 228 static inline uint16x8_t vaddl(uint8x8_t lhs, uint8x8_t rhs) { return vaddl_u8(lhs, rhs); }
136 static inline int32x4_t vaddl(int16x4_t lhs, int16x4_t rhs) { return vaddl_s16(lhs, rhs); }
137 static inline uint32x4_t vaddl(uint16x4_t lhs, uint16x4_t rhs) { return vaddl_u16(lhs, rhs); }
138 static inline int64x2_t vaddl(int32x2_t lhs, int32x2_t rhs) { return vaddl_s32(lhs, rhs); }
139 static inline uint64x2_t vaddl(uint32x2_t lhs, uint32x2_t rhs) { return vaddl_u32(lhs, rhs); }
140
141 // -----------------------------------------------------------------------------
142 // vbslq*
143 // -----------------------------------------------------------------------------
144
145 static inline int8x16_t vbslq(int8x16_t a, int8x16_t b, int8x16_t c) { return vbslq_s8(a, b, c); }
146 static inline uint8x16_t vbslq(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vbslq_u8(a, b, c); }
147 static inline int16x8_t vbslq(int16x8_t a, int16x8_t b, int16x8_t c) { return vbslq_s16(a, b, c); }
148 static inline uint16x8_t vbslq(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vbslq_u16(a, b, c); }
149 static inline int32x4_t vbslq(int32x4_t a, int32x4_t b, int32x4_t c) { return vbslq_s32(a, b, c); }
150 static inline uint32x4_t vbslq(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vbslq_u32(a, b, c); }
151 static inline float32x4_t vbslq(uint32x4_t a, float32x4_t b, float32x4_t c) { return vbslq_f32(a, b, c); }
152
153 // -----------------------------------------------------------------------------
154 // vget_high*
155 // -----------------------------------------------------------------------------
156
157 static inline int8x8_t vget_high(int8x16_t vec) { return vget_high_s8(vec); }
158 342 static inline uint8x8_t vget_high(uint8x16_t vec) { return vget_high_u8(vec); }
159 static inline int16x4_t vget_high(int16x8_t vec) { return vget_high_s16(vec); }
160 static inline uint16x4_t vget_high(uint16x8_t vec) { return vget_high_u16(vec); }
161 static inline int32x2_t vget_high(int32x4_t vec) { return vget_high_s32(vec); }
162 static inline uint32x2_t vget_high(uint32x4_t vec) { return vget_high_u32(vec); }
163 static inline int64x1_t vget_high(int64x2_t vec) { return vget_high_s64(vec); }
164 6898560 static inline uint64x1_t vget_high(uint64x2_t vec) { return vget_high_u64(vec); }
165 static inline float16x4_t vget_high(float16x8_t vec) { return vget_high_f16(vec); }
166 276 static inline float32x2_t vget_high(float32x4_t vec) { return vget_high_f32(vec); }
167 static inline float64x1_t vget_high(float64x2_t vec) { return vget_high_f64(vec); }
168
169 // -----------------------------------------------------------------------------
170 // vcgeq*
171 // -----------------------------------------------------------------------------
172
173 1828 static inline uint32x4_t vcgeq(float32x4_t lhs, float32x4_t rhs) { return vcgeq_f32(lhs, rhs); }
174
175 // -----------------------------------------------------------------------------
176 // vget_low*
177 // -----------------------------------------------------------------------------
178
179 992 static inline int8x8_t vget_low(int8x16_t vec) { return vget_low_s8(vec); }
180 40550 static inline uint8x8_t vget_low(uint8x16_t vec) { return vget_low_u8(vec); }
181 1904 static inline int16x4_t vget_low(int16x8_t vec) { return vget_low_s16(vec); }
182 1828 static inline uint16x4_t vget_low(uint16x8_t vec) { return vget_low_u16(vec); }
183 3692 static inline int32x2_t vget_low(int32x4_t vec) { return vget_low_s32(vec); }
184 static inline uint32x2_t vget_low(uint32x4_t vec) { return vget_low_u32(vec); }
185 static inline int64x1_t vget_low(int64x2_t vec) { return vget_low_s64(vec); }
186 6898560 static inline uint64x1_t vget_low(uint64x2_t vec) { return vget_low_u64(vec); }
187 static inline float16x4_t vget_low(float16x8_t vec) { return vget_low_f16(vec); }
188 276 static inline float32x2_t vget_low(float32x4_t vec) { return vget_low_f32(vec); }
189 static inline float64x1_t vget_low(float64x2_t vec) { return vget_low_f64(vec); }
190
191 // -----------------------------------------------------------------------------
192 // vminq*
193 // -----------------------------------------------------------------------------
194
195 249264 static inline int8x16_t vminq(int8x16_t lhs, int8x16_t rhs) { return vminq_s8(lhs, rhs); }
196 324642 static inline uint8x16_t vminq(uint8x16_t lhs, uint8x16_t rhs) { return vminq_u8(lhs, rhs); }
197 603950 static inline int16x8_t vminq(int16x8_t lhs, int16x8_t rhs) { return vminq_s16(lhs, rhs); }
198 603950 static inline uint16x8_t vminq(uint16x8_t lhs, uint16x8_t rhs) { return vminq_u16(lhs, rhs); }
199 878032 static inline int32x4_t vminq(int32x4_t lhs, int32x4_t rhs) { return vminq_s32(lhs, rhs); }
200 877324 static inline uint32x4_t vminq(uint32x4_t lhs, uint32x4_t rhs) { return vminq_u32(lhs, rhs); }
201 1208628 static inline float32x4_t vminq(float32x4_t lhs, float32x4_t rhs) { return vminq_f32(lhs, rhs); }
202
203 // -----------------------------------------------------------------------------
204 // vmaxq*
205 // -----------------------------------------------------------------------------
206
207 248536 static inline int8x16_t vmaxq(int8x16_t lhs, int8x16_t rhs) { return vmaxq_s8(lhs, rhs); }
208 323710 static inline uint8x16_t vmaxq(uint8x16_t lhs, uint8x16_t rhs) { return vmaxq_u8(lhs, rhs); }
209 602258 static inline int16x8_t vmaxq(int16x8_t lhs, int16x8_t rhs) { return vmaxq_s16(lhs, rhs); }
210 602258 static inline uint16x8_t vmaxq(uint16x8_t lhs, uint16x8_t rhs) { return vmaxq_u16(lhs, rhs); }
211 875484 static inline int32x4_t vmaxq(int32x4_t lhs, int32x4_t rhs) { return vmaxq_s32(lhs, rhs); }
212 874776 static inline uint32x4_t vmaxq(uint32x4_t lhs, uint32x4_t rhs) { return vmaxq_u32(lhs, rhs); }
213 1205196 static inline float32x4_t vmaxq(float32x4_t lhs, float32x4_t rhs) { return vmaxq_f32(lhs, rhs); }
214
215 // -----------------------------------------------------------------------------
216 // vminvq*
217 // -----------------------------------------------------------------------------
218
219 86 static inline int8_t vminvq(int8x16_t src) { return vminvq_s8(src); }
220 406 static inline uint8_t vminvq(uint8x16_t src) { return vminvq_u8(src); }
221 86 static inline int16_t vminvq(int16x8_t src) { return vminvq_s16(src); }
222 86 static inline uint16_t vminvq(uint16x8_t src) { return vminvq_u16(src); }
223 86 static inline int32_t vminvq(int32x4_t src) { return vminvq_s32(src); }
224 static inline uint32_t vminvq(uint32x4_t src) { return vminvq_u32(src); }
225 88 static inline float32_t vminvq(float32x4_t src) { return vminvq_f32(src); }
226
227 // -----------------------------------------------------------------------------
228 // vmaxvq*
229 // -----------------------------------------------------------------------------
230
231 86 static inline int8_t vmaxvq(int8x16_t src) { return vmaxvq_s8(src); }
232 86 static inline uint8_t vmaxvq(uint8x16_t src) { return vmaxvq_u8(src); }
233 86 static inline int16_t vmaxvq(int16x8_t src) { return vmaxvq_s16(src); }
234 86 static inline uint16_t vmaxvq(uint16x8_t src) { return vmaxvq_u16(src); }
235 86 static inline int32_t vmaxvq(int32x4_t src) { return vmaxvq_s32(src); }
236 static inline uint32_t vmaxvq(uint32x4_t src) { return vmaxvq_u32(src); }
237 88 static inline float32_t vmaxvq(float32x4_t src) { return vmaxvq_f32(src); }
238
239 // -----------------------------------------------------------------------------
240 // vcleq*
241 // -----------------------------------------------------------------------------
242
243 1828 static inline uint32x4_t vcleq(float32x4_t lhs, float32x4_t rhs) { return vcleq_f32(lhs, rhs); }
244
245 // -----------------------------------------------------------------------------
246 // vrshrn_n*
247 // -----------------------------------------------------------------------------
248
249 template <int n> static inline int8x8_t vrshrn_n(int16x8_t vec) { return vrshrn_n_s16(vec, n); }
250 176 template <int n> static inline uint8x8_t vrshrn_n(uint16x8_t vec) { return vrshrn_n_u16(vec, n); }
251 template <int n> static inline int16x4_t vrshrn_n(int32x4_t vec) { return vrshrn_n_s32(vec, n); }
252 template <int n> static inline uint16x4_t vrshrn_n(uint32x4_t vec) { return vrshrn_n_u32(vec, n); }
253 template <int n> static inline int32x2_t vrshrn_n(int64x2_t vec) { return vrshrn_n_s64(vec, n); }
254 template <int n> static inline uint32x2_t vrshrn_n(uint64x2_t vec) { return vrshrn_n_u64(vec, n); }
255
256 // -----------------------------------------------------------------------------
257 // vshrq_n*
258 // -----------------------------------------------------------------------------
259
260 template <int n> static inline int8x16_t vshrq_n(int8x16_t vec) { return vshrq_n_s8(vec, n); }
261 116460 template <int n> static inline uint8x16_t vshrq_n(uint8x16_t vec) { return vshrq_n_u8(vec, n); }
262 template <int n> static inline int16x8_t vshrq_n(int16x8_t vec) { return vshrq_n_s16(vec, n); }
263 template <int n> static inline uint16x8_t vshrq_n(uint16x8_t vec) { return vshrq_n_u16(vec, n); }
264 template <int n> static inline int32x4_t vshrq_n(int32x4_t vec) { return vshrq_n_s32(vec, n); }
265 template <int n> static inline uint32x4_t vshrq_n(uint32x4_t vec) { return vshrq_n_u32(vec, n); }
266 template <int n> static inline int64x2_t vshrq_n(int64x2_t vec) { return vshrq_n_s64(vec, n); }
267 template <int n> static inline uint64x2_t vshrq_n(uint64x2_t vec) { return vshrq_n_u64(vec, n); }
268
269 // -----------------------------------------------------------------------------
270
271 // -----------------------------------------------------------------------------
272 // vshll_n*
273 // -----------------------------------------------------------------------------
274
275 template <int n> static inline int16x8_t vshll_n(int8x8_t vec) { return vshll_n_s8(vec, n); }
276 228 template <int n> static inline uint16x8_t vshll_n(uint8x8_t vec) { return vshll_n_u8(vec, n); }
277 template <int n> static inline int32x4_t vshll_n(int16x4_t vec) { return vshll_n_s16(vec, n); }
278 template <int n> static inline uint32x4_t vshll_n(uint16x4_t vec) { return vshll_n_u16(vec, n); }
279 template <int n> static inline int64x2_t vshll_n(int32x2_t vec) { return vshll_n_s32(vec, n); }
280 template <int n> static inline uint64x2_t vshll_n(uint32x2_t vec) { return vshll_n_u32(vec, n); }
281
282 // -----------------------------------------------------------------------------
283 // vshlq_n*
284 // -----------------------------------------------------------------------------
285
286 template <int n> static inline int8x16_t vshlq_n(int8x16_t vec) { return vshlq_n_s8(vec, n); }
287 template <int n> static inline uint8x16_t vshlq_n(uint8x16_t vec) { return vshlq_n_u8(vec, n); }
288 template <int n> static inline int16x8_t vshlq_n(int16x8_t vec) { return vshlq_n_s16(vec, n); }
289 176 template <int n> static inline uint16x8_t vshlq_n(uint16x8_t vec) { return vshlq_n_u16(vec, n); }
290 template <int n> static inline int32x4_t vshlq_n(int32x4_t vec) { return vshlq_n_s32(vec, n); }
291 template <int n> static inline uint32x4_t vshlq_n(uint32x4_t vec) { return vshlq_n_u32(vec, n); }
292 template <int n> static inline int64x2_t vshlq_n(int64x2_t vec) { return vshlq_n_s64(vec, n); }
293 template <int n> static inline uint64x2_t vshlq_n(uint64x2_t vec) { return vshlq_n_u64(vec, n); }
294
295 // -----------------------------------------------------------------------------
296 // vdupq*
297 // -----------------------------------------------------------------------------
298
299 296 static inline int8x16_t vdupq_n(int8_t src) { return vdupq_n_s8(src); }
300 8644 static inline uint8x16_t vdupq_n(uint8_t src) { return vdupq_n_u8(src); }
301 296 static inline int16x8_t vdupq_n(int16_t src) { return vdupq_n_s16(src); }
302 296 static inline uint16x8_t vdupq_n(uint16_t src) { return vdupq_n_u16(src); }
303 5823 static inline int32x4_t vdupq_n(int32_t src) { return vdupq_n_s32(src); }
304 2624 static inline uint32x4_t vdupq_n(uint32_t src) { return vdupq_n_u32(src); }
305 static inline int64x2_t vdupq_n(int64_t src) { return vdupq_n_s64(src); }
306 static inline uint64x2_t vdupq_n(uint64_t src) { return vdupq_n_u64(src); }
307 43437 static inline float32x4_t vdupq_n(float32_t src) { return vdupq_n_f32(src); }
308
309 // -----------------------------------------------------------------------------
310 // vmull*
311 // -----------------------------------------------------------------------------
312
313 496 static inline int16x8_t vmull(int8x8_t lhs, int8x8_t rhs) { return vmull_s8(lhs, rhs); }
314 456 static inline uint16x8_t vmull(uint8x8_t lhs, uint8x8_t rhs) { return vmull_u8(lhs, rhs); }
315 952 static inline int32x4_t vmull(int16x4_t lhs, int16x4_t rhs) { return vmull_s16(lhs, rhs); }
316 914 static inline uint32x4_t vmull(uint16x4_t lhs, uint16x4_t rhs) { return vmull_u16(lhs, rhs); }
317 1846 static inline int64x2_t vmull(int32x2_t lhs, int32x2_t rhs) { return vmull_s32(lhs, rhs); }
318
319 // -----------------------------------------------------------------------------
320 // vmull_high*
321 // -----------------------------------------------------------------------------
322
323 496 static inline int16x8_t vmull_high(int8x16_t lhs, int8x16_t rhs) { return vmull_high_s8(lhs, rhs); }
324 456 static inline uint16x8_t vmull_high(uint8x16_t lhs, uint8x16_t rhs) { return vmull_high_u8(lhs, rhs); }
325 952 static inline int32x4_t vmull_high(int16x8_t lhs, int16x8_t rhs) { return vmull_high_s16(lhs, rhs); }
326 914 static inline uint32x4_t vmull_high(uint16x8_t lhs, uint16x8_t rhs) { return vmull_high_u16(lhs, rhs); }
327 1846 static inline int64x2_t vmull_high(int32x4_t lhs, int32x4_t rhs) { return vmull_high_s32(lhs, rhs); }
328
329 // -----------------------------------------------------------------------------
330 // vqmovn*
331 // -----------------------------------------------------------------------------
332
333 1447 static inline int8x8_t vqmovn(int16x8_t src) { return vqmovn_s16(src); }
334 1407 static inline uint8x8_t vqmovn(uint16x8_t src) { return vqmovn_u16(src); }
335 2854 static inline int16x4_t vqmovn(int32x4_t src) { return vqmovn_s32(src); }
336 2816 static inline uint16x4_t vqmovn(uint32x4_t src) { return vqmovn_u32(src); }
337 1846 static inline int32x2_t vqmovn(int64x2_t src) { return vqmovn_s64(src); }
338
339 // -----------------------------------------------------------------------------
340 // vqmovn_high*
341 // -----------------------------------------------------------------------------
342
343 496 static inline int8x16_t vqmovn_high(int8x8_t low, int16x8_t src) { return vqmovn_high_s16(low, src); }
344 456 static inline uint8x16_t vqmovn_high(uint8x8_t low, uint16x8_t src) { return vqmovn_high_u16(low, src); }
345 952 static inline int16x8_t vqmovn_high(int16x4_t low, int32x4_t src) { return vqmovn_high_s32(low, src); }
346 914 static inline uint16x8_t vqmovn_high(uint16x4_t low, uint32x4_t src) { return vqmovn_high_u32(low, src); }
347 1846 static inline int32x4_t vqmovn_high(int32x2_t low, int64x2_t src) { return vqmovn_high_s64(low, src); }
348
349 // -----------------------------------------------------------------------------
350 // NEON load operations
351 // -----------------------------------------------------------------------------
352
353 57020 static inline int8x16_t vld1q(const int8_t *src) { return vld1q_s8(src); }
354 1118577 static inline uint8x16_t vld1q(const uint8_t *src) { return vld1q_u8(src); }
355 151919 static inline int16x8_t vld1q(const int16_t *src) { return vld1q_s16(src); }
356 1086959 static inline uint16x8_t vld1q(const uint16_t *src) { return vld1q_u16(src); }
357 205189 static inline int32x4_t vld1q(const int32_t *src) { return vld1q_s32(src); }
358 2047048 static inline uint32x4_t vld1q(const uint32_t *src) { return vld1q_u32(src); }
359 15120 static inline int64x2_t vld1q(const int64_t *src) { return vld1q_s64(src); }
360 3700952 static inline uint64x2_t vld1q(const uint64_t *src) { return vld1q_u64(src); }
361 static inline float16x8_t vld1q(const float16_t *src) { return vld1q_f16(src); }
362 310866 static inline float32x4_t vld1q(const float32_t *src) { return vld1q_f32(src); }
363 static inline float64x2_t vld1q(const float64_t *src) { return vld1q_f64(src); }
364
365 static inline int8x16x2_t vld2q(const int8_t *src) { return vld2q_s8(src); }
366 static inline uint8x16x2_t vld2q(const uint8_t *src) { return vld2q_u8(src); }
367 static inline int16x8x2_t vld2q(const int16_t *src) { return vld2q_s16(src); }
368 static inline uint16x8x2_t vld2q(const uint16_t *src) { return vld2q_u16(src); }
369 static inline int32x4x2_t vld2q(const int32_t *src) { return vld2q_s32(src); }
370 static inline uint32x4x2_t vld2q(const uint32_t *src) { return vld2q_u32(src); }
371 static inline int64x2x2_t vld2q(const int64_t *src) { return vld2q_s64(src); }
372 static inline uint64x2x2_t vld2q(const uint64_t *src) { return vld2q_u64(src); }
373 static inline float32x4x2_t vld2q(const float32_t *src) { return vld2q_f32(src); }
374
375 static inline int8x16x3_t vld3q(const int8_t *src) { return vld3q_s8(src); }
376 static inline uint8x16x3_t vld3q(const uint8_t *src) { return vld3q_u8(src); }
377 static inline int16x8x3_t vld3q(const int16_t *src) { return vld3q_s16(src); }
378 static inline uint16x8x3_t vld3q(const uint16_t *src) { return vld3q_u16(src); }
379 static inline int32x4x3_t vld3q(const int32_t *src) { return vld3q_s32(src); }
380 static inline uint32x4x3_t vld3q(const uint32_t *src) { return vld3q_u32(src); }
381 static inline int64x2x3_t vld3q(const int64_t *src) { return vld3q_s64(src); }
382 static inline uint64x2x3_t vld3q(const uint64_t *src) { return vld3q_u64(src); }
383 static inline float32x4x3_t vld3q(const float32_t *src) { return vld3q_f32(src); }
384
385 static inline int8x16x4_t vld4q(const int8_t *src) { return vld4q_s8(src); }
386 static inline uint8x16x4_t vld4q(const uint8_t *src) { return vld4q_u8(src); }
387 static inline int16x8x4_t vld4q(const int16_t *src) { return vld4q_s16(src); }
388 static inline uint16x8x4_t vld4q(const uint16_t *src) { return vld4q_u16(src); }
389 static inline int32x4x4_t vld4q(const int32_t *src) { return vld4q_s32(src); }
390 static inline uint32x4x4_t vld4q(const uint32_t *src) { return vld4q_u32(src); }
391 static inline int64x2x4_t vld4q(const int64_t *src) { return vld4q_s64(src); }
392 static inline uint64x2x4_t vld4q(const uint64_t *src) { return vld4q_u64(src); }
393 static inline float32x4x4_t vld4q(const float32_t *src) { return vld4q_f32(src); }
394
395 // -----------------------------------------------------------------------------
396 // NEON store operations
397 // -----------------------------------------------------------------------------
398
399 951 static inline void vst1(int8_t *dst, int8x8_t vec) { vst1_s8(dst, vec); }
400 2071 static inline void vst1(uint8_t *dst, uint8x8_t vec) { vst1_u8(dst, vec); }
401 static inline void vst1(int16_t *dst, int16x4_t vec) { vst1_s16(dst, vec); }
402 static inline void vst1(uint16_t *dst, uint16x4_t vec) { vst1_u16(dst, vec); }
403 static inline void vst1(int32_t *dst, int32x2_t vec) { vst1_s32(dst, vec); }
404 static inline void vst1(uint32_t *dst, uint32x2_t vec) { vst1_u32(dst, vec); }
405 static inline void vst1(int64_t *dst, int64x1_t vec) { vst1_s64(dst, vec); }
406 static inline void vst1(uint64_t *dst, uint64x1_t vec) { vst1_u64(dst, vec); }
407
408 3884 static inline void vst1q(int8_t *dst, int8x16_t vec) { vst1q_s8(dst, vec); }
409 504261 static inline void vst1q(uint8_t *dst, uint8x16_t vec) { vst1q_u8(dst, vec); }
410 22204 static inline void vst1q(int16_t *dst, int16x8_t vec) { vst1q_s16(dst, vec); }
411 941930 static inline void vst1q(uint16_t *dst, uint16x8_t vec) { vst1q_u16(dst, vec); }
412 13430 static inline void vst1q(int32_t *dst, int32x4_t vec) { vst1q_s32(dst, vec); }
413 1855000 static inline void vst1q(uint32_t *dst, uint32x4_t vec) { vst1q_u32(dst, vec); }
414 7560 static inline void vst1q(int64_t *dst, int64x2_t vec) { vst1q_s64(dst, vec); }
415 3688560 static inline void vst1q(uint64_t *dst, uint64x2_t vec) { vst1q_u64(dst, vec); }
416 100370 static inline void vst1q(float32_t *dst, float32x4_t vec) { vst1q_f32(dst, vec); }
417
418 static inline void vst2q(int8_t *dst, int8x16x2_t vec) { vst2q_s8(dst, vec); }
419 static inline void vst2q(uint8_t *dst, uint8x16x2_t vec) { vst2q_u8(dst, vec); }
420 static inline void vst2q(int16_t *dst, int16x8x2_t vec) { vst2q_s16(dst, vec); }
421 static inline void vst2q(uint16_t *dst, uint16x8x2_t vec) { vst2q_u16(dst, vec); }
422 static inline void vst2q(int32_t *dst, int32x4x2_t vec) { vst2q_s32(dst, vec); }
423 static inline void vst2q(uint32_t *dst, uint32x4x2_t vec) { vst2q_u32(dst, vec); }
424 static inline void vst2q(int64_t *dst, int64x2x2_t vec) { vst2q_s64(dst, vec); }
425 static inline void vst2q(uint64_t *dst, uint64x2x2_t vec) { vst2q_u64(dst, vec); }
426 static inline void vst2q(float32_t *dst, float32x4x2_t vec) { vst2q_f32(dst, vec); }
427
428 static inline void vst3q(int8_t *dst, int8x16x3_t vec) { vst3q_s8(dst, vec); }
429 static inline void vst3q(uint8_t *dst, uint8x16x3_t vec) { vst3q_u8(dst, vec); }
430 static inline void vst3q(int16_t *dst, int16x8x3_t vec) { vst3q_s16(dst, vec); }
431 static inline void vst3q(uint16_t *dst, uint16x8x3_t vec) { vst3q_u16(dst, vec); }
432 static inline void vst3q(int32_t *dst, int32x4x3_t vec) { vst3q_s32(dst, vec); }
433 static inline void vst3q(uint32_t *dst, uint32x4x3_t vec) { vst3q_u32(dst, vec); }
434 static inline void vst3q(int64_t *dst, int64x2x3_t vec) { vst3q_s64(dst, vec); }
435 static inline void vst3q(uint64_t *dst, uint64x2x3_t vec) { vst3q_u64(dst, vec); }
436 static inline void vst3q(float32_t *dst, float32x4x3_t vec) { vst3q_f32(dst, vec); }
437
438 static inline void vst4q(int8_t *dst, int8x16x4_t vec) { vst4q_s8(dst, vec); }
439 static inline void vst4q(uint8_t *dst, uint8x16x4_t vec) { vst4q_u8(dst, vec); }
440 static inline void vst4q(int16_t *dst, int16x8x4_t vec) { vst4q_s16(dst, vec); }
441 static inline void vst4q(uint16_t *dst, uint16x8x4_t vec) { vst4q_u16(dst, vec); }
442 static inline void vst4q(int32_t *dst, int32x4x4_t vec) { vst4q_s32(dst, vec); }
443 static inline void vst4q(uint32_t *dst, uint32x4x4_t vec) { vst4q_u32(dst, vec); }
444 static inline void vst4q(int64_t *dst, int64x2x4_t vec) { vst4q_s64(dst, vec); }
445 static inline void vst4q(uint64_t *dst, uint64x2x4_t vec) { vst4q_u64(dst, vec); }
446 static inline void vst4q(float32_t *dst, float32x4x4_t vec) { vst4q_f32(dst, vec); }
447
448 // -----------------------------------------------------------------------------
449 // vreinterpret*
450 // -----------------------------------------------------------------------------
451
452 static inline uint8x16_t vreinterpretq_u8(int8x16_t vec) { return vreinterpretq_u8_s8(vec); }
453 1296 static inline uint8x16_t vreinterpretq_u8(uint8x16_t vec) { return vec; }
454 22400 static inline uint8x16_t vreinterpretq_u8(int16x8_t vec) { return vreinterpretq_u8_s16(vec); }
455 44496 static inline uint8x16_t vreinterpretq_u8(uint16x8_t vec) { return vreinterpretq_u8_u16(vec); }
456 static inline uint8x16_t vreinterpretq_u8(int32x4_t vec) { return vreinterpretq_u8_s32(vec); }
457 static inline uint8x16_t vreinterpretq_u8(uint32x4_t vec) { return vreinterpretq_u8_u32(vec); }
458 static inline uint8x16_t vreinterpretq_u8(int64x2_t vec) { return vreinterpretq_u8_s64(vec); }
459 static inline uint8x16_t vreinterpretq_u8(uint64x2_t vec) { return vreinterpretq_u8_u64(vec); }
460
461 static inline uint64x2_t vreinterpretq_u64(int8x16_t vec) { return vreinterpretq_u64_s8(vec); }
462 460340 static inline uint64x2_t vreinterpretq_u64(uint8x16_t vec) { return vreinterpretq_u64_u8(vec); }
463 static inline uint64x2_t vreinterpretq_u64(int16x8_t vec) { return vreinterpretq_u64_s16(vec); }
464 920448 static inline uint64x2_t vreinterpretq_u64(uint16x8_t vec) { return vreinterpretq_u64_u16(vec); }
465 static inline uint64x2_t vreinterpretq_u64(int32x4_t vec) { return vreinterpretq_u64_s32(vec); }
466 1839744 static inline uint64x2_t vreinterpretq_u64(uint32x4_t vec) { return vreinterpretq_u64_u32(vec); }
467 static inline uint64x2_t vreinterpretq_u64(int64x2_t vec) { return vreinterpretq_u64_s64(vec); }
468 3678336 static inline uint64x2_t vreinterpretq_u64(uint64x2_t vec) { return vec; }
469
470 // -----------------------------------------------------------------------------
471 // vcombine*
472 // -----------------------------------------------------------------------------
473
474 static inline int8x16_t vcombine(int8x8_t lhs, int8x8_t rhs) { return vcombine_s8(lhs, rhs); }
475 39296 static inline uint8x16_t vcombine(uint8x8_t lhs, uint8x8_t rhs) { return vcombine_u8(lhs, rhs); }
476 951 static inline int16x8_t vcombine(int16x4_t lhs, int16x4_t rhs) { return vcombine_s16(lhs, rhs); }
477 77623 static inline uint16x8_t vcombine(uint16x4_t lhs, uint16x4_t rhs) { return vcombine_u16(lhs, rhs); }
478 static inline int32x4_t vcombine(int32x2_t lhs, int32x2_t rhs) { return vcombine_s32(lhs, rhs); }
479 static inline uint32x4_t vcombine(uint32x2_t lhs, uint32x2_t rhs) { return vcombine_u32(lhs, rhs); }
480 static inline int64x2_t vcombine(int64x1_t lhs, int64x1_t rhs) { return vcombine_s64(lhs, rhs); }
481 6898560 static inline uint64x2_t vcombine(uint64x1_t lhs, uint64x1_t rhs) { return vcombine_u64(lhs, rhs); }
482
483 // -----------------------------------------------------------------------------
484 // vrev*
485 // -----------------------------------------------------------------------------
486
487 static inline int8x16_t vrev64q(int8x16_t src) { return vrev64q_s8(src); }
488 460032 static inline uint8x16_t vrev64q(uint8x16_t src) { return vrev64q_u8(src); }
489 static inline int16x8_t vrev64q(int16x8_t src) { return vrev64q_s16(src); }
490 920448 static inline uint16x8_t vrev64q(uint16x8_t src) { return vrev64q_u16(src); }
491 static inline int32x4_t vrev64q(int32x4_t src) { return vrev64q_s32(src); }
492 1839744 static inline uint32x4_t vrev64q(uint32x4_t src) { return vrev64q_u32(src); }
493 static inline int64x2_t vrev64q(int64x2_t src) { return src; }
494 3678336 static inline uint64x2_t vrev64q(uint64x2_t src) { return src; }
495
496 // -----------------------------------------------------------------------------
497 // vcvt*
498 // -----------------------------------------------------------------------------
499
500 552 static inline float64x2_t vcvt_f64(float32x2_t vec) { return vcvt_f64_f32(vec); }
501
502 // clang-format on
503
504 } // namespace kleidicv::neon
505
506 #endif // KLEIDICV_NEON_INTRINSICS_H
507