// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
//
// SPDX-License-Identifier: Apache-2.0

#ifndef KLEIDICV_NEON_INTRINSICS_H
#define KLEIDICV_NEON_INTRINSICS_H

#ifndef KLEIDICV_NEON_H
#error "Please include neon.h instead."
#endif

#include <arm_neon.h>

#include <cinttypes>

namespace kleidicv::neon {

// -----------------------------------------------------------------------------
// NEON binary operations
// -----------------------------------------------------------------------------

#define NEON_BINARY_OP_Q_B8_B16_B32_B64(name)                        \
  static inline int8x16_t name(int8x16_t lhs, int8x16_t rhs) {      \
    return name##_s8(lhs, rhs);                                     \
  }                                                                 \
                                                                    \
  static inline uint8x16_t name(uint8x16_t lhs, uint8x16_t rhs) {   \
    return name##_u8(lhs, rhs);                                     \
  }                                                                 \
                                                                    \
  static inline int16x8_t name(int16x8_t lhs, int16x8_t rhs) {      \
    return name##_s16(lhs, rhs);                                    \
  }                                                                 \
                                                                    \
  static inline uint16x8_t name(uint16x8_t lhs, uint16x8_t rhs) {   \
    return name##_u16(lhs, rhs);                                    \
  }                                                                 \
                                                                    \
  static inline int32x4_t name(int32x4_t lhs, int32x4_t rhs) {      \
    return name##_s32(lhs, rhs);                                    \
  }                                                                 \
                                                                    \
  static inline uint32x4_t name(uint32x4_t lhs, uint32x4_t rhs) {   \
    return name##_u32(lhs, rhs);                                    \
  }                                                                 \
                                                                    \
  static inline int64x2_t name(int64x2_t lhs, int64x2_t rhs) {      \
    return name##_s64(lhs, rhs);                                    \
  }                                                                 \
                                                                    \
  static inline uint64x2_t name(uint64x2_t lhs, uint64x2_t rhs) {   \
    return name##_u64(lhs, rhs);                                    \
  }

// Alphabetical order
NEON_BINARY_OP_Q_B8_B16_B32_B64(vaddq);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vceqq);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vcgeq);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vcleq);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vqaddq);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vqsubq);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vsubq);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vtrn1q);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vtrn2q);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vuzp1q);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vuzp2q);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vzip1q);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vzip2q);

#undef NEON_BINARY_OP_Q_B8_B16_B32_B64

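// Illustrative sketch (not part of the library): each wrapper is an overload
// on the vector type, so a kernel can be written once and instantiated for
// every lane width. For example, an absolute-difference helper for the
// unsigned vector types, built only from the wrappers above:
//
//   template <typename VectorType>
//   VectorType absdiff(VectorType lhs, VectorType rhs) {
//     // One saturating subtraction is zero, the other is |lhs - rhs|.
//     return vqaddq(vqsubq(lhs, rhs), vqsubq(rhs, lhs));
//   }
//
// `absdiff` is a hypothetical name; overload resolution selects the matching
// `_u8`/`_u16`/`_u32`/`_u64` intrinsic for each instantiation.
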
#define NEON_BINARY_OP_Q_F32_F64(name)                                \
  static inline float32x4_t name(float32x4_t lhs, float32x4_t rhs) {  \
    return name##_f32(lhs, rhs);                                      \
  }                                                                   \
                                                                      \
  static inline float64x2_t name(float64x2_t lhs, float64x2_t rhs) {  \
    return name##_f64(lhs, rhs);                                      \
  }

NEON_BINARY_OP_Q_F32_F64(vaddq);

#undef NEON_BINARY_OP_Q_F32_F64

// clang-format off

// -----------------------------------------------------------------------------
// vaddv*
// -----------------------------------------------------------------------------

static inline int8_t vaddvq(int8x16_t vec) { return vaddvq_s8(vec); }
static inline uint8_t vaddvq(uint8x16_t vec) { return vaddvq_u8(vec); }
static inline int16_t vaddvq(int16x8_t vec) { return vaddvq_s16(vec); }
static inline uint16_t vaddvq(uint16x8_t vec) { return vaddvq_u16(vec); }
static inline int32_t vaddvq(int32x4_t vec) { return vaddvq_s32(vec); }
static inline uint32_t vaddvq(uint32x4_t vec) { return vaddvq_u32(vec); }
static inline int64_t vaddvq(int64x2_t vec) { return vaddvq_s64(vec); }
static inline uint64_t vaddvq(uint64x2_t vec) { return vaddvq_u64(vec); }
static inline float32_t vaddvq(float32x4_t vec) { return vaddvq_f32(vec); }
static inline float64_t vaddvq(float64x2_t vec) { return vaddvq_f64(vec); }

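// Illustrative sketch (not part of the library): vaddvq reduces a vector to a
// single scalar, typically as the last step of a vectorized sum. The
// vpaddlq_* widening steps below are raw ACLE intrinsics, not wrappers from
// this header:
//
//   uint64_t sum_16_bytes(const uint8_t *src) {
//     // Summing u8 lanes directly would wrap above 255, so widen pairwise.
//     uint8x16_t bytes = vld1q(src);
//     return vaddvq(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes))));
//   }
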
// -----------------------------------------------------------------------------
// vabd*
// -----------------------------------------------------------------------------

static inline int8x16_t vabdq(int8x16_t lhs, int8x16_t rhs) { return vabdq_s8(lhs, rhs); }
static inline uint8x16_t vabdq(uint8x16_t lhs, uint8x16_t rhs) { return vabdq_u8(lhs, rhs); }
static inline int16x8_t vabdq(int16x8_t lhs, int16x8_t rhs) { return vabdq_s16(lhs, rhs); }
static inline uint16x8_t vabdq(uint16x8_t lhs, uint16x8_t rhs) { return vabdq_u16(lhs, rhs); }
static inline int32x4_t vabdq(int32x4_t lhs, int32x4_t rhs) { return vabdq_s32(lhs, rhs); }
static inline uint32x4_t vabdq(uint32x4_t lhs, uint32x4_t rhs) { return vabdq_u32(lhs, rhs); }

// -----------------------------------------------------------------------------
// vand*
// -----------------------------------------------------------------------------

static inline uint8x16_t vandq(uint8x16_t lhs, uint8x16_t rhs) { return vandq_u8(lhs, rhs); }
static inline uint16x8_t vandq(uint16x8_t lhs, uint16x8_t rhs) { return vandq_u16(lhs, rhs); }
static inline uint32x4_t vandq(uint32x4_t lhs, uint32x4_t rhs) { return vandq_u32(lhs, rhs); }

// -----------------------------------------------------------------------------
// vqabs*
// -----------------------------------------------------------------------------

static inline int8x16_t vqabsq(int8x16_t vec) { return vqabsq_s8(vec); }
static inline int16x8_t vqabsq(int16x8_t vec) { return vqabsq_s16(vec); }
static inline int32x4_t vqabsq(int32x4_t vec) { return vqabsq_s32(vec); }
static inline int64x2_t vqabsq(int64x2_t vec) { return vqabsq_s64(vec); }

// -----------------------------------------------------------------------------
// vaddl*
// -----------------------------------------------------------------------------

static inline int16x8_t vaddl(int8x8_t lhs, int8x8_t rhs) { return vaddl_s8(lhs, rhs); }
static inline uint16x8_t vaddl(uint8x8_t lhs, uint8x8_t rhs) { return vaddl_u8(lhs, rhs); }
static inline int32x4_t vaddl(int16x4_t lhs, int16x4_t rhs) { return vaddl_s16(lhs, rhs); }
static inline uint32x4_t vaddl(uint16x4_t lhs, uint16x4_t rhs) { return vaddl_u16(lhs, rhs); }
static inline int64x2_t vaddl(int32x2_t lhs, int32x2_t rhs) { return vaddl_s32(lhs, rhs); }
static inline uint64x2_t vaddl(uint32x2_t lhs, uint32x2_t rhs) { return vaddl_u32(lhs, rhs); }

// -----------------------------------------------------------------------------
// vbslq*
// -----------------------------------------------------------------------------

static inline int8x16_t vbslq(int8x16_t a, int8x16_t b, int8x16_t c) { return vbslq_s8(a, b, c); }
static inline uint8x16_t vbslq(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vbslq_u8(a, b, c); }
static inline int16x8_t vbslq(int16x8_t a, int16x8_t b, int16x8_t c) { return vbslq_s16(a, b, c); }
static inline uint16x8_t vbslq(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vbslq_u16(a, b, c); }
static inline int32x4_t vbslq(int32x4_t a, int32x4_t b, int32x4_t c) { return vbslq_s32(a, b, c); }
static inline uint32x4_t vbslq(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vbslq_u32(a, b, c); }
static inline float32x4_t vbslq(uint32x4_t a, float32x4_t b, float32x4_t c) { return vbslq_f32(a, b, c); }

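// Illustrative sketch (not part of the library): vbslq takes bits from its
// second operand where the mask bit is 1 and from its third operand where it
// is 0, so a comparison plus vbslq forms a branch-free select:
//
//   // Keep lanes of `vec` that are >= `floor`, replace the rest with `floor`
//   // (equivalent to vmaxq here; shown only to demonstrate the select).
//   uint32x4_t clamp_low(uint32x4_t vec, uint32x4_t floor) {
//     return vbslq(vcgeq(vec, floor), vec, floor);
//   }
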
// -----------------------------------------------------------------------------
// vget_high*
// -----------------------------------------------------------------------------

static inline int8x8_t vget_high(int8x16_t vec) { return vget_high_s8(vec); }
static inline uint8x8_t vget_high(uint8x16_t vec) { return vget_high_u8(vec); }
static inline int16x4_t vget_high(int16x8_t vec) { return vget_high_s16(vec); }
static inline uint16x4_t vget_high(uint16x8_t vec) { return vget_high_u16(vec); }
static inline int32x2_t vget_high(int32x4_t vec) { return vget_high_s32(vec); }
static inline uint32x2_t vget_high(uint32x4_t vec) { return vget_high_u32(vec); }
static inline int64x1_t vget_high(int64x2_t vec) { return vget_high_s64(vec); }
static inline uint64x1_t vget_high(uint64x2_t vec) { return vget_high_u64(vec); }
static inline float16x4_t vget_high(float16x8_t vec) { return vget_high_f16(vec); }
static inline float32x2_t vget_high(float32x4_t vec) { return vget_high_f32(vec); }
static inline float64x1_t vget_high(float64x2_t vec) { return vget_high_f64(vec); }

// -----------------------------------------------------------------------------
// vcgeq*
// -----------------------------------------------------------------------------

static inline uint32x4_t vcgeq(float32x4_t lhs, float32x4_t rhs) { return vcgeq_f32(lhs, rhs); }

// -----------------------------------------------------------------------------
// vget_low*
// -----------------------------------------------------------------------------

static inline int8x8_t vget_low(int8x16_t vec) { return vget_low_s8(vec); }
static inline uint8x8_t vget_low(uint8x16_t vec) { return vget_low_u8(vec); }
static inline int16x4_t vget_low(int16x8_t vec) { return vget_low_s16(vec); }
static inline uint16x4_t vget_low(uint16x8_t vec) { return vget_low_u16(vec); }
static inline int32x2_t vget_low(int32x4_t vec) { return vget_low_s32(vec); }
static inline uint32x2_t vget_low(uint32x4_t vec) { return vget_low_u32(vec); }
static inline int64x1_t vget_low(int64x2_t vec) { return vget_low_s64(vec); }
static inline uint64x1_t vget_low(uint64x2_t vec) { return vget_low_u64(vec); }
static inline float16x4_t vget_low(float16x8_t vec) { return vget_low_f16(vec); }
static inline float32x2_t vget_low(float32x4_t vec) { return vget_low_f32(vec); }
static inline float64x1_t vget_low(float64x2_t vec) { return vget_low_f64(vec); }

// -----------------------------------------------------------------------------
// vminq*
// -----------------------------------------------------------------------------

static inline int8x16_t vminq(int8x16_t lhs, int8x16_t rhs) { return vminq_s8(lhs, rhs); }
static inline uint8x16_t vminq(uint8x16_t lhs, uint8x16_t rhs) { return vminq_u8(lhs, rhs); }
static inline int16x8_t vminq(int16x8_t lhs, int16x8_t rhs) { return vminq_s16(lhs, rhs); }
static inline uint16x8_t vminq(uint16x8_t lhs, uint16x8_t rhs) { return vminq_u16(lhs, rhs); }
static inline int32x4_t vminq(int32x4_t lhs, int32x4_t rhs) { return vminq_s32(lhs, rhs); }
static inline uint32x4_t vminq(uint32x4_t lhs, uint32x4_t rhs) { return vminq_u32(lhs, rhs); }
static inline float32x4_t vminq(float32x4_t lhs, float32x4_t rhs) { return vminq_f32(lhs, rhs); }

// -----------------------------------------------------------------------------
// vmaxq*
// -----------------------------------------------------------------------------

static inline int8x16_t vmaxq(int8x16_t lhs, int8x16_t rhs) { return vmaxq_s8(lhs, rhs); }
static inline uint8x16_t vmaxq(uint8x16_t lhs, uint8x16_t rhs) { return vmaxq_u8(lhs, rhs); }
static inline int16x8_t vmaxq(int16x8_t lhs, int16x8_t rhs) { return vmaxq_s16(lhs, rhs); }
static inline uint16x8_t vmaxq(uint16x8_t lhs, uint16x8_t rhs) { return vmaxq_u16(lhs, rhs); }
static inline int32x4_t vmaxq(int32x4_t lhs, int32x4_t rhs) { return vmaxq_s32(lhs, rhs); }
static inline uint32x4_t vmaxq(uint32x4_t lhs, uint32x4_t rhs) { return vmaxq_u32(lhs, rhs); }
static inline float32x4_t vmaxq(float32x4_t lhs, float32x4_t rhs) { return vmaxq_f32(lhs, rhs); }

// -----------------------------------------------------------------------------
// vminvq*
// -----------------------------------------------------------------------------

static inline int8_t vminvq(int8x16_t src) { return vminvq_s8(src); }
static inline uint8_t vminvq(uint8x16_t src) { return vminvq_u8(src); }
static inline int16_t vminvq(int16x8_t src) { return vminvq_s16(src); }
static inline uint16_t vminvq(uint16x8_t src) { return vminvq_u16(src); }
static inline int32_t vminvq(int32x4_t src) { return vminvq_s32(src); }
static inline uint32_t vminvq(uint32x4_t src) { return vminvq_u32(src); }
static inline float32_t vminvq(float32x4_t src) { return vminvq_f32(src); }

// -----------------------------------------------------------------------------
// vmaxvq*
// -----------------------------------------------------------------------------

static inline int8_t vmaxvq(int8x16_t src) { return vmaxvq_s8(src); }
static inline uint8_t vmaxvq(uint8x16_t src) { return vmaxvq_u8(src); }
static inline int16_t vmaxvq(int16x8_t src) { return vmaxvq_s16(src); }
static inline uint16_t vmaxvq(uint16x8_t src) { return vmaxvq_u16(src); }
static inline int32_t vmaxvq(int32x4_t src) { return vmaxvq_s32(src); }
static inline uint32_t vmaxvq(uint32x4_t src) { return vmaxvq_u32(src); }
static inline float32_t vmaxvq(float32x4_t src) { return vmaxvq_f32(src); }

// -----------------------------------------------------------------------------
// vcleq*
// -----------------------------------------------------------------------------

static inline uint32x4_t vcleq(float32x4_t lhs, float32x4_t rhs) { return vcleq_f32(lhs, rhs); }

// -----------------------------------------------------------------------------
// vrshrn_n*
// -----------------------------------------------------------------------------

template <int n> static inline int8x8_t vrshrn_n(int16x8_t vec) { return vrshrn_n_s16(vec, n); }
template <int n> static inline uint8x8_t vrshrn_n(uint16x8_t vec) { return vrshrn_n_u16(vec, n); }
template <int n> static inline int16x4_t vrshrn_n(int32x4_t vec) { return vrshrn_n_s32(vec, n); }
template <int n> static inline uint16x4_t vrshrn_n(uint32x4_t vec) { return vrshrn_n_u32(vec, n); }
template <int n> static inline int32x2_t vrshrn_n(int64x2_t vec) { return vrshrn_n_s64(vec, n); }
template <int n> static inline uint32x2_t vrshrn_n(uint64x2_t vec) { return vrshrn_n_u64(vec, n); }

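// Note on the templated shift wrappers above and in the next three sections:
// the ACLE shift-by-immediate intrinsics require the shift count to be a
// compile-time constant, so the wrappers take it as a template parameter
// instead of a runtime argument. Illustrative use (not part of the library;
// `pixels` is a placeholder uint16x8_t value):
//
//   // Per lane: (pixels + 128) >> 8, rounded, narrowed to 8 bits.
//   uint8x8_t narrowed = vrshrn_n<8>(pixels);
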
// -----------------------------------------------------------------------------
// vshrq_n*
// -----------------------------------------------------------------------------

template <int n> static inline int8x16_t vshrq_n(int8x16_t vec) { return vshrq_n_s8(vec, n); }
template <int n> static inline uint8x16_t vshrq_n(uint8x16_t vec) { return vshrq_n_u8(vec, n); }
template <int n> static inline int16x8_t vshrq_n(int16x8_t vec) { return vshrq_n_s16(vec, n); }
template <int n> static inline uint16x8_t vshrq_n(uint16x8_t vec) { return vshrq_n_u16(vec, n); }
template <int n> static inline int32x4_t vshrq_n(int32x4_t vec) { return vshrq_n_s32(vec, n); }
template <int n> static inline uint32x4_t vshrq_n(uint32x4_t vec) { return vshrq_n_u32(vec, n); }
template <int n> static inline int64x2_t vshrq_n(int64x2_t vec) { return vshrq_n_s64(vec, n); }
template <int n> static inline uint64x2_t vshrq_n(uint64x2_t vec) { return vshrq_n_u64(vec, n); }

// -----------------------------------------------------------------------------
// vshll_n*
// -----------------------------------------------------------------------------

template <int n> static inline int16x8_t vshll_n(int8x8_t vec) { return vshll_n_s8(vec, n); }
template <int n> static inline uint16x8_t vshll_n(uint8x8_t vec) { return vshll_n_u8(vec, n); }
template <int n> static inline int32x4_t vshll_n(int16x4_t vec) { return vshll_n_s16(vec, n); }
template <int n> static inline uint32x4_t vshll_n(uint16x4_t vec) { return vshll_n_u16(vec, n); }
template <int n> static inline int64x2_t vshll_n(int32x2_t vec) { return vshll_n_s32(vec, n); }
template <int n> static inline uint64x2_t vshll_n(uint32x2_t vec) { return vshll_n_u32(vec, n); }

// -----------------------------------------------------------------------------
// vshlq_n*
// -----------------------------------------------------------------------------

template <int n> static inline int8x16_t vshlq_n(int8x16_t vec) { return vshlq_n_s8(vec, n); }
template <int n> static inline uint8x16_t vshlq_n(uint8x16_t vec) { return vshlq_n_u8(vec, n); }
template <int n> static inline int16x8_t vshlq_n(int16x8_t vec) { return vshlq_n_s16(vec, n); }
template <int n> static inline uint16x8_t vshlq_n(uint16x8_t vec) { return vshlq_n_u16(vec, n); }
template <int n> static inline int32x4_t vshlq_n(int32x4_t vec) { return vshlq_n_s32(vec, n); }
template <int n> static inline uint32x4_t vshlq_n(uint32x4_t vec) { return vshlq_n_u32(vec, n); }
template <int n> static inline int64x2_t vshlq_n(int64x2_t vec) { return vshlq_n_s64(vec, n); }
template <int n> static inline uint64x2_t vshlq_n(uint64x2_t vec) { return vshlq_n_u64(vec, n); }

// -----------------------------------------------------------------------------
// vdupq*
// -----------------------------------------------------------------------------

static inline int8x16_t vdupq_n(int8_t src) { return vdupq_n_s8(src); }
static inline uint8x16_t vdupq_n(uint8_t src) { return vdupq_n_u8(src); }
static inline int16x8_t vdupq_n(int16_t src) { return vdupq_n_s16(src); }
static inline uint16x8_t vdupq_n(uint16_t src) { return vdupq_n_u16(src); }
static inline int32x4_t vdupq_n(int32_t src) { return vdupq_n_s32(src); }
static inline uint32x4_t vdupq_n(uint32_t src) { return vdupq_n_u32(src); }
static inline int64x2_t vdupq_n(int64_t src) { return vdupq_n_s64(src); }
static inline uint64x2_t vdupq_n(uint64_t src) { return vdupq_n_u64(src); }
static inline float32x4_t vdupq_n(float32_t src) { return vdupq_n_f32(src); }

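// Illustrative sketch (not part of the library): vdupq_n broadcasts a scalar
// to all lanes, and the overload is selected by the scalar's exact type, so
// integer literals usually need a cast to pick the intended vector type:
//
//   uint8x16_t ones    = vdupq_n(uint8_t{1});  // uint8x16_t, not int32x4_t
//   float32x4_t halves = vdupq_n(0.5F);
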
// -----------------------------------------------------------------------------
// vmull*
// -----------------------------------------------------------------------------

static inline int16x8_t vmull(int8x8_t lhs, int8x8_t rhs) { return vmull_s8(lhs, rhs); }
static inline uint16x8_t vmull(uint8x8_t lhs, uint8x8_t rhs) { return vmull_u8(lhs, rhs); }
static inline int32x4_t vmull(int16x4_t lhs, int16x4_t rhs) { return vmull_s16(lhs, rhs); }
static inline uint32x4_t vmull(uint16x4_t lhs, uint16x4_t rhs) { return vmull_u16(lhs, rhs); }
static inline int64x2_t vmull(int32x2_t lhs, int32x2_t rhs) { return vmull_s32(lhs, rhs); }

// -----------------------------------------------------------------------------
// vmull_high*
// -----------------------------------------------------------------------------

static inline int16x8_t vmull_high(int8x16_t lhs, int8x16_t rhs) { return vmull_high_s8(lhs, rhs); }
static inline uint16x8_t vmull_high(uint8x16_t lhs, uint8x16_t rhs) { return vmull_high_u8(lhs, rhs); }
static inline int32x4_t vmull_high(int16x8_t lhs, int16x8_t rhs) { return vmull_high_s16(lhs, rhs); }
static inline uint32x4_t vmull_high(uint16x8_t lhs, uint16x8_t rhs) { return vmull_high_u16(lhs, rhs); }
static inline int64x2_t vmull_high(int32x4_t lhs, int32x4_t rhs) { return vmull_high_s32(lhs, rhs); }

// -----------------------------------------------------------------------------
// vqmovn*
// -----------------------------------------------------------------------------

static inline int8x8_t vqmovn(int16x8_t src) { return vqmovn_s16(src); }
static inline uint8x8_t vqmovn(uint16x8_t src) { return vqmovn_u16(src); }
static inline int16x4_t vqmovn(int32x4_t src) { return vqmovn_s32(src); }
static inline uint16x4_t vqmovn(uint32x4_t src) { return vqmovn_u32(src); }
static inline int32x2_t vqmovn(int64x2_t src) { return vqmovn_s64(src); }

// -----------------------------------------------------------------------------
// vqmovn_high*
// -----------------------------------------------------------------------------

static inline int8x16_t vqmovn_high(int8x8_t low, int16x8_t src) { return vqmovn_high_s16(low, src); }
static inline uint8x16_t vqmovn_high(uint8x8_t low, uint16x8_t src) { return vqmovn_high_u16(low, src); }
static inline int16x8_t vqmovn_high(int16x4_t low, int32x4_t src) { return vqmovn_high_s32(low, src); }
static inline uint16x8_t vqmovn_high(uint16x4_t low, uint32x4_t src) { return vqmovn_high_u32(low, src); }
static inline int32x4_t vqmovn_high(int32x2_t low, int64x2_t src) { return vqmovn_high_s64(low, src); }

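// Illustrative sketch (not part of the library): vmull/vmull_high widen while
// multiplying, and vqmovn/vqmovn_high narrow back with saturation, so a
// full-register pipeline over 16 u8 lanes could be written as:
//
//   uint8x16_t saturating_product(uint8x16_t lhs, uint8x16_t rhs) {
//     uint16x8_t lo = vmull(vget_low(lhs), vget_low(rhs));
//     uint16x8_t hi = vmull_high(lhs, rhs);
//     // Products above 255 saturate to 255 when narrowed back to u8.
//     return vqmovn_high(vqmovn(lo), hi);
//   }
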
// -----------------------------------------------------------------------------
// NEON load operations
// -----------------------------------------------------------------------------

static inline int8x16_t vld1q(const int8_t *src) { return vld1q_s8(src); }
static inline uint8x16_t vld1q(const uint8_t *src) { return vld1q_u8(src); }
static inline int16x8_t vld1q(const int16_t *src) { return vld1q_s16(src); }
static inline uint16x8_t vld1q(const uint16_t *src) { return vld1q_u16(src); }
static inline int32x4_t vld1q(const int32_t *src) { return vld1q_s32(src); }
static inline uint32x4_t vld1q(const uint32_t *src) { return vld1q_u32(src); }
static inline int64x2_t vld1q(const int64_t *src) { return vld1q_s64(src); }
static inline uint64x2_t vld1q(const uint64_t *src) { return vld1q_u64(src); }
static inline float16x8_t vld1q(const float16_t *src) { return vld1q_f16(src); }
static inline float32x4_t vld1q(const float32_t *src) { return vld1q_f32(src); }
static inline float64x2_t vld1q(const float64_t *src) { return vld1q_f64(src); }

static inline int8x16x2_t vld2q(const int8_t *src) { return vld2q_s8(src); }
static inline uint8x16x2_t vld2q(const uint8_t *src) { return vld2q_u8(src); }
static inline int16x8x2_t vld2q(const int16_t *src) { return vld2q_s16(src); }
static inline uint16x8x2_t vld2q(const uint16_t *src) { return vld2q_u16(src); }
static inline int32x4x2_t vld2q(const int32_t *src) { return vld2q_s32(src); }
static inline uint32x4x2_t vld2q(const uint32_t *src) { return vld2q_u32(src); }
static inline int64x2x2_t vld2q(const int64_t *src) { return vld2q_s64(src); }
static inline uint64x2x2_t vld2q(const uint64_t *src) { return vld2q_u64(src); }
static inline float32x4x2_t vld2q(const float32_t *src) { return vld2q_f32(src); }

static inline int8x16x3_t vld3q(const int8_t *src) { return vld3q_s8(src); }
static inline uint8x16x3_t vld3q(const uint8_t *src) { return vld3q_u8(src); }
static inline int16x8x3_t vld3q(const int16_t *src) { return vld3q_s16(src); }
static inline uint16x8x3_t vld3q(const uint16_t *src) { return vld3q_u16(src); }
static inline int32x4x3_t vld3q(const int32_t *src) { return vld3q_s32(src); }
static inline uint32x4x3_t vld3q(const uint32_t *src) { return vld3q_u32(src); }
static inline int64x2x3_t vld3q(const int64_t *src) { return vld3q_s64(src); }
static inline uint64x2x3_t vld3q(const uint64_t *src) { return vld3q_u64(src); }
static inline float32x4x3_t vld3q(const float32_t *src) { return vld3q_f32(src); }

static inline int8x16x4_t vld4q(const int8_t *src) { return vld4q_s8(src); }
static inline uint8x16x4_t vld4q(const uint8_t *src) { return vld4q_u8(src); }
static inline int16x8x4_t vld4q(const int16_t *src) { return vld4q_s16(src); }
static inline uint16x8x4_t vld4q(const uint16_t *src) { return vld4q_u16(src); }
static inline int32x4x4_t vld4q(const int32_t *src) { return vld4q_s32(src); }
static inline uint32x4x4_t vld4q(const uint32_t *src) { return vld4q_u32(src); }
static inline int64x2x4_t vld4q(const int64_t *src) { return vld4q_s64(src); }
static inline uint64x2x4_t vld4q(const uint64_t *src) { return vld4q_u64(src); }
static inline float32x4x4_t vld4q(const float32_t *src) { return vld4q_f32(src); }

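// Illustrative sketch (not part of the library): the vld2q/vld3q/vld4q
// wrappers deinterleave as they load, e.g. splitting packed RGB pixels into
// per-channel planes:
//
//   void rgb_planes(const uint8_t *rgb,
//                   uint8x16_t &r, uint8x16_t &g, uint8x16_t &b) {
//     uint8x16x3_t pixels = vld3q(rgb);  // reads 48 bytes of R,G,B,R,G,B,...
//     r = pixels.val[0];
//     g = pixels.val[1];
//     b = pixels.val[2];
//   }
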
// -----------------------------------------------------------------------------
// NEON store operations
// -----------------------------------------------------------------------------

static inline void vst1(int8_t *dst, int8x8_t vec) { vst1_s8(dst, vec); }
static inline void vst1(uint8_t *dst, uint8x8_t vec) { vst1_u8(dst, vec); }
static inline void vst1(int16_t *dst, int16x4_t vec) { vst1_s16(dst, vec); }
static inline void vst1(uint16_t *dst, uint16x4_t vec) { vst1_u16(dst, vec); }
static inline void vst1(int32_t *dst, int32x2_t vec) { vst1_s32(dst, vec); }
static inline void vst1(uint32_t *dst, uint32x2_t vec) { vst1_u32(dst, vec); }
static inline void vst1(int64_t *dst, int64x1_t vec) { vst1_s64(dst, vec); }
static inline void vst1(uint64_t *dst, uint64x1_t vec) { vst1_u64(dst, vec); }

static inline void vst1q(int8_t *dst, int8x16_t vec) { vst1q_s8(dst, vec); }
static inline void vst1q(uint8_t *dst, uint8x16_t vec) { vst1q_u8(dst, vec); }
static inline void vst1q(int16_t *dst, int16x8_t vec) { vst1q_s16(dst, vec); }
static inline void vst1q(uint16_t *dst, uint16x8_t vec) { vst1q_u16(dst, vec); }
static inline void vst1q(int32_t *dst, int32x4_t vec) { vst1q_s32(dst, vec); }
static inline void vst1q(uint32_t *dst, uint32x4_t vec) { vst1q_u32(dst, vec); }
static inline void vst1q(int64_t *dst, int64x2_t vec) { vst1q_s64(dst, vec); }
static inline void vst1q(uint64_t *dst, uint64x2_t vec) { vst1q_u64(dst, vec); }
static inline void vst1q(float32_t *dst, float32x4_t vec) { vst1q_f32(dst, vec); }

static inline void vst2q(int8_t *dst, int8x16x2_t vec) { vst2q_s8(dst, vec); }
static inline void vst2q(uint8_t *dst, uint8x16x2_t vec) { vst2q_u8(dst, vec); }
static inline void vst2q(int16_t *dst, int16x8x2_t vec) { vst2q_s16(dst, vec); }
static inline void vst2q(uint16_t *dst, uint16x8x2_t vec) { vst2q_u16(dst, vec); }
static inline void vst2q(int32_t *dst, int32x4x2_t vec) { vst2q_s32(dst, vec); }
static inline void vst2q(uint32_t *dst, uint32x4x2_t vec) { vst2q_u32(dst, vec); }
static inline void vst2q(int64_t *dst, int64x2x2_t vec) { vst2q_s64(dst, vec); }
static inline void vst2q(uint64_t *dst, uint64x2x2_t vec) { vst2q_u64(dst, vec); }
static inline void vst2q(float32_t *dst, float32x4x2_t vec) { vst2q_f32(dst, vec); }

static inline void vst3q(int8_t *dst, int8x16x3_t vec) { vst3q_s8(dst, vec); }
static inline void vst3q(uint8_t *dst, uint8x16x3_t vec) { vst3q_u8(dst, vec); }
static inline void vst3q(int16_t *dst, int16x8x3_t vec) { vst3q_s16(dst, vec); }
static inline void vst3q(uint16_t *dst, uint16x8x3_t vec) { vst3q_u16(dst, vec); }
static inline void vst3q(int32_t *dst, int32x4x3_t vec) { vst3q_s32(dst, vec); }
static inline void vst3q(uint32_t *dst, uint32x4x3_t vec) { vst3q_u32(dst, vec); }
static inline void vst3q(int64_t *dst, int64x2x3_t vec) { vst3q_s64(dst, vec); }
static inline void vst3q(uint64_t *dst, uint64x2x3_t vec) { vst3q_u64(dst, vec); }
static inline void vst3q(float32_t *dst, float32x4x3_t vec) { vst3q_f32(dst, vec); }

static inline void vst4q(int8_t *dst, int8x16x4_t vec) { vst4q_s8(dst, vec); }
static inline void vst4q(uint8_t *dst, uint8x16x4_t vec) { vst4q_u8(dst, vec); }
static inline void vst4q(int16_t *dst, int16x8x4_t vec) { vst4q_s16(dst, vec); }
static inline void vst4q(uint16_t *dst, uint16x8x4_t vec) { vst4q_u16(dst, vec); }
static inline void vst4q(int32_t *dst, int32x4x4_t vec) { vst4q_s32(dst, vec); }
static inline void vst4q(uint32_t *dst, uint32x4x4_t vec) { vst4q_u32(dst, vec); }
static inline void vst4q(int64_t *dst, int64x2x4_t vec) { vst4q_s64(dst, vec); }
static inline void vst4q(uint64_t *dst, uint64x2x4_t vec) { vst4q_u64(dst, vec); }
static inline void vst4q(float32_t *dst, float32x4x4_t vec) { vst4q_f32(dst, vec); }

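// Illustrative note (not part of the library): vst2q/vst3q/vst4q mirror the
// multi-vector loads and re-interleave on the way out; for instance,
// vst3q(rgb, pixels) writes the `pixels` value from the load sketch above
// back as 48 bytes of packed R,G,B,R,G,B,... data.
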
// -----------------------------------------------------------------------------
// vreinterpret*
// -----------------------------------------------------------------------------

static inline uint8x16_t vreinterpretq_u8(int8x16_t vec) { return vreinterpretq_u8_s8(vec); }
static inline uint8x16_t vreinterpretq_u8(uint8x16_t vec) { return vec; }
static inline uint8x16_t vreinterpretq_u8(int16x8_t vec) { return vreinterpretq_u8_s16(vec); }
static inline uint8x16_t vreinterpretq_u8(uint16x8_t vec) { return vreinterpretq_u8_u16(vec); }
static inline uint8x16_t vreinterpretq_u8(int32x4_t vec) { return vreinterpretq_u8_s32(vec); }
static inline uint8x16_t vreinterpretq_u8(uint32x4_t vec) { return vreinterpretq_u8_u32(vec); }
static inline uint8x16_t vreinterpretq_u8(int64x2_t vec) { return vreinterpretq_u8_s64(vec); }
static inline uint8x16_t vreinterpretq_u8(uint64x2_t vec) { return vreinterpretq_u8_u64(vec); }

static inline uint64x2_t vreinterpretq_u64(int8x16_t vec) { return vreinterpretq_u64_s8(vec); }
static inline uint64x2_t vreinterpretq_u64(uint8x16_t vec) { return vreinterpretq_u64_u8(vec); }
static inline uint64x2_t vreinterpretq_u64(int16x8_t vec) { return vreinterpretq_u64_s16(vec); }
static inline uint64x2_t vreinterpretq_u64(uint16x8_t vec) { return vreinterpretq_u64_u16(vec); }
static inline uint64x2_t vreinterpretq_u64(int32x4_t vec) { return vreinterpretq_u64_s32(vec); }
static inline uint64x2_t vreinterpretq_u64(uint32x4_t vec) { return vreinterpretq_u64_u32(vec); }
static inline uint64x2_t vreinterpretq_u64(int64x2_t vec) { return vreinterpretq_u64_s64(vec); }
static inline uint64x2_t vreinterpretq_u64(uint64x2_t vec) { return vec; }

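// Illustrative sketch (not part of the library): vreinterpretq_* is a pure
// bit-cast between vector types and generates no instructions. The identity
// overloads above let generic code reinterpret unconditionally, even when the
// source already has the destination type:
//
//   uint8x16_t bytes = vreinterpretq_u8(vdupq_n(uint16_t{0x1234}));
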
// -----------------------------------------------------------------------------
// vcombine*
// -----------------------------------------------------------------------------

static inline int8x16_t vcombine(int8x8_t lhs, int8x8_t rhs) { return vcombine_s8(lhs, rhs); }
static inline uint8x16_t vcombine(uint8x8_t lhs, uint8x8_t rhs) { return vcombine_u8(lhs, rhs); }
static inline int16x8_t vcombine(int16x4_t lhs, int16x4_t rhs) { return vcombine_s16(lhs, rhs); }
static inline uint16x8_t vcombine(uint16x4_t lhs, uint16x4_t rhs) { return vcombine_u16(lhs, rhs); }
static inline int32x4_t vcombine(int32x2_t lhs, int32x2_t rhs) { return vcombine_s32(lhs, rhs); }
static inline uint32x4_t vcombine(uint32x2_t lhs, uint32x2_t rhs) { return vcombine_u32(lhs, rhs); }
static inline int64x2_t vcombine(int64x1_t lhs, int64x1_t rhs) { return vcombine_s64(lhs, rhs); }
static inline uint64x2_t vcombine(uint64x1_t lhs, uint64x1_t rhs) { return vcombine_u64(lhs, rhs); }

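// Illustrative note (not part of the library): vget_low/vget_high and
// vcombine are exact inverses, which is convenient around the halving and
// widening helpers; for any uint8x16_t v,
//
//   vcombine(vget_low(v), vget_high(v))
//
// rebuilds v unchanged.
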
// -----------------------------------------------------------------------------
// vrev*
// -----------------------------------------------------------------------------

static inline int8x16_t vrev64q(int8x16_t src) { return vrev64q_s8(src); }
static inline uint8x16_t vrev64q(uint8x16_t src) { return vrev64q_u8(src); }
static inline int16x8_t vrev64q(int16x8_t src) { return vrev64q_s16(src); }
static inline uint16x8_t vrev64q(uint16x8_t src) { return vrev64q_u16(src); }
static inline int32x4_t vrev64q(int32x4_t src) { return vrev64q_s32(src); }
static inline uint32x4_t vrev64q(uint32x4_t src) { return vrev64q_u32(src); }
static inline int64x2_t vrev64q(int64x2_t src) { return src; }
static inline uint64x2_t vrev64q(uint64x2_t src) { return src; }

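// Note: ACLE provides no vrev64q variant for 64-bit lanes; reversing 64-bit
// elements within each 64-bit doubleword is the identity, hence the two
// pass-through overloads above that simply return `src`.
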
// -----------------------------------------------------------------------------
// vcvt*
// -----------------------------------------------------------------------------

static inline float64x2_t vcvt_f64(float32x2_t vec) { return vcvt_f64_f32(vec); }

// clang-format on

}  // namespace kleidicv::neon

#endif  // KLEIDICV_NEON_INTRINSICS_H