| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #ifndef KLEIDICV_NEON_INTRINSICS_H | ||
| 6 | #define KLEIDICV_NEON_INTRINSICS_H | ||
| 7 | |||
| 8 | #ifndef KLEIDICV_NEON_H | ||
| 9 | #error "Please include neon.h instead." | ||
| 10 | #endif | ||
| 11 | |||
| 12 | #include <arm_neon.h> | ||
| 13 | |||
| 14 | #include <cinttypes> | ||
| 15 | |||
| 16 | namespace kleidicv::neon { | ||
| 17 | |||
| 18 | // ----------------------------------------------------------------------------- | ||
| 19 | // NEON binary operations | ||
| 20 | // ----------------------------------------------------------------------------- | ||
| 21 | |||
// Generates an overload set `name` covering every 128-bit integer vector
// type (8/16/32/64-bit lanes, signed and unsigned).  Each overload forwards
// to the matching type-suffixed NEON intrinsic (name##_s8, name##_u8, ...).
#define NEON_BINARY_OP_Q_B8_B16_B32_B64(name)                    \
  static inline int8x16_t name(int8x16_t lhs, int8x16_t rhs) {   \
    return name##_s8(lhs, rhs);                                  \
  }                                                              \
                                                                 \
  static inline uint8x16_t name(uint8x16_t lhs, uint8x16_t rhs) { \
    return name##_u8(lhs, rhs);                                  \
  }                                                              \
                                                                 \
  static inline int16x8_t name(int16x8_t lhs, int16x8_t rhs) {   \
    return name##_s16(lhs, rhs);                                 \
  }                                                              \
                                                                 \
  static inline uint16x8_t name(uint16x8_t lhs, uint16x8_t rhs) { \
    return name##_u16(lhs, rhs);                                 \
  }                                                              \
                                                                 \
  static inline int32x4_t name(int32x4_t lhs, int32x4_t rhs) {   \
    return name##_s32(lhs, rhs);                                 \
  }                                                              \
                                                                 \
  static inline uint32x4_t name(uint32x4_t lhs, uint32x4_t rhs) { \
    return name##_u32(lhs, rhs);                                 \
  }                                                              \
                                                                 \
  static inline int64x2_t name(int64x2_t lhs, int64x2_t rhs) {   \
    return name##_s64(lhs, rhs);                                 \
  }                                                              \
                                                                 \
  static inline uint64x2_t name(uint64x2_t lhs, uint64x2_t rhs) { \
    return name##_u64(lhs, rhs);                                 \
  }

// Alphabetical order
// NOTE(review): vcgeq/vcleq were previously listed in the opposite order,
// violating the ordering stated above; swapped here (no behavioral change).
NEON_BINARY_OP_Q_B8_B16_B32_B64(vaddq);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vceqq);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vcgeq);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vcleq);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vqaddq);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vqsubq);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vsubq);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vtrn1q);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vtrn2q);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vuzp1q);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vuzp2q);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vzip1q);
NEON_BINARY_OP_Q_B8_B16_B32_B64(vzip2q);

#undef NEON_BINARY_OP_Q_B8_B16_B32_B64
| 71 | |||
// Generates an overload set `name` for the 128-bit floating-point vector
// types (float32x4_t and float64x2_t), forwarding to the type-suffixed
// NEON intrinsic.
#define NEON_BINARY_OP_Q_F32_F64(name)                               \
  static inline float32x4_t name(float32x4_t lhs, float32x4_t rhs) { \
    return name##_f32(lhs, rhs);                                     \
  }                                                                  \
                                                                     \
  static inline float64x2_t name(float64x2_t lhs, float64x2_t rhs) { \
    return name##_f64(lhs, rhs);                                     \
  }

NEON_BINARY_OP_Q_F32_F64(vaddq);

#undef NEON_BINARY_OP_Q_F32_F64
| 84 | |||
| 85 | // clang-format off | ||
| 86 | |||
// -----------------------------------------------------------------------------
// vaddvq - across-vector addition: sums all lanes into a single scalar.
// -----------------------------------------------------------------------------

static inline int8_t vaddvq(int8x16_t v) { return vaddvq_s8(v); }
static inline uint8_t vaddvq(uint8x16_t v) { return vaddvq_u8(v); }
static inline int16_t vaddvq(int16x8_t v) { return vaddvq_s16(v); }
static inline uint16_t vaddvq(uint16x8_t v) { return vaddvq_u16(v); }
static inline int32_t vaddvq(int32x4_t v) { return vaddvq_s32(v); }
static inline uint32_t vaddvq(uint32x4_t v) { return vaddvq_u32(v); }
static inline int64_t vaddvq(int64x2_t v) { return vaddvq_s64(v); }
static inline uint64_t vaddvq(uint64x2_t v) { return vaddvq_u64(v); }
static inline float32_t vaddvq(float32x4_t v) { return vaddvq_f32(v); }
static inline float64_t vaddvq(float64x2_t v) { return vaddvq_f64(v); }
| 101 | |||
// -----------------------------------------------------------------------------
// vabdq - lane-wise absolute difference.
// (No 64-bit lane overloads: NEON provides no vabdq_*64 intrinsic.)
// -----------------------------------------------------------------------------

static inline int8x16_t vabdq(int8x16_t a, int8x16_t b) { return vabdq_s8(a, b); }
static inline uint8x16_t vabdq(uint8x16_t a, uint8x16_t b) { return vabdq_u8(a, b); }
static inline int16x8_t vabdq(int16x8_t a, int16x8_t b) { return vabdq_s16(a, b); }
static inline uint16x8_t vabdq(uint16x8_t a, uint16x8_t b) { return vabdq_u16(a, b); }
static inline int32x4_t vabdq(int32x4_t a, int32x4_t b) { return vabdq_s32(a, b); }
static inline uint32x4_t vabdq(uint32x4_t a, uint32x4_t b) { return vabdq_u32(a, b); }
| 112 | |||
// -----------------------------------------------------------------------------
// vandq - lane-wise bitwise AND.
// -----------------------------------------------------------------------------

static inline uint8x16_t vandq(uint8x16_t a, uint8x16_t b) { return vandq_u8(a, b); }
static inline uint16x8_t vandq(uint16x8_t a, uint16x8_t b) { return vandq_u16(a, b); }
static inline uint32x4_t vandq(uint32x4_t a, uint32x4_t b) { return vandq_u32(a, b); }
| 120 | |||
// -----------------------------------------------------------------------------
// vqabsq - saturating absolute value of each signed lane (INT_MIN saturates
// to INT_MAX rather than wrapping).
// -----------------------------------------------------------------------------

static inline int8x16_t vqabsq(int8x16_t v) { return vqabsq_s8(v); }
static inline int16x8_t vqabsq(int16x8_t v) { return vqabsq_s16(v); }
static inline int32x4_t vqabsq(int32x4_t v) { return vqabsq_s32(v); }
static inline int64x2_t vqabsq(int64x2_t v) { return vqabsq_s64(v); }
| 129 | |||
// -----------------------------------------------------------------------------
// vaddl - add two 64-bit vectors, widening each lane to twice its width.
// -----------------------------------------------------------------------------

static inline int16x8_t vaddl(int8x8_t a, int8x8_t b) { return vaddl_s8(a, b); }
static inline uint16x8_t vaddl(uint8x8_t a, uint8x8_t b) { return vaddl_u8(a, b); }
static inline int32x4_t vaddl(int16x4_t a, int16x4_t b) { return vaddl_s16(a, b); }
static inline uint32x4_t vaddl(uint16x4_t a, uint16x4_t b) { return vaddl_u16(a, b); }
static inline int64x2_t vaddl(int32x2_t a, int32x2_t b) { return vaddl_s32(a, b); }
static inline uint64x2_t vaddl(uint32x2_t a, uint32x2_t b) { return vaddl_u32(a, b); }
| 140 | |||
// -----------------------------------------------------------------------------
// vbslq - bitwise select: each result bit comes from `x` where the
// corresponding bit of `sel` is 1, and from `y` where it is 0.
// -----------------------------------------------------------------------------

static inline int8x16_t vbslq(int8x16_t sel, int8x16_t x, int8x16_t y) { return vbslq_s8(sel, x, y); }
static inline uint8x16_t vbslq(uint8x16_t sel, uint8x16_t x, uint8x16_t y) { return vbslq_u8(sel, x, y); }
static inline int16x8_t vbslq(int16x8_t sel, int16x8_t x, int16x8_t y) { return vbslq_s16(sel, x, y); }
static inline uint16x8_t vbslq(uint16x8_t sel, uint16x8_t x, uint16x8_t y) { return vbslq_u16(sel, x, y); }
static inline int32x4_t vbslq(int32x4_t sel, int32x4_t x, int32x4_t y) { return vbslq_s32(sel, x, y); }
static inline uint32x4_t vbslq(uint32x4_t sel, uint32x4_t x, uint32x4_t y) { return vbslq_u32(sel, x, y); }
static inline float32x4_t vbslq(uint32x4_t sel, float32x4_t x, float32x4_t y) { return vbslq_f32(sel, x, y); }
| 152 | |||
// -----------------------------------------------------------------------------
// vget_high - extract the upper 64-bit half of a 128-bit vector.
// -----------------------------------------------------------------------------

static inline int8x8_t vget_high(int8x16_t v) { return vget_high_s8(v); }
static inline uint8x8_t vget_high(uint8x16_t v) { return vget_high_u8(v); }
static inline int16x4_t vget_high(int16x8_t v) { return vget_high_s16(v); }
static inline uint16x4_t vget_high(uint16x8_t v) { return vget_high_u16(v); }
static inline int32x2_t vget_high(int32x4_t v) { return vget_high_s32(v); }
static inline uint32x2_t vget_high(uint32x4_t v) { return vget_high_u32(v); }
static inline int64x1_t vget_high(int64x2_t v) { return vget_high_s64(v); }
static inline uint64x1_t vget_high(uint64x2_t v) { return vget_high_u64(v); }
static inline float16x4_t vget_high(float16x8_t v) { return vget_high_f16(v); }
static inline float32x2_t vget_high(float32x4_t v) { return vget_high_f32(v); }
static inline float64x1_t vget_high(float64x2_t v) { return vget_high_f64(v); }
| 168 | |||
// -----------------------------------------------------------------------------
// vcgeq - lane-wise compare: result lane is all-ones where a >= b, else zero.
// -----------------------------------------------------------------------------

static inline uint32x4_t vcgeq(float32x4_t a, float32x4_t b) { return vcgeq_f32(a, b); }
| 174 | |||
// -----------------------------------------------------------------------------
// vget_low - extract the lower 64-bit half of a 128-bit vector.
// -----------------------------------------------------------------------------

static inline int8x8_t vget_low(int8x16_t v) { return vget_low_s8(v); }
static inline uint8x8_t vget_low(uint8x16_t v) { return vget_low_u8(v); }
static inline int16x4_t vget_low(int16x8_t v) { return vget_low_s16(v); }
static inline uint16x4_t vget_low(uint16x8_t v) { return vget_low_u16(v); }
static inline int32x2_t vget_low(int32x4_t v) { return vget_low_s32(v); }
static inline uint32x2_t vget_low(uint32x4_t v) { return vget_low_u32(v); }
static inline int64x1_t vget_low(int64x2_t v) { return vget_low_s64(v); }
static inline uint64x1_t vget_low(uint64x2_t v) { return vget_low_u64(v); }
static inline float16x4_t vget_low(float16x8_t v) { return vget_low_f16(v); }
static inline float32x2_t vget_low(float32x4_t v) { return vget_low_f32(v); }
static inline float64x1_t vget_low(float64x2_t v) { return vget_low_f64(v); }
| 190 | |||
// -----------------------------------------------------------------------------
// vminq - lane-wise minimum.
// -----------------------------------------------------------------------------

static inline int8x16_t vminq(int8x16_t a, int8x16_t b) { return vminq_s8(a, b); }
static inline uint8x16_t vminq(uint8x16_t a, uint8x16_t b) { return vminq_u8(a, b); }
static inline int16x8_t vminq(int16x8_t a, int16x8_t b) { return vminq_s16(a, b); }
static inline uint16x8_t vminq(uint16x8_t a, uint16x8_t b) { return vminq_u16(a, b); }
static inline int32x4_t vminq(int32x4_t a, int32x4_t b) { return vminq_s32(a, b); }
static inline uint32x4_t vminq(uint32x4_t a, uint32x4_t b) { return vminq_u32(a, b); }
static inline float32x4_t vminq(float32x4_t a, float32x4_t b) { return vminq_f32(a, b); }
| 202 | |||
// -----------------------------------------------------------------------------
// vmaxq - lane-wise maximum.
// -----------------------------------------------------------------------------

static inline int8x16_t vmaxq(int8x16_t a, int8x16_t b) { return vmaxq_s8(a, b); }
static inline uint8x16_t vmaxq(uint8x16_t a, uint8x16_t b) { return vmaxq_u8(a, b); }
static inline int16x8_t vmaxq(int16x8_t a, int16x8_t b) { return vmaxq_s16(a, b); }
static inline uint16x8_t vmaxq(uint16x8_t a, uint16x8_t b) { return vmaxq_u16(a, b); }
static inline int32x4_t vmaxq(int32x4_t a, int32x4_t b) { return vmaxq_s32(a, b); }
static inline uint32x4_t vmaxq(uint32x4_t a, uint32x4_t b) { return vmaxq_u32(a, b); }
static inline float32x4_t vmaxq(float32x4_t a, float32x4_t b) { return vmaxq_f32(a, b); }
| 214 | |||
// -----------------------------------------------------------------------------
// vminvq - across-vector minimum: reduces all lanes to a single scalar.
// (No 64-bit overloads: NEON has no horizontal min/max for 64-bit lanes.)
// -----------------------------------------------------------------------------

static inline int8_t vminvq(int8x16_t v) { return vminvq_s8(v); }
static inline uint8_t vminvq(uint8x16_t v) { return vminvq_u8(v); }
static inline int16_t vminvq(int16x8_t v) { return vminvq_s16(v); }
static inline uint16_t vminvq(uint16x8_t v) { return vminvq_u16(v); }
static inline int32_t vminvq(int32x4_t v) { return vminvq_s32(v); }
static inline uint32_t vminvq(uint32x4_t v) { return vminvq_u32(v); }
static inline float32_t vminvq(float32x4_t v) { return vminvq_f32(v); }
| 226 | |||
// -----------------------------------------------------------------------------
// vmaxvq - across-vector maximum: reduces all lanes to a single scalar.
// -----------------------------------------------------------------------------

static inline int8_t vmaxvq(int8x16_t v) { return vmaxvq_s8(v); }
static inline uint8_t vmaxvq(uint8x16_t v) { return vmaxvq_u8(v); }
static inline int16_t vmaxvq(int16x8_t v) { return vmaxvq_s16(v); }
static inline uint16_t vmaxvq(uint16x8_t v) { return vmaxvq_u16(v); }
static inline int32_t vmaxvq(int32x4_t v) { return vmaxvq_s32(v); }
static inline uint32_t vmaxvq(uint32x4_t v) { return vmaxvq_u32(v); }
static inline float32_t vmaxvq(float32x4_t v) { return vmaxvq_f32(v); }
| 238 | |||
// -----------------------------------------------------------------------------
// vcleq - lane-wise compare: result lane is all-ones where a <= b, else zero.
// -----------------------------------------------------------------------------

static inline uint32x4_t vcleq(float32x4_t a, float32x4_t b) { return vcleq_f32(a, b); }
| 244 | |||
// -----------------------------------------------------------------------------
// vrshrn_n - rounding shift right by `n`, narrowing each lane to half width.
// -----------------------------------------------------------------------------

template <int n> static inline int8x8_t vrshrn_n(int16x8_t v) { return vrshrn_n_s16(v, n); }
template <int n> static inline uint8x8_t vrshrn_n(uint16x8_t v) { return vrshrn_n_u16(v, n); }
template <int n> static inline int16x4_t vrshrn_n(int32x4_t v) { return vrshrn_n_s32(v, n); }
template <int n> static inline uint16x4_t vrshrn_n(uint32x4_t v) { return vrshrn_n_u32(v, n); }
template <int n> static inline int32x2_t vrshrn_n(int64x2_t v) { return vrshrn_n_s64(v, n); }
template <int n> static inline uint32x2_t vrshrn_n(uint64x2_t v) { return vrshrn_n_u64(v, n); }
| 255 | |||
// -----------------------------------------------------------------------------
// vshrq_n - shift every lane right by the compile-time constant `n`.
// -----------------------------------------------------------------------------

template <int n> static inline int8x16_t vshrq_n(int8x16_t v) { return vshrq_n_s8(v, n); }
template <int n> static inline uint8x16_t vshrq_n(uint8x16_t v) { return vshrq_n_u8(v, n); }
template <int n> static inline int16x8_t vshrq_n(int16x8_t v) { return vshrq_n_s16(v, n); }
template <int n> static inline uint16x8_t vshrq_n(uint16x8_t v) { return vshrq_n_u16(v, n); }
template <int n> static inline int32x4_t vshrq_n(int32x4_t v) { return vshrq_n_s32(v, n); }
template <int n> static inline uint32x4_t vshrq_n(uint32x4_t v) { return vshrq_n_u32(v, n); }
template <int n> static inline int64x2_t vshrq_n(int64x2_t v) { return vshrq_n_s64(v, n); }
template <int n> static inline uint64x2_t vshrq_n(uint64x2_t v) { return vshrq_n_u64(v, n); }
| 268 | |||
// -----------------------------------------------------------------------------
// vshll_n - shift left by `n`, widening each lane to twice its width.
// -----------------------------------------------------------------------------

template <int n> static inline int16x8_t vshll_n(int8x8_t v) { return vshll_n_s8(v, n); }
template <int n> static inline uint16x8_t vshll_n(uint8x8_t v) { return vshll_n_u8(v, n); }
template <int n> static inline int32x4_t vshll_n(int16x4_t v) { return vshll_n_s16(v, n); }
template <int n> static inline uint32x4_t vshll_n(uint16x4_t v) { return vshll_n_u16(v, n); }
template <int n> static inline int64x2_t vshll_n(int32x2_t v) { return vshll_n_s32(v, n); }
template <int n> static inline uint64x2_t vshll_n(uint32x2_t v) { return vshll_n_u32(v, n); }
| 281 | |||
// -----------------------------------------------------------------------------
// vshlq_n - shift every lane left by the compile-time constant `n`.
// -----------------------------------------------------------------------------

template <int n> static inline int8x16_t vshlq_n(int8x16_t v) { return vshlq_n_s8(v, n); }
template <int n> static inline uint8x16_t vshlq_n(uint8x16_t v) { return vshlq_n_u8(v, n); }
template <int n> static inline int16x8_t vshlq_n(int16x8_t v) { return vshlq_n_s16(v, n); }
template <int n> static inline uint16x8_t vshlq_n(uint16x8_t v) { return vshlq_n_u16(v, n); }
template <int n> static inline int32x4_t vshlq_n(int32x4_t v) { return vshlq_n_s32(v, n); }
template <int n> static inline uint32x4_t vshlq_n(uint32x4_t v) { return vshlq_n_u32(v, n); }
template <int n> static inline int64x2_t vshlq_n(int64x2_t v) { return vshlq_n_s64(v, n); }
template <int n> static inline uint64x2_t vshlq_n(uint64x2_t v) { return vshlq_n_u64(v, n); }
| 294 | |||
// -----------------------------------------------------------------------------
// vdupq_n - broadcast a scalar into every lane of a 128-bit vector.
// -----------------------------------------------------------------------------

static inline int8x16_t vdupq_n(int8_t src) { return vdupq_n_s8(src); }
static inline uint8x16_t vdupq_n(uint8_t src) { return vdupq_n_u8(src); }
static inline int16x8_t vdupq_n(int16_t src) { return vdupq_n_s16(src); }
static inline uint16x8_t vdupq_n(uint16_t src) { return vdupq_n_u16(src); }
static inline int32x4_t vdupq_n(int32_t src) { return vdupq_n_s32(src); }
static inline uint32x4_t vdupq_n(uint32_t src) { return vdupq_n_u32(src); }
static inline int64x2_t vdupq_n(int64_t src) { return vdupq_n_s64(src); }
static inline uint64x2_t vdupq_n(uint64_t src) { return vdupq_n_u64(src); }
static inline float32x4_t vdupq_n(float32_t src) { return vdupq_n_f32(src); }
// Added float64 overload for parity with the other float64x2_t wrappers in
// this header (vaddvq, vld1q, vst1q); backward compatible.
static inline float64x2_t vdupq_n(float64_t src) { return vdupq_n_f64(src); }
| 308 | |||
// -----------------------------------------------------------------------------
// vmull - multiply two 64-bit vectors, widening each product lane to twice
// the input lane width.
// -----------------------------------------------------------------------------

static inline int16x8_t vmull(int8x8_t lhs, int8x8_t rhs) { return vmull_s8(lhs, rhs); }
static inline uint16x8_t vmull(uint8x8_t lhs, uint8x8_t rhs) { return vmull_u8(lhs, rhs); }
static inline int32x4_t vmull(int16x4_t lhs, int16x4_t rhs) { return vmull_s16(lhs, rhs); }
static inline uint32x4_t vmull(uint16x4_t lhs, uint16x4_t rhs) { return vmull_u16(lhs, rhs); }
static inline int64x2_t vmull(int32x2_t lhs, int32x2_t rhs) { return vmull_s32(lhs, rhs); }
// Added unsigned 32-bit overload, completing the signed/unsigned pairing
// used by the other widening wrappers (e.g. vaddl); backward compatible.
static inline uint64x2_t vmull(uint32x2_t lhs, uint32x2_t rhs) { return vmull_u32(lhs, rhs); }
| 318 | |||
// -----------------------------------------------------------------------------
// vmull_high - widening multiply of the upper halves of two 128-bit vectors.
// -----------------------------------------------------------------------------

static inline int16x8_t vmull_high(int8x16_t lhs, int8x16_t rhs) { return vmull_high_s8(lhs, rhs); }
static inline uint16x8_t vmull_high(uint8x16_t lhs, uint8x16_t rhs) { return vmull_high_u8(lhs, rhs); }
static inline int32x4_t vmull_high(int16x8_t lhs, int16x8_t rhs) { return vmull_high_s16(lhs, rhs); }
static inline uint32x4_t vmull_high(uint16x8_t lhs, uint16x8_t rhs) { return vmull_high_u16(lhs, rhs); }
static inline int64x2_t vmull_high(int32x4_t lhs, int32x4_t rhs) { return vmull_high_s32(lhs, rhs); }
// Added unsigned 32-bit overload, mirroring the vmull overload set;
// backward compatible.
static inline uint64x2_t vmull_high(uint32x4_t lhs, uint32x4_t rhs) { return vmull_high_u32(lhs, rhs); }
| 328 | |||
// -----------------------------------------------------------------------------
// vqmovn - saturating narrow: halves each lane width, clamping values that
// do not fit the narrower type.
// -----------------------------------------------------------------------------

static inline int8x8_t vqmovn(int16x8_t src) { return vqmovn_s16(src); }
static inline uint8x8_t vqmovn(uint16x8_t src) { return vqmovn_u16(src); }
static inline int16x4_t vqmovn(int32x4_t src) { return vqmovn_s32(src); }
static inline uint16x4_t vqmovn(uint32x4_t src) { return vqmovn_u32(src); }
static inline int32x2_t vqmovn(int64x2_t src) { return vqmovn_s64(src); }
// Added unsigned 64-bit overload, completing the signed/unsigned pairing;
// backward compatible.
static inline uint32x2_t vqmovn(uint64x2_t src) { return vqmovn_u64(src); }
| 338 | |||
// -----------------------------------------------------------------------------
// vqmovn_high - saturating narrow of `src` written into the upper half of
// the result; `low` supplies the lower half.
// -----------------------------------------------------------------------------

static inline int8x16_t vqmovn_high(int8x8_t low, int16x8_t src) { return vqmovn_high_s16(low, src); }
static inline uint8x16_t vqmovn_high(uint8x8_t low, uint16x8_t src) { return vqmovn_high_u16(low, src); }
static inline int16x8_t vqmovn_high(int16x4_t low, int32x4_t src) { return vqmovn_high_s32(low, src); }
static inline uint16x8_t vqmovn_high(uint16x4_t low, uint32x4_t src) { return vqmovn_high_u32(low, src); }
static inline int32x4_t vqmovn_high(int32x2_t low, int64x2_t src) { return vqmovn_high_s64(low, src); }
// Added unsigned 64-bit overload, mirroring the vqmovn overload set;
// backward compatible.
static inline uint32x4_t vqmovn_high(uint32x2_t low, uint64x2_t src) { return vqmovn_high_u64(low, src); }
| 348 | |||
// -----------------------------------------------------------------------------
// NEON load operations
// -----------------------------------------------------------------------------

// vld1q: load one 128-bit vector from memory.
static inline int8x16_t vld1q(const int8_t *ptr) { return vld1q_s8(ptr); }
static inline uint8x16_t vld1q(const uint8_t *ptr) { return vld1q_u8(ptr); }
static inline int16x8_t vld1q(const int16_t *ptr) { return vld1q_s16(ptr); }
static inline uint16x8_t vld1q(const uint16_t *ptr) { return vld1q_u16(ptr); }
static inline int32x4_t vld1q(const int32_t *ptr) { return vld1q_s32(ptr); }
static inline uint32x4_t vld1q(const uint32_t *ptr) { return vld1q_u32(ptr); }
static inline int64x2_t vld1q(const int64_t *ptr) { return vld1q_s64(ptr); }
static inline uint64x2_t vld1q(const uint64_t *ptr) { return vld1q_u64(ptr); }
static inline float16x8_t vld1q(const float16_t *ptr) { return vld1q_f16(ptr); }
static inline float32x4_t vld1q(const float32_t *ptr) { return vld1q_f32(ptr); }
static inline float64x2_t vld1q(const float64_t *ptr) { return vld1q_f64(ptr); }
| 364 | |||
// vld2q: load two 128-bit vectors, de-interleaving 2-element structures.
static inline int8x16x2_t vld2q(const int8_t *ptr) { return vld2q_s8(ptr); }
static inline uint8x16x2_t vld2q(const uint8_t *ptr) { return vld2q_u8(ptr); }
static inline int16x8x2_t vld2q(const int16_t *ptr) { return vld2q_s16(ptr); }
static inline uint16x8x2_t vld2q(const uint16_t *ptr) { return vld2q_u16(ptr); }
static inline int32x4x2_t vld2q(const int32_t *ptr) { return vld2q_s32(ptr); }
static inline uint32x4x2_t vld2q(const uint32_t *ptr) { return vld2q_u32(ptr); }
static inline int64x2x2_t vld2q(const int64_t *ptr) { return vld2q_s64(ptr); }
static inline uint64x2x2_t vld2q(const uint64_t *ptr) { return vld2q_u64(ptr); }
static inline float16x8x2_t vld2q(const float16_t *ptr) { return vld2q_f16(ptr); }
static inline float32x4x2_t vld2q(const float32_t *ptr) { return vld2q_f32(ptr); }
static inline float64x2x2_t vld2q(const float64_t *ptr) { return vld2q_f64(ptr); }
| 376 | |||
// vld3q: load three 128-bit vectors, de-interleaving 3-element structures.
static inline int8x16x3_t vld3q(const int8_t *ptr) { return vld3q_s8(ptr); }
static inline uint8x16x3_t vld3q(const uint8_t *ptr) { return vld3q_u8(ptr); }
static inline int16x8x3_t vld3q(const int16_t *ptr) { return vld3q_s16(ptr); }
static inline uint16x8x3_t vld3q(const uint16_t *ptr) { return vld3q_u16(ptr); }
static inline int32x4x3_t vld3q(const int32_t *ptr) { return vld3q_s32(ptr); }
static inline uint32x4x3_t vld3q(const uint32_t *ptr) { return vld3q_u32(ptr); }
static inline int64x2x3_t vld3q(const int64_t *ptr) { return vld3q_s64(ptr); }
static inline uint64x2x3_t vld3q(const uint64_t *ptr) { return vld3q_u64(ptr); }
static inline float16x8x3_t vld3q(const float16_t *ptr) { return vld3q_f16(ptr); }
static inline float32x4x3_t vld3q(const float32_t *ptr) { return vld3q_f32(ptr); }
static inline float64x2x3_t vld3q(const float64_t *ptr) { return vld3q_f64(ptr); }
| 388 | |||
// vld4q: load four 128-bit vectors, de-interleaving 4-element structures.
static inline int8x16x4_t vld4q(const int8_t *ptr) { return vld4q_s8(ptr); }
static inline uint8x16x4_t vld4q(const uint8_t *ptr) { return vld4q_u8(ptr); }
static inline int16x8x4_t vld4q(const int16_t *ptr) { return vld4q_s16(ptr); }
static inline uint16x8x4_t vld4q(const uint16_t *ptr) { return vld4q_u16(ptr); }
static inline int32x4x4_t vld4q(const int32_t *ptr) { return vld4q_s32(ptr); }
static inline uint32x4x4_t vld4q(const uint32_t *ptr) { return vld4q_u32(ptr); }
static inline int64x2x4_t vld4q(const int64_t *ptr) { return vld4q_s64(ptr); }
static inline uint64x2x4_t vld4q(const uint64_t *ptr) { return vld4q_u64(ptr); }
static inline float16x8x4_t vld4q(const float16_t *ptr) { return vld4q_f16(ptr); }
static inline float32x4x4_t vld4q(const float32_t *ptr) { return vld4q_f32(ptr); }
static inline float64x2x4_t vld4q(const float64_t *ptr) { return vld4q_f64(ptr); }
| 400 | |||
// -----------------------------------------------------------------------------
// NEON store operations
// -----------------------------------------------------------------------------

// vst1: store one 64-bit vector to memory.
static inline void vst1(int8_t *ptr, int8x8_t v) { vst1_s8(ptr, v); }
static inline void vst1(uint8_t *ptr, uint8x8_t v) { vst1_u8(ptr, v); }
static inline void vst1(int16_t *ptr, int16x4_t v) { vst1_s16(ptr, v); }
static inline void vst1(uint16_t *ptr, uint16x4_t v) { vst1_u16(ptr, v); }
static inline void vst1(int32_t *ptr, int32x2_t v) { vst1_s32(ptr, v); }
static inline void vst1(uint32_t *ptr, uint32x2_t v) { vst1_u32(ptr, v); }
static inline void vst1(int64_t *ptr, int64x1_t v) { vst1_s64(ptr, v); }
static inline void vst1(uint64_t *ptr, uint64x1_t v) { vst1_u64(ptr, v); }
static inline void vst1(float16_t *ptr, float16x4_t v) { vst1_f16(ptr, v); }
static inline void vst1(float32_t *ptr, float32x2_t v) { vst1_f32(ptr, v); }
static inline void vst1(float64_t *ptr, float64x1_t v) { vst1_f64(ptr, v); }
| 416 | |||
// vst1q: store one 128-bit vector to memory.
static inline void vst1q(int8_t *ptr, int8x16_t v) { vst1q_s8(ptr, v); }
static inline void vst1q(uint8_t *ptr, uint8x16_t v) { vst1q_u8(ptr, v); }
static inline void vst1q(int16_t *ptr, int16x8_t v) { vst1q_s16(ptr, v); }
static inline void vst1q(uint16_t *ptr, uint16x8_t v) { vst1q_u16(ptr, v); }
static inline void vst1q(int32_t *ptr, int32x4_t v) { vst1q_s32(ptr, v); }
static inline void vst1q(uint32_t *ptr, uint32x4_t v) { vst1q_u32(ptr, v); }
static inline void vst1q(int64_t *ptr, int64x2_t v) { vst1q_s64(ptr, v); }
static inline void vst1q(uint64_t *ptr, uint64x2_t v) { vst1q_u64(ptr, v); }
static inline void vst1q(float16_t *ptr, float16x8_t v) { vst1q_f16(ptr, v); }
static inline void vst1q(float32_t *ptr, float32x4_t v) { vst1q_f32(ptr, v); }
static inline void vst1q(float64_t *ptr, float64x2_t v) { vst1q_f64(ptr, v); }
| 428 | |||
// vst2q: store two 128-bit vectors, interleaving into 2-element structures.
static inline void vst2q(int8_t *ptr, int8x16x2_t v) { vst2q_s8(ptr, v); }
static inline void vst2q(uint8_t *ptr, uint8x16x2_t v) { vst2q_u8(ptr, v); }
static inline void vst2q(int16_t *ptr, int16x8x2_t v) { vst2q_s16(ptr, v); }
static inline void vst2q(uint16_t *ptr, uint16x8x2_t v) { vst2q_u16(ptr, v); }
static inline void vst2q(int32_t *ptr, int32x4x2_t v) { vst2q_s32(ptr, v); }
static inline void vst2q(uint32_t *ptr, uint32x4x2_t v) { vst2q_u32(ptr, v); }
static inline void vst2q(int64_t *ptr, int64x2x2_t v) { vst2q_s64(ptr, v); }
static inline void vst2q(uint64_t *ptr, uint64x2x2_t v) { vst2q_u64(ptr, v); }
static inline void vst2q(float16_t *ptr, float16x8x2_t v) { vst2q_f16(ptr, v); }
static inline void vst2q(float32_t *ptr, float32x4x2_t v) { vst2q_f32(ptr, v); }
static inline void vst2q(float64_t *ptr, float64x2x2_t v) { vst2q_f64(ptr, v); }
| 440 | |||
// vst3q: store three 128-bit vectors, interleaving into 3-element structures.
static inline void vst3q(int8_t *ptr, int8x16x3_t v) { vst3q_s8(ptr, v); }
static inline void vst3q(uint8_t *ptr, uint8x16x3_t v) { vst3q_u8(ptr, v); }
static inline void vst3q(int16_t *ptr, int16x8x3_t v) { vst3q_s16(ptr, v); }
static inline void vst3q(uint16_t *ptr, uint16x8x3_t v) { vst3q_u16(ptr, v); }
static inline void vst3q(int32_t *ptr, int32x4x3_t v) { vst3q_s32(ptr, v); }
static inline void vst3q(uint32_t *ptr, uint32x4x3_t v) { vst3q_u32(ptr, v); }
static inline void vst3q(int64_t *ptr, int64x2x3_t v) { vst3q_s64(ptr, v); }
static inline void vst3q(uint64_t *ptr, uint64x2x3_t v) { vst3q_u64(ptr, v); }
static inline void vst3q(float16_t *ptr, float16x8x3_t v) { vst3q_f16(ptr, v); }
static inline void vst3q(float32_t *ptr, float32x4x3_t v) { vst3q_f32(ptr, v); }
static inline void vst3q(float64_t *ptr, float64x2x3_t v) { vst3q_f64(ptr, v); }
| 452 | |||
// vst4q: four-register interleaved store, overloaded on element type.
static inline void vst4q(int8_t *dest, int8x16x4_t val) { vst4q_s8(dest, val); }
static inline void vst4q(uint8_t *dest, uint8x16x4_t val) { vst4q_u8(dest, val); }
static inline void vst4q(int16_t *dest, int16x8x4_t val) { vst4q_s16(dest, val); }
static inline void vst4q(uint16_t *dest, uint16x8x4_t val) { vst4q_u16(dest, val); }
static inline void vst4q(int32_t *dest, int32x4x4_t val) { vst4q_s32(dest, val); }
static inline void vst4q(uint32_t *dest, uint32x4x4_t val) { vst4q_u32(dest, val); }
static inline void vst4q(int64_t *dest, int64x2x4_t val) { vst4q_s64(dest, val); }
static inline void vst4q(uint64_t *dest, uint64x2x4_t val) { vst4q_u64(dest, val); }
static inline void vst4q(float16_t *dest, float16x8x4_t val) { vst4q_f16(dest, val); }
static inline void vst4q(float32_t *dest, float32x4x4_t val) { vst4q_f32(dest, val); }
static inline void vst4q(float64_t *dest, float64x2x4_t val) { vst4q_f64(dest, val); }
| 464 | |||
| 465 | // ----------------------------------------------------------------------------- | ||
| 466 | // vreinterpret* | ||
| 467 | // ----------------------------------------------------------------------------- | ||
| 468 | |||
// vreinterpretq_u8: reinterpret the bits of any 128-bit integer vector as
// sixteen unsigned bytes. The u8 -> u8 case is a plain pass-through.
static inline uint8x16_t vreinterpretq_u8(int8x16_t v) { return vreinterpretq_u8_s8(v); }
static inline uint8x16_t vreinterpretq_u8(uint8x16_t v) { return v; }
static inline uint8x16_t vreinterpretq_u8(int16x8_t v) { return vreinterpretq_u8_s16(v); }
static inline uint8x16_t vreinterpretq_u8(uint16x8_t v) { return vreinterpretq_u8_u16(v); }
static inline uint8x16_t vreinterpretq_u8(int32x4_t v) { return vreinterpretq_u8_s32(v); }
static inline uint8x16_t vreinterpretq_u8(uint32x4_t v) { return vreinterpretq_u8_u32(v); }
static inline uint8x16_t vreinterpretq_u8(int64x2_t v) { return vreinterpretq_u8_s64(v); }
static inline uint8x16_t vreinterpretq_u8(uint64x2_t v) { return vreinterpretq_u8_u64(v); }
| 477 | |||
// vreinterpretq_u64: reinterpret the bits of any 128-bit integer vector as
// two unsigned 64-bit lanes. The u64 -> u64 case is a plain pass-through.
static inline uint64x2_t vreinterpretq_u64(int8x16_t v) { return vreinterpretq_u64_s8(v); }
static inline uint64x2_t vreinterpretq_u64(uint8x16_t v) { return vreinterpretq_u64_u8(v); }
static inline uint64x2_t vreinterpretq_u64(int16x8_t v) { return vreinterpretq_u64_s16(v); }
static inline uint64x2_t vreinterpretq_u64(uint16x8_t v) { return vreinterpretq_u64_u16(v); }
static inline uint64x2_t vreinterpretq_u64(int32x4_t v) { return vreinterpretq_u64_s32(v); }
static inline uint64x2_t vreinterpretq_u64(uint32x4_t v) { return vreinterpretq_u64_u32(v); }
static inline uint64x2_t vreinterpretq_u64(int64x2_t v) { return vreinterpretq_u64_s64(v); }
static inline uint64x2_t vreinterpretq_u64(uint64x2_t v) { return v; }
| 486 | |||
| 487 | // ----------------------------------------------------------------------------- | ||
| 488 | // vcombine* | ||
| 489 | // ----------------------------------------------------------------------------- | ||
| 490 | |||
// vcombine: join two 64-bit half vectors (low, high) into one 128-bit
// vector, overloaded on element type.
// Floating-point overloads are provided so this family covers the same set
// of element types as the vst1q..vst4q store wrappers in this header.
static inline int8x16_t vcombine(int8x8_t lhs, int8x8_t rhs) { return vcombine_s8(lhs, rhs); }
static inline uint8x16_t vcombine(uint8x8_t lhs, uint8x8_t rhs) { return vcombine_u8(lhs, rhs); }
static inline int16x8_t vcombine(int16x4_t lhs, int16x4_t rhs) { return vcombine_s16(lhs, rhs); }
static inline uint16x8_t vcombine(uint16x4_t lhs, uint16x4_t rhs) { return vcombine_u16(lhs, rhs); }
static inline int32x4_t vcombine(int32x2_t lhs, int32x2_t rhs) { return vcombine_s32(lhs, rhs); }
static inline uint32x4_t vcombine(uint32x2_t lhs, uint32x2_t rhs) { return vcombine_u32(lhs, rhs); }
static inline int64x2_t vcombine(int64x1_t lhs, int64x1_t rhs) { return vcombine_s64(lhs, rhs); }
static inline uint64x2_t vcombine(uint64x1_t lhs, uint64x1_t rhs) { return vcombine_u64(lhs, rhs); }
static inline float16x8_t vcombine(float16x4_t lhs, float16x4_t rhs) { return vcombine_f16(lhs, rhs); }
static inline float32x4_t vcombine(float32x2_t lhs, float32x2_t rhs) { return vcombine_f32(lhs, rhs); }
static inline float64x2_t vcombine(float64x1_t lhs, float64x1_t rhs) { return vcombine_f64(lhs, rhs); }
| 499 | |||
| 500 | // ----------------------------------------------------------------------------- | ||
| 501 | // vrev* | ||
| 502 | // ----------------------------------------------------------------------------- | ||
| 503 | |||
// vrev64q: reverse the element order within each 64-bit doubleword of the
// vector. For 64-bit elements a doubleword holds exactly one lane, so the
// operation is the identity and no intrinsic call is required.
static inline int8x16_t vrev64q(int8x16_t v) { return vrev64q_s8(v); }
static inline uint8x16_t vrev64q(uint8x16_t v) { return vrev64q_u8(v); }
static inline int16x8_t vrev64q(int16x8_t v) { return vrev64q_s16(v); }
static inline uint16x8_t vrev64q(uint16x8_t v) { return vrev64q_u16(v); }
static inline int32x4_t vrev64q(int32x4_t v) { return vrev64q_s32(v); }
static inline uint32x4_t vrev64q(uint32x4_t v) { return vrev64q_u32(v); }
static inline int64x2_t vrev64q(int64x2_t v) { return v; }
static inline uint64x2_t vrev64q(uint64x2_t v) { return v; }
| 512 | |||
| 513 | // ----------------------------------------------------------------------------- | ||
| 514 | // vcvt* | ||
| 515 | // ----------------------------------------------------------------------------- | ||
| 516 | |||
| 517 | 552 | static inline float64x2_t vcvt_f64(float32x2_t vec) { return vcvt_f64_f32(vec); } | |
| 518 | |||
| 519 | // clang-format on | ||
| 520 | |||
| 521 | } // namespace kleidicv::neon | ||
| 522 | |||
| 523 | #endif // KLEIDICV_NEON_INTRINSICS_H | ||
| 524 |