Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #ifndef KLEIDICV_NEON_H | ||
6 | #define KLEIDICV_NEON_H | ||
7 | |||
8 | #include <utility> | ||
9 | |||
10 | #include "kleidicv/neon_intrinsics.h" | ||
11 | #include "kleidicv/operations.h" | ||
12 | #include "kleidicv/utils.h" | ||
13 | |||
14 | namespace kleidicv::neon { | ||
15 | |||
16 | template <> | ||
17 | class half_element_width<uint16x8_t> { | ||
18 | public: | ||
19 | using type = uint8x16_t; | ||
20 | }; | ||
21 | |||
22 | template <> | ||
23 | class half_element_width<uint32x4_t> { | ||
24 | public: | ||
25 | using type = uint16x8_t; | ||
26 | }; | ||
27 | |||
28 | template <> | ||
29 | class half_element_width<uint64x2_t> { | ||
30 | public: | ||
31 | using type = uint32x4_t; | ||
32 | }; | ||
33 | |||
34 | template <> | ||
35 | class double_element_width<uint8x16_t> { | ||
36 | public: | ||
37 | using type = uint16x8_t; | ||
38 | }; | ||
39 | |||
40 | template <> | ||
41 | class double_element_width<uint16x8_t> { | ||
42 | public: | ||
43 | using type = uint32x4_t; | ||
44 | }; | ||
45 | |||
46 | template <> | ||
47 | class double_element_width<uint32x4_t> { | ||
48 | public: | ||
49 | using type = uint64x2_t; | ||
50 | }; | ||
51 | |||
52 | // Primary template to describe logically grouped peroperties of vectors. | ||
53 | template <typename ScalarType> | ||
54 | class VectorTypes; | ||
55 | |||
56 | template <> | ||
57 | class VectorTypes<int8_t> { | ||
58 | public: | ||
59 | using ScalarType = int8_t; | ||
60 | using VectorType = int8x16_t; | ||
61 | using Vector2Type = int8x16x2_t; | ||
62 | using Vector3Type = int8x16x3_t; | ||
63 | using Vector4Type = int8x16x4_t; | ||
64 | }; // end of class VectorTypes<int8_t> | ||
65 | |||
66 | template <> | ||
67 | class VectorTypes<uint8_t> { | ||
68 | public: | ||
69 | using ScalarType = uint8_t; | ||
70 | using VectorType = uint8x16_t; | ||
71 | using Vector2Type = uint8x16x2_t; | ||
72 | using Vector3Type = uint8x16x3_t; | ||
73 | using Vector4Type = uint8x16x4_t; | ||
74 | }; // end of class VectorTypes<uint8_t> | ||
75 | |||
76 | template <> | ||
77 | class VectorTypes<int16_t> { | ||
78 | public: | ||
79 | using ScalarType = int16_t; | ||
80 | using VectorType = int16x8_t; | ||
81 | using Vector2Type = int16x8x2_t; | ||
82 | using Vector3Type = int16x8x3_t; | ||
83 | using Vector4Type = int16x8x4_t; | ||
84 | }; // end of class VectorTypes<int16_t> | ||
85 | |||
86 | template <> | ||
87 | class VectorTypes<uint16_t> { | ||
88 | public: | ||
89 | using ScalarType = uint16_t; | ||
90 | using VectorType = uint16x8_t; | ||
91 | using Vector2Type = uint16x8x2_t; | ||
92 | using Vector3Type = uint16x8x3_t; | ||
93 | using Vector4Type = uint16x8x4_t; | ||
94 | }; // end of class VectorTypes<uint16_t> | ||
95 | |||
96 | template <> | ||
97 | class VectorTypes<int32_t> { | ||
98 | public: | ||
99 | using ScalarType = int32_t; | ||
100 | using VectorType = int32x4_t; | ||
101 | using Vector2Type = int32x4x2_t; | ||
102 | using Vector3Type = int32x4x3_t; | ||
103 | using Vector4Type = int32x4x4_t; | ||
104 | }; // end of class VectorTypes<int32_t> | ||
105 | |||
106 | template <> | ||
107 | class VectorTypes<uint32_t> { | ||
108 | public: | ||
109 | using ScalarType = uint32_t; | ||
110 | using VectorType = uint32x4_t; | ||
111 | using Vector2Type = uint32x4x2_t; | ||
112 | using Vector3Type = uint32x4x3_t; | ||
113 | using Vector4Type = uint32x4x4_t; | ||
114 | }; // end of class VectorTypes<uint32_t> | ||
115 | |||
116 | template <> | ||
117 | class VectorTypes<int64_t> { | ||
118 | public: | ||
119 | using ScalarType = int64_t; | ||
120 | using VectorType = int64x2_t; | ||
121 | using Vector2Type = int64x2x2_t; | ||
122 | using Vector3Type = int64x2x3_t; | ||
123 | using Vector4Type = int64x2x4_t; | ||
124 | }; // end of class VectorTypes<int64_t> | ||
125 | |||
126 | template <> | ||
127 | class VectorTypes<uint64_t> { | ||
128 | public: | ||
129 | using ScalarType = uint64_t; | ||
130 | using VectorType = uint64x2_t; | ||
131 | using Vector2Type = uint64x2x2_t; | ||
132 | using Vector3Type = uint64x2x3_t; | ||
133 | using Vector4Type = uint64x2x4_t; | ||
134 | }; // end of class VectorTypes<uint64_t> | ||
135 | |||
136 | template <> | ||
137 | class VectorTypes<float> { | ||
138 | public: | ||
139 | using ScalarType = float; | ||
140 | using VectorType = float32x4_t; | ||
141 | using Vector2Type = float32x4x2_t; | ||
142 | using Vector3Type = float32x4x3_t; | ||
143 | using Vector4Type = float32x4x4_t; | ||
144 | }; // end of class VectorTypes<float> | ||
145 | |||
146 | template <> | ||
147 | class VectorTypes<double> { | ||
148 | public: | ||
149 | using ScalarType = double; | ||
150 | using VectorType = float64x2_t; | ||
151 | using Vector2Type = float64x2x2_t; | ||
152 | using Vector3Type = float64x2x3_t; | ||
153 | using Vector4Type = float64x2x4_t; | ||
154 | }; // end of class VectorTypes<double> | ||
155 | |||
156 | // NEON vector length in bytes. | ||
157 | static constexpr size_t kVectorLength = 16; | ||
158 | |||
159 | // Base class for all NEON vector traits. | ||
160 | template <typename ScalarType> | ||
161 | class VecTraitsBase : public VectorTypes<ScalarType> { | ||
162 | public: | ||
163 | using typename VectorTypes<ScalarType>::VectorType; | ||
164 | using typename VectorTypes<ScalarType>::Vector2Type; | ||
165 | using typename VectorTypes<ScalarType>::Vector3Type; | ||
166 | using typename VectorTypes<ScalarType>::Vector4Type; | ||
167 | |||
168 | // Number of lanes in a vector. | ||
169 | 444265 | static constexpr size_t num_lanes() { | |
170 | 444265 | return kVectorLength / sizeof(ScalarType); | |
171 | } | ||
172 | |||
173 | // Maximum number of lanes in a vector. | ||
174 | static constexpr size_t max_num_lanes() { return num_lanes(); } | ||
175 | |||
176 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
177 | |||
178 | private: | ||
179 | static inline int8x16x2_t vld1q_x2(const int8_t *src) { | ||
180 | return vld1q_s8_x2(src); | ||
181 | } | ||
182 | |||
183 | 3214 | static inline uint8x16x2_t vld1q_x2(const uint8_t *src) { | |
184 | 3214 | return vld1q_u8_x2(src); | |
185 | } | ||
186 | |||
187 | 4728 | static inline int16x8x2_t vld1q_x2(const int16_t *src) { | |
188 | 4728 | return vld1q_s16_x2(src); | |
189 | } | ||
190 | |||
191 | 408 | static inline uint16x8x2_t vld1q_x2(const uint16_t *src) { | |
192 | 408 | return vld1q_u16_x2(src); | |
193 | } | ||
194 | |||
195 | static inline int32x4x2_t vld1q_x2(const int32_t *src) { | ||
196 | return vld1q_s32_x2(src); | ||
197 | } | ||
198 | |||
199 | 240 | static inline uint32x4x2_t vld1q_x2(const uint32_t *src) { | |
200 | 240 | return vld1q_u32_x2(src); | |
201 | } | ||
202 | |||
203 | static inline int64x2x2_t vld1q_x2(const int64_t *src) { | ||
204 | return vld1q_s64_x2(src); | ||
205 | } | ||
206 | |||
207 | 240 | static inline uint64x2x2_t vld1q_x2(const uint64_t *src) { | |
208 | 240 | return vld1q_u64_x2(src); | |
209 | } | ||
210 | |||
211 | static inline float32x4x2_t vld1q_x2(const float32_t *src) { | ||
212 | return vld1q_f32_x2(src); | ||
213 | } | ||
214 | |||
215 | static inline int8x16x3_t vld1q_x3(const int8_t *src) { | ||
216 | return vld1q_s8_x3(src); | ||
217 | } | ||
218 | |||
219 | 1688 | static inline uint8x16x3_t vld1q_x3(const uint8_t *src) { | |
220 | 1688 | return vld1q_u8_x3(src); | |
221 | } | ||
222 | |||
223 | static inline int16x8x3_t vld1q_x3(const int16_t *src) { | ||
224 | return vld1q_s16_x3(src); | ||
225 | } | ||
226 | |||
227 | 432 | static inline uint16x8x3_t vld1q_x3(const uint16_t *src) { | |
228 | 432 | return vld1q_u16_x3(src); | |
229 | } | ||
230 | |||
231 | static inline int32x4x3_t vld1q_x3(const int32_t *src) { | ||
232 | return vld1q_s32_x3(src); | ||
233 | } | ||
234 | |||
235 | 432 | static inline uint32x4x3_t vld1q_x3(const uint32_t *src) { | |
236 | 432 | return vld1q_u32_x3(src); | |
237 | } | ||
238 | |||
239 | static inline int64x2x3_t vld1q_x3(const int64_t *src) { | ||
240 | return vld1q_s64_x3(src); | ||
241 | } | ||
242 | |||
243 | 432 | static inline uint64x2x3_t vld1q_x3(const uint64_t *src) { | |
244 | 432 | return vld1q_u64_x3(src); | |
245 | } | ||
246 | |||
247 | static inline float32x4x3_t vld1q_x3(const float32_t *src) { | ||
248 | return vld1q_f32_x3(src); | ||
249 | } | ||
250 | |||
251 | 884 | static inline int8x16x4_t vld1q_x4(const int8_t *src) { | |
252 | 884 | return vld1q_s8_x4(src); | |
253 | } | ||
254 | |||
255 | 1436 | static inline uint8x16x4_t vld1q_x4(const uint8_t *src) { | |
256 | 1436 | return vld1q_u8_x4(src); | |
257 | } | ||
258 | |||
259 | static inline int16x8x4_t vld1q_x4(const int16_t *src) { | ||
260 | return vld1q_s16_x4(src); | ||
261 | } | ||
262 | |||
263 | 432 | static inline uint16x8x4_t vld1q_x4(const uint16_t *src) { | |
264 | 432 | return vld1q_u16_x4(src); | |
265 | } | ||
266 | |||
267 | static inline int32x4x4_t vld1q_x4(const int32_t *src) { | ||
268 | return vld1q_s32_x4(src); | ||
269 | } | ||
270 | |||
271 | 432 | static inline uint32x4x4_t vld1q_x4(const uint32_t *src) { | |
272 | 432 | return vld1q_u32_x4(src); | |
273 | } | ||
274 | |||
275 | static inline int64x2x4_t vld1q_x4(const int64_t *src) { | ||
276 | return vld1q_s64_x4(src); | ||
277 | } | ||
278 | |||
279 | 432 | static inline uint64x2x4_t vld1q_x4(const uint64_t *src) { | |
280 | 432 | return vld1q_u64_x4(src); | |
281 | } | ||
282 | |||
283 | 457 | static inline float32x4x4_t vld1q_x4(const float32_t *src) { | |
284 | 457 | return vld1q_f32_x4(src); | |
285 | } | ||
286 | |||
287 | static inline void vst1q_x2(int8_t *dst, int8x16x2_t vec) { | ||
288 | vst1q_s8_x2(dst, vec); | ||
289 | } | ||
290 | |||
291 | 6904 | static inline void vst1q_x2(uint8_t *dst, uint8x16x2_t vec) { | |
292 | 6904 | vst1q_u8_x2(dst, vec); | |
293 | 6904 | } | |
294 | |||
295 | static inline void vst1q_x2(int16_t *dst, int16x8x2_t vec) { | ||
296 | vst1q_s16_x2(dst, vec); | ||
297 | } | ||
298 | |||
299 | 1992 | static inline void vst1q_x2(uint16_t *dst, uint16x8x2_t vec) { | |
300 | 1992 | vst1q_u16_x2(dst, vec); | |
301 | 1992 | } | |
302 | |||
303 | static inline void vst1q_x2(int32_t *dst, int32x4x2_t vec) { | ||
304 | vst1q_s32_x2(dst, vec); | ||
305 | } | ||
306 | |||
307 | 1992 | static inline void vst1q_x2(uint32_t *dst, uint32x4x2_t vec) { | |
308 | 1992 | vst1q_u32_x2(dst, vec); | |
309 | 1992 | } | |
310 | |||
311 | static inline void vst1q_x2(int64_t *dst, int64x2x2_t vec) { | ||
312 | vst1q_s64_x2(dst, vec); | ||
313 | } | ||
314 | |||
315 | 1992 | static inline void vst1q_x2(uint64_t *dst, uint64x2x2_t vec) { | |
316 | 1992 | vst1q_u64_x2(dst, vec); | |
317 | 1992 | } | |
318 | |||
319 | 73024 | static inline void vst1q_x2(float32_t *dst, float32x4x2_t vec) { | |
320 | 73024 | vst1q_f32_x2(dst, vec); | |
321 | 73024 | } | |
322 | |||
323 | static inline void vst1q_x3(int8_t *dst, int8x16x3_t vec) { | ||
324 | vst1q_s8_x3(dst, vec); | ||
325 | } | ||
326 | |||
327 | 1304 | static inline void vst1q_x3(uint8_t *dst, uint8x16x3_t vec) { | |
328 | 1304 | vst1q_u8_x3(dst, vec); | |
329 | 1304 | } | |
330 | |||
331 | static inline void vst1q_x3(int16_t *dst, int16x8x3_t vec) { | ||
332 | vst1q_s16_x3(dst, vec); | ||
333 | } | ||
334 | |||
335 | 432 | static inline void vst1q_x3(uint16_t *dst, uint16x8x3_t vec) { | |
336 | 432 | vst1q_u16_x3(dst, vec); | |
337 | 432 | } | |
338 | |||
339 | static inline void vst1q_x3(int32_t *dst, int32x4x3_t vec) { | ||
340 | vst1q_s32_x3(dst, vec); | ||
341 | } | ||
342 | |||
343 | static inline void vst1q_x3(uint32_t *dst, uint32x4x3_t vec) { | ||
344 | vst1q_u32_x3(dst, vec); | ||
345 | } | ||
346 | |||
347 | static inline void vst1q_x3(int64_t *dst, int64x2x3_t vec) { | ||
348 | vst1q_s64_x3(dst, vec); | ||
349 | } | ||
350 | |||
351 | 432 | static inline void vst1q_x3(uint64_t *dst, uint64x2x3_t vec) { | |
352 | 432 | vst1q_u64_x3(dst, vec); | |
353 | 432 | } | |
354 | |||
355 | static inline void vst1q_x3(float32_t *dst, float32x4x3_t vec) { | ||
356 | vst1q_f32_x3(dst, vec); | ||
357 | } | ||
358 | |||
359 | static inline void vst1q_x4(int8_t *dst, int8x16x4_t vec) { | ||
360 | vst1q_s8_x4(dst, vec); | ||
361 | } | ||
362 | |||
363 | 442 | static inline void vst1q_x4(uint8_t *dst, uint8x16x4_t vec) { | |
364 | 442 | vst1q_u8_x4(dst, vec); | |
365 | 442 | } | |
366 | |||
367 | static inline void vst1q_x4(int16_t *dst, int16x8x4_t vec) { | ||
368 | vst1q_s16_x4(dst, vec); | ||
369 | } | ||
370 | |||
371 | 5608 | static inline void vst1q_x4(uint16_t *dst, uint16x8x4_t vec) { | |
372 | 5608 | vst1q_u16_x4(dst, vec); | |
373 | 5608 | } | |
374 | |||
375 | static inline void vst1q_x4(int32_t *dst, int32x4x4_t vec) { | ||
376 | vst1q_s32_x4(dst, vec); | ||
377 | } | ||
378 | |||
379 | 816 | static inline void vst1q_x4(uint32_t *dst, uint32x4x4_t vec) { | |
380 | 816 | vst1q_u32_x4(dst, vec); | |
381 | 816 | } | |
382 | |||
383 | static inline void vst1q_x4(int64_t *dst, int64x2x4_t vec) { | ||
384 | vst1q_s64_x4(dst, vec); | ||
385 | } | ||
386 | |||
387 | 1632 | static inline void vst1q_x4(uint64_t *dst, uint64x2x4_t vec) { | |
388 | 1632 | vst1q_u64_x4(dst, vec); | |
389 | 1632 | } | |
390 | |||
391 | 312 | static inline void vst1q_x4(float32_t *dst, float32x4x4_t vec) { | |
392 | 312 | vst1q_f32_x4(dst, vec); | |
393 | 312 | } | |
394 | |||
395 | public: | ||
396 | #endif | ||
397 | |||
398 | // Loads a single vector from 'src'. | ||
399 | 4622 | static inline void load(const ScalarType *src, VectorType &vec) { | |
400 | 4622 | vec = vld1q(&src[0]); | |
401 | 4622 | } | |
402 | |||
403 | // Loads two consecutive vectors from 'src'. | ||
404 | 7870 | static inline void load(const ScalarType *src, Vector2Type &vec) { | |
405 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
406 | 7870 | vec = vld1q_x2(&src[0]); | |
407 | #else | ||
408 | vec = {vld1q(&src[0]), vld1q(&src[0] + num_lanes())}; | ||
409 | #endif | ||
410 | 7870 | } | |
411 | |||
412 | // Loads three consecutive vectors from 'src'. | ||
413 | 1256 | static inline void load(const ScalarType *src, Vector3Type &vec) { | |
414 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
415 | 1256 | vec = vld1q_x3(&src[0]); | |
416 | #else | ||
417 | vec = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()), | ||
418 | vld1q(&src[0] + (2 * num_lanes()))}; | ||
419 | #endif | ||
420 | 1256 | } | |
421 | |||
422 | // Loads four consecutive vectors from 'src'. | ||
423 | 2345 | static inline void load(const ScalarType *src, Vector4Type &vec) { | |
424 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
425 | 2345 | vec = vld1q_x4(&src[0]); | |
426 | #else | ||
427 | vec = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()), | ||
428 | vld1q(&src[0] + (2 * num_lanes())), | ||
429 | vld1q(&src[0] + (3 * num_lanes()))}; | ||
430 | #endif | ||
431 | 2345 | } | |
432 | |||
433 | // Loads two consecutive vectors from 'src'. | ||
434 | 304340 | static inline void load_consecutive(const ScalarType *src, VectorType &vec_0, | |
435 | VectorType &vec_1) { | ||
436 | 304340 | vec_0 = vld1q(&src[0]); | |
437 | 304340 | vec_1 = vld1q(&src[num_lanes()]); | |
438 | 304340 | } | |
439 | |||
440 | // Loads 2x2 consecutive vectors from 'src'. | ||
441 | 480 | static inline void load_consecutive(const ScalarType *src, Vector2Type &vec_0, | |
442 | Vector2Type &vec_1) { | ||
443 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
444 | 480 | vec_0 = vld1q_x2(&src[0]); | |
445 | 480 | vec_1 = vld1q_x2(&src[num_lanes() * 2]); | |
446 | #else | ||
447 | vec_0 = {vld1q(&src[0]), vld1q(&src[0] + num_lanes())}; | ||
448 | vec_1 = {vld1q(&src[num_lanes() * 2]), | ||
449 | vld1q(&src[num_lanes() * 2] + num_lanes())}; | ||
450 | #endif | ||
451 | 480 | } | |
452 | |||
453 | // Loads 2x3 consecutive vectors from 'src'. | ||
454 | 864 | static inline void load_consecutive(const ScalarType *src, Vector3Type &vec_0, | |
455 | Vector3Type &vec_1) { | ||
456 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
457 | 864 | vec_0 = vld1q_x3(&src[0]); | |
458 | 864 | vec_1 = vld1q_x3(&src[num_lanes() * 3]); | |
459 | #else | ||
460 | vec_0 = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()), | ||
461 | vld1q(&src[0] + (2 * num_lanes()))}; | ||
462 | vec_1 = {vld1q(&src[num_lanes() * 3]), | ||
463 | vld1q(&src[num_lanes() * 3] + num_lanes()), | ||
464 | vld1q(&src[num_lanes() * 3] + (2 * num_lanes()))}; | ||
465 | #endif | ||
466 | 864 | } | |
467 | |||
468 | // Loads 2x4 consecutive vectors from 'src'. | ||
469 | 864 | static inline void load_consecutive(const ScalarType *src, Vector4Type &vec_0, | |
470 | Vector4Type &vec_1) { | ||
471 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
472 | 864 | vec_0 = vld1q_x4(&src[0]); | |
473 | 864 | vec_1 = vld1q_x4(&src[num_lanes() * 4]); | |
474 | |||
475 | #else | ||
476 | vec_0 = {vld1q(&src[0]), vld1q(&src[0] + num_lanes()), | ||
477 | vld1q(&src[0] + (2 * num_lanes())), | ||
478 | vld1q(&src[0] + (3 * num_lanes()))}; | ||
479 | vec_1 = {vld1q(&src[num_lanes() * 4]), | ||
480 | vld1q(&src[num_lanes() * 4] + num_lanes()), | ||
481 | vld1q(&src[num_lanes() * 4] + (2 * num_lanes())), | ||
482 | vld1q(&src[num_lanes() * 4] + (3 * num_lanes()))}; | ||
483 | #endif | ||
484 | 864 | } | |
485 | |||
486 | // Stores a single vector to 'dst'. | ||
487 | 11800 | static inline void store(VectorType vec, ScalarType *dst) { | |
488 | 11800 | vst1q(&dst[0], vec); | |
489 | 11800 | } | |
490 | |||
491 | // Stores two consecutive vectors to 'dst'. | ||
492 | 85904 | static inline void store(Vector2Type vec, ScalarType *dst) { | |
493 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
494 | 85904 | vst1q_x2(&dst[0], vec); | |
495 | #else | ||
496 | vst1q(&dst[0], vec.val[0]); | ||
497 | vst1q(&dst[0] + num_lanes(), vec.val[1]); | ||
498 | #endif | ||
499 | 85904 | } | |
500 | |||
501 | // Stores three consecutive vectors to 'dst'. | ||
502 | 2168 | static inline void store(Vector3Type vec, ScalarType *dst) { | |
503 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
504 | 2168 | vst1q_x3(&dst[0], vec); | |
505 | #else | ||
506 | vst1q(&dst[0], vec.val[0]); | ||
507 | vst1q(&dst[0] + num_lanes(), vec.val[1]); | ||
508 | vst1q(&dst[0] + (2 * num_lanes()), vec.val[2]); | ||
509 | #endif | ||
510 | 2168 | } | |
511 | |||
512 | // Stores four consecutive vectors to 'dst'. | ||
513 | 8810 | static inline void store(Vector4Type vec, ScalarType *dst) { | |
514 | #if KLEIDICV_NEON_USE_CONTINUOUS_MULTIVEC_LS | ||
515 | 8810 | vst1q_x4(&dst[0], vec); | |
516 | #else | ||
517 | vst1q(&dst[0], vec.val[0]); | ||
518 | vst1q(&dst[0] + num_lanes(), vec.val[1]); | ||
519 | vst1q(&dst[0] + (2 * num_lanes()), vec.val[2]); | ||
520 | vst1q(&dst[0] + (3 * num_lanes()), vec.val[3]); | ||
521 | #endif | ||
522 | 8810 | } | |
523 | |||
524 | // Stores two consecutive vectors to 'dst'. | ||
525 | 47670 | static inline void store_consecutive(VectorType vec_0, VectorType vec_1, | |
526 | ScalarType *dst) { | ||
527 | 47670 | vst1q(&dst[0], vec_0); | |
528 | 47670 | vst1q(&dst[num_lanes()], vec_1); | |
529 | 47670 | } | |
530 | }; // end of class VecTraitsBase<ScalarType> | ||
531 | |||
532 | // Available NEON vector traits. | ||
533 | template <typename ScalarType> | ||
534 | class VecTraits : public VecTraitsBase<ScalarType> {}; | ||
535 | |||
536 | // NEON has no associated context yet. | ||
537 | using NeonContextType = Monostate; | ||
538 | |||
539 | // Adapter which simply adds context and forwards all arguments. | ||
540 | template <typename OperationType> | ||
541 | class OperationContextAdapter : public OperationBase<OperationType> { | ||
542 | // Shorten rows: no need to write 'this->'. | ||
543 | using OperationBase<OperationType>::operation; | ||
544 | |||
545 | public: | ||
546 | using ContextType = NeonContextType; | ||
547 | |||
548 | 6782 | explicit OperationContextAdapter(OperationType &operation) | |
549 | 6782 | : OperationBase<OperationType>(operation) {} | |
550 | |||
551 | // Forwards vector_path_2x() calls to the inner operation. | ||
552 | template <typename... ArgTypes> | ||
553 | 280270 | void vector_path_2x(ArgTypes &&...args) { | |
554 | 280270 | operation().vector_path_2x(ContextType{}, std::forward<ArgTypes>(args)...); | |
555 | 280270 | } | |
556 | |||
557 | // Forwards vector_path() calls to the inner operation. | ||
558 | template <typename... ArgTypes> | ||
559 | 8156 | void vector_path(ArgTypes &&...args) { | |
560 | 8156 | operation().vector_path(ContextType{}, std::forward<ArgTypes>(args)...); | |
561 | 8156 | } | |
562 | |||
563 | // Forwards remaining_path() calls to the inner operation. | ||
564 | template <typename... ArgTypes> | ||
565 | 9188 | void remaining_path(ArgTypes &&...args) { | |
566 | 9188 | operation().remaining_path(ContextType{}, std::forward<ArgTypes>(args)...); | |
567 | 9188 | } | |
568 | }; // end of class OperationContextAdapter<OperationType> | ||
569 | |||
570 | // Adapter which implements remaining_path() for general NEON operations. | ||
571 | template <typename OperationType> | ||
572 | class RemainingPathAdapter : public OperationBase<OperationType> { | ||
573 | public: | ||
574 | using ContextType = NeonContextType; | ||
575 | |||
576 | 6478 | explicit RemainingPathAdapter(OperationType &operation) | |
577 | 6478 | : OperationBase<OperationType>(operation) {} | |
578 | |||
579 | // Forwards remaining_path() calls to scalar_path() of the inner operation | ||
580 | // element by element. | ||
581 | template <typename... ColumnTypes> | ||
582 | 8532 | void remaining_path(ContextType ctx, size_t length, ColumnTypes... columns) { | |
583 |
24/24✓ Branch 0 taken 1941 times.
✓ Branch 1 taken 30088 times.
✓ Branch 2 taken 984 times.
✓ Branch 3 taken 11947 times.
✓ Branch 4 taken 863 times.
✓ Branch 5 taken 4571 times.
✓ Branch 6 taken 1974 times.
✓ Branch 7 taken 7958 times.
✓ Branch 8 taken 702 times.
✓ Branch 9 taken 2289 times.
✓ Branch 10 taken 648 times.
✓ Branch 11 taken 1935 times.
✓ Branch 12 taken 282 times.
✓ Branch 13 taken 324 times.
✓ Branch 14 taken 298 times.
✓ Branch 15 taken 336 times.
✓ Branch 16 taken 282 times.
✓ Branch 17 taken 288 times.
✓ Branch 18 taken 90 times.
✓ Branch 19 taken 96 times.
✓ Branch 20 taken 186 times.
✓ Branch 21 taken 192 times.
✓ Branch 22 taken 282 times.
✓ Branch 23 taken 288 times.
|
68844 | for (size_t index = 0; index < length; ++index) { |
584 | 60312 | disable_loop_vectorization(); | |
585 | 60312 | this->operation().scalar_path(ctx, columns.at(index)...); | |
586 | 60312 | } | |
587 | 8532 | } | |
588 | }; // end of class RemainingPathAdapter<OperationType> | ||
589 | |||
590 | // Adapter which implements remaining_path() for NEON operations which | ||
591 | // implementation custom processing of remaining elements. | ||
592 | template <typename OperationType> | ||
593 | class RemainingPathToScalarPathAdapter : public OperationBase<OperationType> { | ||
594 | public: | ||
595 | using ContextType = NeonContextType; | ||
596 | |||
597 | 304 | explicit RemainingPathToScalarPathAdapter(OperationType &operation) | |
598 | 304 | : OperationBase<OperationType>(operation) {} | |
599 | |||
600 | // Forwards remaining_path() calls to scalar_path() of the inner operation. | ||
601 | template <typename... ArgTypes> | ||
602 | 656 | void remaining_path(ArgTypes &&...args) { | |
603 | 656 | this->operation().scalar_path(std::forward<ArgTypes>(args)...); | |
604 | 656 | } | |
605 | }; // end of class RemainingPathToScalarPathAdapter<OperationType> | ||
606 | |||
607 | // Shorthand for applying a generic unrolled NEON operation. | ||
608 | template <typename OperationType, typename... ArgTypes> | ||
609 | 6010 | void apply_operation_by_rows(OperationType &operation, ArgTypes &&...args) { | |
610 | 6010 | RemoveContextAdapter remove_context_adapter{operation}; | |
611 | 6010 | OperationAdapter operation_adapter{remove_context_adapter}; | |
612 | 6010 | RemainingPathAdapter remaining_path_adapter{operation_adapter}; | |
613 | 6010 | OperationContextAdapter context_adapter{remaining_path_adapter}; | |
614 | 6010 | RowBasedOperation row_based_operation{context_adapter}; | |
615 | 6010 | zip_rows(row_based_operation, std::forward<ArgTypes>(args)...); | |
616 | 6010 | } | |
617 | |||
618 | // Shorthand for applying a generic unrolled and block-based NEON operation. | ||
619 | template <typename OperationType, typename... ArgTypes> | ||
620 | 468 | void apply_block_operation_by_rows(OperationType &operation, | |
621 | ArgTypes &&...args) { | ||
622 | 468 | RemoveContextAdapter remove_context_adapter{operation}; | |
623 | 468 | OperationAdapter operation_adapter{remove_context_adapter}; | |
624 | 468 | RemainingPathAdapter remaining_path_adapter{operation_adapter}; | |
625 | 468 | OperationContextAdapter context_adapter{remaining_path_adapter}; | |
626 | 468 | RowBasedBlockOperation block_operation{context_adapter}; | |
627 | 468 | zip_rows(block_operation, std::forward<ArgTypes>(args)...); | |
628 | 468 | } | |
629 | |||
630 | } // namespace kleidicv::neon | ||
631 | |||
632 | #endif // KLEIDICV_NEON_H | ||
633 |