Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #include <cassert> | ||
6 | |||
7 | #include "kleidicv/arithmetics/rotate.h" | ||
8 | #include "kleidicv/kleidicv.h" | ||
9 | #include "kleidicv/neon.h" | ||
10 | |||
11 | namespace kleidicv::neon { | ||
12 | |||
13 | template <const size_t BufferSize, const size_t Order, typename DstVectorType, | ||
14 | typename SrcType> | ||
15 | 3219216 | static void rotate_vectors_recursively(DstVectorType *dst_vectors, | |
16 | Rows<const SrcType> src_rows) { | ||
17 | // order is halved at every recursive call, once it is 2 the recursion should | ||
18 | // stop and the input data needs to be read. | ||
19 | if constexpr (Order == 2) { | ||
20 | KLEIDICV_FORCE_LOOP_UNROLL | ||
21 |
8/8✓ Branch 0 taken 28752 times.
✓ Branch 1 taken 230016 times.
✓ Branch 2 taken 115056 times.
✓ Branch 3 taken 460224 times.
✓ Branch 4 taken 459936 times.
✓ Branch 5 taken 919872 times.
✓ Branch 6 taken 1839168 times.
✓ Branch 7 taken 1839168 times.
|
5892192 | for (size_t index = 0; index < BufferSize; index += Order) { |
22 | using SrcVectorType = typename VecTraits<SrcType>::VectorType; | ||
23 | 3449280 | SrcVectorType src_vector[2]; | |
24 | |||
25 | 3449280 | src_vector[0] = vld1q(&src_rows.at(index + 0)[0]); | |
26 | 3449280 | src_vector[1] = vld1q(&src_rows.at(index + 1)[0]); | |
27 | |||
28 | // If order is 2 then SrcVectorType is the same as DstVectorType | ||
29 | 3449280 | dst_vectors[index + 0] = vtrn1q(src_vector[0], src_vector[1]); | |
30 | 3449280 | dst_vectors[index + 1] = vtrn2q(src_vector[0], src_vector[1]); | |
31 | 3449280 | } | |
32 | } else { | ||
33 | // First the input for the current rotate stage, which is the output of | ||
34 | // the previous stage, is created. The previous stage rotates | ||
35 | // elements half the size of the current stage and its order is also half of | ||
36 | // the current one. | ||
37 | 776304 | half_element_width_t<DstVectorType> input[BufferSize]; | |
38 | 776304 | constexpr size_t previous_order = Order / 2; | |
39 | |||
40 | 776304 | rotate_vectors_recursively<BufferSize, previous_order>(input, src_rows); | |
41 | |||
42 | 776304 | constexpr size_t half_order = Order / 2; | |
43 | |||
44 | KLEIDICV_FORCE_LOOP_UNROLL | ||
45 |
12/12✓ Branch 0 taken 28752 times.
✓ Branch 1 taken 28752 times.
✓ Branch 2 taken 28752 times.
✓ Branch 3 taken 57504 times.
✓ Branch 4 taken 28752 times.
✓ Branch 5 taken 115008 times.
✓ Branch 6 taken 115056 times.
✓ Branch 7 taken 115056 times.
✓ Branch 8 taken 115056 times.
✓ Branch 9 taken 230112 times.
✓ Branch 10 taken 459936 times.
✓ Branch 11 taken 459936 times.
|
1782672 | for (size_t outer_i = 0; outer_i < BufferSize; outer_i += Order) { |
46 | KLEIDICV_FORCE_LOOP_UNROLL | ||
47 |
12/12✓ Branch 0 taken 230016 times.
✓ Branch 1 taken 28752 times.
✓ Branch 2 taken 230016 times.
✓ Branch 3 taken 57504 times.
✓ Branch 4 taken 230016 times.
✓ Branch 5 taken 115008 times.
✓ Branch 6 taken 460224 times.
✓ Branch 7 taken 115056 times.
✓ Branch 8 taken 460224 times.
✓ Branch 9 taken 230112 times.
✓ Branch 10 taken 919872 times.
✓ Branch 11 taken 459936 times.
|
3536736 | for (size_t inner_i = 0; inner_i < half_order; ++inner_i) { |
48 | 2530368 | dst_vectors[outer_i + inner_i] = | |
49 | 5060736 | vtrn1q(reinterpret_cast<DstVectorType>(input[outer_i + inner_i]), | |
50 | 2530368 | reinterpret_cast<DstVectorType>( | |
51 | 2530368 | input[outer_i + inner_i + half_order])); | |
52 | |||
53 | 2530368 | dst_vectors[outer_i + half_order + inner_i] = | |
54 | 5060736 | vtrn2q(reinterpret_cast<DstVectorType>(input[outer_i + inner_i]), | |
55 | 2530368 | reinterpret_cast<DstVectorType>( | |
56 | 2530368 | input[outer_i + inner_i + half_order])); | |
57 | 2530368 | } | |
58 | 1006368 | } | |
59 | 776304 | } | |
60 | 3219216 | } | |
61 | |||
62 | // Rotates one tile of data with vector instructions. The tile's width and | ||
63 | // height are the number of Neon lanes for the given type. | ||
64 | template <typename ScalarType> | ||
65 | 2442912 | static void vector_path(Rows<const ScalarType> src_rows, | |
66 | Rows<ScalarType> dst_rows) { | ||
67 | 2442912 | constexpr size_t num_of_lanes = VecTraits<ScalarType>::num_lanes(); | |
68 | using SrcVectorType = typename VecTraits<ScalarType>::VectorType; | ||
69 | |||
70 | // The number of vectors read and write is the same as the lane count of the | ||
71 | // given element size | ||
72 | 2442912 | constexpr size_t buffer_size = num_of_lanes; | |
73 | |||
74 | // Last rotate step is always done on 64 bit elements | ||
75 | 2442912 | uint64x2_t trn_result_b64[buffer_size]; // NOLINT(runtime/arrays) | |
76 | |||
77 | // The 64 bit rotate spans through all the vectors, so its "order" is the | ||
78 | // same as the number of vectors | ||
79 | 2442912 | constexpr size_t rotate_order_b64 = num_of_lanes; | |
80 | |||
81 | 4885824 | rotate_vectors_recursively<buffer_size, rotate_order_b64>(trn_result_b64, | |
82 | 2442912 | src_rows); | |
83 | |||
84 | KLEIDICV_FORCE_LOOP_UNROLL | ||
85 |
8/8✓ Branch 0 taken 28752 times.
✓ Branch 1 taken 460032 times.
✓ Branch 2 taken 115056 times.
✓ Branch 3 taken 920448 times.
✓ Branch 4 taken 459936 times.
✓ Branch 5 taken 1839744 times.
✓ Branch 6 taken 1839168 times.
✓ Branch 7 taken 3678336 times.
|
9341472 | for (size_t index = 0; index < buffer_size; ++index) { |
86 | 6898560 | trn_result_b64[index] = vreinterpretq_u64( | |
87 | 6898560 | vrev64q(reinterpret_cast<SrcVectorType>(trn_result_b64[index]))); | |
88 | 13797120 | trn_result_b64[index] = vcombine(vget_high(trn_result_b64[index]), | |
89 | 6898560 | vget_low(trn_result_b64[index])); | |
90 | 6898560 | vst1q(&dst_rows.at(index)[0], trn_result_b64[index]); | |
91 | 6898560 | } | |
92 | 2442912 | } | |
93 | |||
94 | template <typename ScalarType> | ||
95 | 66 | static void scalar_path(Rows<const ScalarType> src_rows, | |
96 | Rows<ScalarType> dst_rows, size_t height, | ||
97 | size_t width) { | ||
98 |
8/8✓ Branch 0 taken 39 times.
✓ Branch 1 taken 567 times.
✓ Branch 2 taken 9 times.
✓ Branch 3 taken 9 times.
✓ Branch 4 taken 9 times.
✓ Branch 5 taken 9 times.
✓ Branch 6 taken 9 times.
✓ Branch 7 taken 9 times.
|
660 | for (size_t vindex = 0; vindex < height; ++vindex) { |
99 | 594 | disable_loop_vectorization(); | |
100 |
8/8✓ Branch 0 taken 8820 times.
✓ Branch 1 taken 567 times.
✓ Branch 2 taken 6 times.
✓ Branch 3 taken 9 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 9 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 9 times.
|
9432 | for (size_t hindex = 0; hindex < width; ++hindex) { |
101 | 8838 | disable_loop_vectorization(); | |
102 | // dst[j][src_height - i - 1] = src[i][j] | ||
103 | 8838 | dst_rows.at(hindex)[height - vindex - 1] = src_rows.at(vindex)[hindex]; | |
104 | 8838 | } | |
105 | 594 | } | |
106 | 66 | } | |
107 | |||
108 | template <typename ScalarType> | ||
109 | 474 | static kleidicv_error_t rotate(Rectangle rect, Rows<const ScalarType> src_rows, | |
110 | Rows<ScalarType> dst_rows) { | ||
111 | 474 | constexpr size_t num_of_lanes = VecTraits<ScalarType>::num_lanes(); | |
112 | 22674 | auto handle_lane_number_of_rows = [&](size_t vindex) { | |
113 | 22200 | LoopUnroll2<TryToAvoidTailLoop> horizontal_loop(rect.width(), num_of_lanes); | |
114 | |||
115 | 2465112 | horizontal_loop.unroll_once([&](size_t hindex) { | |
116 | // if the input is big enough handle it tile by tile | ||
117 | 2442912 | vector_path<ScalarType>( | |
118 | 2442912 | src_rows.at(vindex, hindex), | |
119 | 2442912 | dst_rows.at(hindex, rect.height() - vindex - num_of_lanes)); | |
120 | 2442912 | }); | |
121 | |||
122 | // This branch is needed even for TryToAvoidTailLoop | ||
123 | 22224 | horizontal_loop.remaining([&](size_t hindex, size_t final_hindex) { | |
124 | 48 | scalar_path(src_rows.at(vindex, hindex), | |
125 | 24 | dst_rows.at(hindex, rect.height() - vindex - num_of_lanes), | |
126 | 24 | num_of_lanes, final_hindex - hindex); | |
127 | 24 | }); | |
128 | 22200 | }; | |
129 | |||
130 | 474 | LoopUnroll2<TryToAvoidTailLoop> vertical_loop(rect.height(), num_of_lanes); | |
131 | |||
132 | 474 | vertical_loop.unroll_once(handle_lane_number_of_rows); | |
133 | |||
134 | 516 | vertical_loop.remaining([&](size_t vindex, size_t final_vindex) { | |
135 | 84 | scalar_path(src_rows.at(vindex), dst_rows.at(0, 0), final_vindex - vindex, | |
136 | 42 | rect.width()); | |
137 | 42 | }); | |
138 | 474 | return KLEIDICV_OK; | |
139 | 474 | } | |
140 | |||
141 | template <typename T> | ||
142 | 558 | static kleidicv_error_t rotate(const void *src_void, size_t src_stride, | |
143 | size_t src_width, size_t src_height, | ||
144 | void *dst_void, size_t dst_stride) { | ||
145 |
6/6✓ Branch 0 taken 3 times.
✓ Branch 1 taken 141 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 141 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 141 times.
|
558 | MAKE_POINTER_CHECK_ALIGNMENT(const T, src, src_void); |
146 |
6/6✓ Branch 0 taken 3 times.
✓ Branch 1 taken 138 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 138 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 138 times.
|
549 | MAKE_POINTER_CHECK_ALIGNMENT(T, dst, dst_void); |
147 |
16/16✓ Branch 0 taken 3 times.
✓ Branch 1 taken 123 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 123 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 132 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 132 times.
✓ Branch 8 taken 6 times.
✓ Branch 9 taken 132 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 132 times.
✓ Branch 12 taken 6 times.
✓ Branch 13 taken 132 times.
✓ Branch 14 taken 6 times.
✓ Branch 15 taken 132 times.
|
540 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
148 |
16/16✓ Branch 0 taken 3 times.
✓ Branch 1 taken 120 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 120 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 126 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 126 times.
✓ Branch 8 taken 6 times.
✓ Branch 9 taken 126 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 126 times.
✓ Branch 12 taken 6 times.
✓ Branch 13 taken 126 times.
✓ Branch 14 taken 6 times.
✓ Branch 15 taken 126 times.
|
519 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, src_width); |
149 |
24/24✓ Branch 0 taken 3 times.
✓ Branch 1 taken 117 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 114 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 114 times.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 123 times.
✓ Branch 8 taken 3 times.
✓ Branch 9 taken 120 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 120 times.
✓ Branch 12 taken 3 times.
✓ Branch 13 taken 123 times.
✓ Branch 14 taken 3 times.
✓ Branch 15 taken 120 times.
✓ Branch 16 taken 6 times.
✓ Branch 17 taken 120 times.
✓ Branch 18 taken 3 times.
✓ Branch 19 taken 123 times.
✓ Branch 20 taken 3 times.
✓ Branch 21 taken 120 times.
✓ Branch 22 taken 6 times.
✓ Branch 23 taken 120 times.
|
498 | CHECK_IMAGE_SIZE(src_width, src_height); |
150 | |||
151 | 474 | Rectangle rect{src_width, src_height}; | |
152 | 474 | Rows<T> dst_rows{dst, dst_stride}; | |
153 | 474 | Rows<const T> src_rows{src, src_stride}; | |
154 | |||
155 | 474 | return rotate(rect, src_rows, dst_rows); | |
156 | 558 | } | |
157 | |||
158 | KLEIDICV_TARGET_FN_ATTRS | ||
159 | 567 | kleidicv_error_t rotate(const void *src, size_t src_stride, size_t src_width, | |
160 | size_t src_height, void *dst, size_t dst_stride, | ||
161 | int angle, size_t element_size) { | ||
162 |
2/2✓ Branch 0 taken 9 times.
✓ Branch 1 taken 558 times.
|
567 | if (!rotate_is_implemented(src, dst, angle, element_size)) { |
163 | 9 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
164 | } | ||
165 | |||
166 |
4/5✗ Branch 0 not taken.
✓ Branch 1 taken 126 times.
✓ Branch 2 taken 144 times.
✓ Branch 3 taken 144 times.
✓ Branch 4 taken 144 times.
|
558 | switch (element_size) { |
167 | case sizeof(uint8_t): | ||
168 | 252 | return rotate<uint8_t>(src, src_stride, src_width, src_height, dst, | |
169 | 126 | dst_stride); | |
170 | case sizeof(uint16_t): | ||
171 | 288 | return rotate<uint16_t>(src, src_stride, src_width, src_height, dst, | |
172 | 144 | dst_stride); | |
173 | case sizeof(uint32_t): | ||
174 | 288 | return rotate<uint32_t>(src, src_stride, src_width, src_height, dst, | |
175 | 144 | dst_stride); | |
176 | case sizeof(uint64_t): | ||
177 | 288 | return rotate<uint64_t>(src, src_stride, src_width, src_height, dst, | |
178 | 144 | dst_stride); | |
179 | // GCOVR_EXCL_START | ||
180 | default: | ||
181 | assert(!"element size not implemented"); | ||
182 | − | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
183 | // GCOVR_EXCL_STOP | ||
184 | } | ||
185 | 567 | } | |
186 | |||
187 | } // namespace kleidicv::neon | ||
188 |