KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/arithmetics/rotate_neon.cpp
Date: 2025-11-25 17:23:32
Exec Total Coverage
Lines: 97 97 100.0%
Functions: 40 43 93.0%
Branches: 130 131 99.2%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cassert>
6
7 #include "kleidicv/arithmetics/rotate.h"
8 #include "kleidicv/kleidicv.h"
9 #include "kleidicv/neon.h"
10
11 namespace kleidicv::neon {
12
13 template <const size_t BufferSize, const size_t Order, typename DstVectorType,
14 typename SrcType>
15 6080496 static void rotate_vectors_recursively(DstVectorType *dst_vectors,
16 Rows<const SrcType> src_rows) {
17 // Order is halved at every recursive call. Once it is 2, the recursion
18 // stops and the input data is read.
19 if constexpr (Order == 2) {
20 KLEIDICV_FORCE_LOOP_UNROLL
21
8/8
✓ Branch 0 taken 54336 times.
✓ Branch 1 taken 434688 times.
✓ Branch 2 taken 217296 times.
✓ Branch 3 taken 869184 times.
✓ Branch 4 taken 868704 times.
✓ Branch 5 taken 1737408 times.
✓ Branch 6 taken 3473856 times.
✓ Branch 7 taken 3473856 times.
11129328 for (size_t index = 0; index < BufferSize; index += Order) {
22 using SrcVectorType = typename VecTraits<SrcType>::VectorType;
23 6515136 SrcVectorType src_vector[2];
24
25 6515136 src_vector[0] = vld1q(&src_rows.at(index + 0)[0]);
26 6515136 src_vector[1] = vld1q(&src_rows.at(index + 1)[0]);
27
28 // If order is 2 then SrcVectorType is the same as DstVectorType
29 6515136 dst_vectors[index + 0] = vtrn1q(src_vector[0], src_vector[1]);
30 6515136 dst_vectors[index + 1] = vtrn2q(src_vector[0], src_vector[1]);
31 6515136 }
32 } else {
33 // First the input for the current rotate stage, which is the output of
34 // the previous stage, is created. The previous stage rotates
35 // elements half the size of the current stage and its order is also half of
36 // the current one.
37 1466304 half_element_width_t<DstVectorType> input[BufferSize];
38 1466304 constexpr size_t previous_order = Order / 2;
39
40 1466304 rotate_vectors_recursively<BufferSize, previous_order>(input, src_rows);
41
42 1466304 constexpr size_t half_order = Order / 2;
43
44 KLEIDICV_FORCE_LOOP_UNROLL
45
12/12
✓ Branch 0 taken 54336 times.
✓ Branch 1 taken 54336 times.
✓ Branch 2 taken 54336 times.
✓ Branch 3 taken 108672 times.
✓ Branch 4 taken 54336 times.
✓ Branch 5 taken 217344 times.
✓ Branch 6 taken 217296 times.
✓ Branch 7 taken 217296 times.
✓ Branch 8 taken 217296 times.
✓ Branch 9 taken 434592 times.
✓ Branch 10 taken 868704 times.
✓ Branch 11 taken 868704 times.
3367248 for (size_t outer_i = 0; outer_i < BufferSize; outer_i += Order) {
46 KLEIDICV_FORCE_LOOP_UNROLL
47
12/12
✓ Branch 0 taken 434688 times.
✓ Branch 1 taken 54336 times.
✓ Branch 2 taken 434688 times.
✓ Branch 3 taken 108672 times.
✓ Branch 4 taken 434688 times.
✓ Branch 5 taken 217344 times.
✓ Branch 6 taken 869184 times.
✓ Branch 7 taken 217296 times.
✓ Branch 8 taken 869184 times.
✓ Branch 9 taken 434592 times.
✓ Branch 10 taken 1737408 times.
✓ Branch 11 taken 868704 times.
6680784 for (size_t inner_i = 0; inner_i < half_order; ++inner_i) {
48 4779840 dst_vectors[outer_i + inner_i] =
49 9559680 vtrn1q(reinterpret_cast<DstVectorType>(input[outer_i + inner_i]),
50 4779840 reinterpret_cast<DstVectorType>(
51 4779840 input[outer_i + inner_i + half_order]));
52
53 4779840 dst_vectors[outer_i + half_order + inner_i] =
54 9559680 vtrn2q(reinterpret_cast<DstVectorType>(input[outer_i + inner_i]),
55 4779840 reinterpret_cast<DstVectorType>(
56 4779840 input[outer_i + inner_i + half_order]));
57 4779840 }
58 1900944 }
59 1466304 }
60 6080496 }
61
62 // Rotates one tile of data with vector instructions. The tile's width and
63 // height are the number of Neon lanes for the given type.
64 template <typename ScalarType>
65 4614192 static void vector_path(Rows<const ScalarType> src_rows,
66 Rows<ScalarType> dst_rows) {
67 4614192 constexpr size_t num_of_lanes = VecTraits<ScalarType>::num_lanes();
68 using SrcVectorType = typename VecTraits<ScalarType>::VectorType;
69
70 // The number of vectors read and written is the same as the lane count for
71 // the given element size.
72 4614192 constexpr size_t buffer_size = num_of_lanes;
73
74 // Last rotate step is always done on 64 bit elements
75 4614192 uint64x2_t trn_result_b64[buffer_size]; // NOLINT(runtime/arrays)
76
77 // The 64 bit rotate spans through all the vectors, so its "order" is the
78 // same as the number of vectors
79 4614192 constexpr size_t rotate_order_b64 = num_of_lanes;
80
81 9228384 rotate_vectors_recursively<buffer_size, rotate_order_b64>(trn_result_b64,
82 4614192 src_rows);
83
84 KLEIDICV_FORCE_LOOP_UNROLL
85
8/8
✓ Branch 0 taken 54336 times.
✓ Branch 1 taken 869376 times.
✓ Branch 2 taken 217296 times.
✓ Branch 3 taken 1738368 times.
✓ Branch 4 taken 868704 times.
✓ Branch 5 taken 3474816 times.
✓ Branch 6 taken 3473856 times.
✓ Branch 7 taken 6947712 times.
17644464 for (size_t index = 0; index < buffer_size; ++index) {
86 13030272 trn_result_b64[index] = vreinterpretq_u64(
87 13030272 vrev64q(reinterpret_cast<SrcVectorType>(trn_result_b64[index])));
88 26060544 trn_result_b64[index] = vcombine(vget_high(trn_result_b64[index]),
89 13030272 vget_low(trn_result_b64[index]));
90 13030272 vst1q(&dst_rows.at(index)[0], trn_result_b64[index]);
91 13030272 }
92 4614192 }
93
94 template <typename ScalarType>
95 76 static void scalar_path(Rows<const ScalarType> src_rows,
96 Rows<ScalarType> dst_rows, size_t height,
97 size_t width) {
98
8/8
✓ Branch 0 taken 40 times.
✓ Branch 1 taken 568 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 12 times.
✓ Branch 4 taken 12 times.
✓ Branch 5 taken 12 times.
✓ Branch 6 taken 12 times.
✓ Branch 7 taken 12 times.
680 for (size_t vindex = 0; vindex < height; ++vindex) {
99 604 disable_loop_vectorization();
100
8/8
✓ Branch 0 taken 8820 times.
✓ Branch 1 taken 568 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 12 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 12 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 12 times.
9448 for (size_t hindex = 0; hindex < width; ++hindex) {
101 8844 disable_loop_vectorization();
102 // dst[j][src_height - i - 1] = src[i][j]
103 8844 dst_rows.at(hindex)[height - vindex - 1] = src_rows.at(vindex)[hindex];
104 8844 }
105 604 }
106 76 }
107
108 template <typename ScalarType>
109 632 static kleidicv_error_t rotate(Rectangle rect, Rows<const ScalarType> src_rows,
110 Rows<ScalarType> dst_rows) {
111 632 constexpr size_t num_of_lanes = VecTraits<ScalarType>::num_lanes();
112 37616 auto handle_lane_number_of_rows = [&](size_t vindex) {
113 36984 LoopUnroll2<TryToAvoidTailLoop> horizontal_loop(rect.width(), num_of_lanes);
114
115 4651176 horizontal_loop.unroll_once([&](size_t hindex) {
116 // If the input is big enough, handle it tile by tile.
117 4614192 vector_path<ScalarType>(
118 4614192 src_rows.at(vindex, hindex),
119 4614192 dst_rows.at(hindex, rect.height() - vindex - num_of_lanes));
120 4614192 });
121
122 // This branch is needed even for TryToAvoidTailLoop
123 37008 horizontal_loop.remaining([&](size_t hindex, size_t final_hindex) {
124 48 scalar_path(src_rows.at(vindex, hindex),
125 24 dst_rows.at(hindex, rect.height() - vindex - num_of_lanes),
126 24 num_of_lanes, final_hindex - hindex);
127 24 });
128 36984 };
129
130 632 LoopUnroll2<TryToAvoidTailLoop> vertical_loop(rect.height(), num_of_lanes);
131
132 632 vertical_loop.unroll_once(handle_lane_number_of_rows);
133
134 684 vertical_loop.remaining([&](size_t vindex, size_t final_vindex) {
135 104 scalar_path(src_rows.at(vindex), dst_rows.at(0, 0), final_vindex - vindex,
136 52 rect.width());
137 52 });
138 632 return KLEIDICV_OK;
139 632 }
140
141 template <typename T>
142 744 static kleidicv_error_t rotate(const void *src_void, size_t src_stride,
143 size_t src_width, size_t src_height,
144 void *dst_void, size_t dst_stride) {
145
6/6
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 188 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 188 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 188 times.
744 MAKE_POINTER_CHECK_ALIGNMENT(const T, src, src_void);
146
6/6
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 184 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 184 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 184 times.
732 MAKE_POINTER_CHECK_ALIGNMENT(T, dst, dst_void);
147
16/16
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 164 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 164 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 176 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 176 times.
✓ Branch 8 taken 8 times.
✓ Branch 9 taken 176 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 176 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 176 times.
✓ Branch 14 taken 8 times.
✓ Branch 15 taken 176 times.
720 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
148
16/16
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 160 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 160 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 168 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 168 times.
✓ Branch 8 taken 8 times.
✓ Branch 9 taken 168 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 168 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 168 times.
✓ Branch 14 taken 8 times.
✓ Branch 15 taken 168 times.
692 CHECK_POINTER_AND_STRIDE(dst, dst_stride, src_width);
149
24/24
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 156 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 152 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 152 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 164 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 160 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 160 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 164 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 160 times.
✓ Branch 16 taken 8 times.
✓ Branch 17 taken 160 times.
✓ Branch 18 taken 4 times.
✓ Branch 19 taken 164 times.
✓ Branch 20 taken 4 times.
✓ Branch 21 taken 160 times.
✓ Branch 22 taken 8 times.
✓ Branch 23 taken 160 times.
664 CHECK_IMAGE_SIZE(src_width, src_height);
150
151 632 Rectangle rect{src_width, src_height};
152 632 Rows<T> dst_rows{dst, dst_stride};
153 632 Rows<const T> src_rows{src, src_stride};
154
155 632 return rotate(rect, src_rows, dst_rows);
156 744 }
157
158 KLEIDICV_TARGET_FN_ATTRS
159 756 kleidicv_error_t rotate(const void *src, size_t src_stride, size_t src_width,
160 size_t src_height, void *dst, size_t dst_stride,
161 int angle, size_t element_size) {
162
2/2
✓ Branch 0 taken 12 times.
✓ Branch 1 taken 744 times.
756 if (!rotate_is_implemented(src, dst, angle, element_size)) {
163 12 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
164 }
165
166
4/5
✗ Branch 0 not taken.
✓ Branch 1 taken 168 times.
✓ Branch 2 taken 192 times.
✓ Branch 3 taken 192 times.
✓ Branch 4 taken 192 times.
744 switch (element_size) {
167 case sizeof(uint8_t):
168 336 return rotate<uint8_t>(src, src_stride, src_width, src_height, dst,
169 168 dst_stride);
170 case sizeof(uint16_t):
171 384 return rotate<uint16_t>(src, src_stride, src_width, src_height, dst,
172 192 dst_stride);
173 case sizeof(uint32_t):
174 384 return rotate<uint32_t>(src, src_stride, src_width, src_height, dst,
175 192 dst_stride);
176 case sizeof(uint64_t):
177 384 return rotate<uint64_t>(src, src_stride, src_width, src_height, dst,
178 192 dst_stride);
179 // GCOVR_EXCL_START
180 default:
181 assert(!"element size not implemented");
182 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
183 // GCOVR_EXCL_STOP
184 }
185 756 }
186
187 } // namespace kleidicv::neon
188