KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/arithmetics/rotate_neon.cpp
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 97 97 100.0%
Functions: 40 43 93.0%
Branches: 130 131 99.2%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cassert>
6
7 #include "kleidicv/arithmetics/rotate.h"
8 #include "kleidicv/kleidicv.h"
9 #include "kleidicv/neon.h"
10
11 namespace kleidicv::neon {
12
13 template <const size_t BufferSize, const size_t Order, typename DstVectorType,
14 typename SrcType>
15 3219216 static void rotate_vectors_recursively(DstVectorType *dst_vectors,
16 Rows<const SrcType> src_rows) {
17 // order is halved at every recursive call, once it is 2 the recursion should
18 // stop and the input data needs to be read.
19 if constexpr (Order == 2) {
20 KLEIDICV_FORCE_LOOP_UNROLL
21
8/8
✓ Branch 0 taken 28752 times.
✓ Branch 1 taken 230016 times.
✓ Branch 2 taken 115056 times.
✓ Branch 3 taken 460224 times.
✓ Branch 4 taken 459936 times.
✓ Branch 5 taken 919872 times.
✓ Branch 6 taken 1839168 times.
✓ Branch 7 taken 1839168 times.
5892192 for (size_t index = 0; index < BufferSize; index += Order) {
22 using SrcVectorType = typename VecTraits<SrcType>::VectorType;
23 3449280 SrcVectorType src_vector[2];
24
25 3449280 src_vector[0] = vld1q(&src_rows.at(index + 0)[0]);
26 3449280 src_vector[1] = vld1q(&src_rows.at(index + 1)[0]);
27
28 // If order is 2 then SrcVectorType is the same as DstVectorType
29 3449280 dst_vectors[index + 0] = vtrn1q(src_vector[0], src_vector[1]);
30 3449280 dst_vectors[index + 1] = vtrn2q(src_vector[0], src_vector[1]);
31 3449280 }
32 } else {
33 // First the input for the current rotate stage, which is the output of
34 // the previous stage, is created. The previous stage rotates
35 // elements half the size of the current stage and its order is also half of
36 // the current one.
37 776304 half_element_width_t<DstVectorType> input[BufferSize];
38 776304 constexpr size_t previous_order = Order / 2;
39
40 776304 rotate_vectors_recursively<BufferSize, previous_order>(input, src_rows);
41
42 776304 constexpr size_t half_order = Order / 2;
43
44 KLEIDICV_FORCE_LOOP_UNROLL
45
12/12
✓ Branch 0 taken 28752 times.
✓ Branch 1 taken 28752 times.
✓ Branch 2 taken 28752 times.
✓ Branch 3 taken 57504 times.
✓ Branch 4 taken 28752 times.
✓ Branch 5 taken 115008 times.
✓ Branch 6 taken 115056 times.
✓ Branch 7 taken 115056 times.
✓ Branch 8 taken 115056 times.
✓ Branch 9 taken 230112 times.
✓ Branch 10 taken 459936 times.
✓ Branch 11 taken 459936 times.
1782672 for (size_t outer_i = 0; outer_i < BufferSize; outer_i += Order) {
46 KLEIDICV_FORCE_LOOP_UNROLL
47
12/12
✓ Branch 0 taken 230016 times.
✓ Branch 1 taken 28752 times.
✓ Branch 2 taken 230016 times.
✓ Branch 3 taken 57504 times.
✓ Branch 4 taken 230016 times.
✓ Branch 5 taken 115008 times.
✓ Branch 6 taken 460224 times.
✓ Branch 7 taken 115056 times.
✓ Branch 8 taken 460224 times.
✓ Branch 9 taken 230112 times.
✓ Branch 10 taken 919872 times.
✓ Branch 11 taken 459936 times.
3536736 for (size_t inner_i = 0; inner_i < half_order; ++inner_i) {
48 2530368 dst_vectors[outer_i + inner_i] =
49 5060736 vtrn1q(reinterpret_cast<DstVectorType>(input[outer_i + inner_i]),
50 2530368 reinterpret_cast<DstVectorType>(
51 2530368 input[outer_i + inner_i + half_order]));
52
53 2530368 dst_vectors[outer_i + half_order + inner_i] =
54 5060736 vtrn2q(reinterpret_cast<DstVectorType>(input[outer_i + inner_i]),
55 2530368 reinterpret_cast<DstVectorType>(
56 2530368 input[outer_i + inner_i + half_order]));
57 2530368 }
58 1006368 }
59 776304 }
60 3219216 }
61
62 // Rotates one tile of data with vector instructions. The tile's width and
63 // height are the number of Neon lanes for the given type.
64 template <typename ScalarType>
65 2442912 static void vector_path(Rows<const ScalarType> src_rows,
66 Rows<ScalarType> dst_rows) {
67 2442912 constexpr size_t num_of_lanes = VecTraits<ScalarType>::num_lanes();
68 using SrcVectorType = typename VecTraits<ScalarType>::VectorType;
69
70 // The number of vectors read and write is the same as the lane count of the
71 // given element size
72 2442912 constexpr size_t buffer_size = num_of_lanes;
73
74 // Last rotate step is always done on 64 bit elements
75 2442912 uint64x2_t trn_result_b64[buffer_size]; // NOLINT(runtime/arrays)
76
77 // The 64 bit rotate spans through all the vectors, so its "order" is the
78 // same as the number of vectors
79 2442912 constexpr size_t rotate_order_b64 = num_of_lanes;
80
81 4885824 rotate_vectors_recursively<buffer_size, rotate_order_b64>(trn_result_b64,
82 2442912 src_rows);
83
84 KLEIDICV_FORCE_LOOP_UNROLL
85
8/8
✓ Branch 0 taken 28752 times.
✓ Branch 1 taken 460032 times.
✓ Branch 2 taken 115056 times.
✓ Branch 3 taken 920448 times.
✓ Branch 4 taken 459936 times.
✓ Branch 5 taken 1839744 times.
✓ Branch 6 taken 1839168 times.
✓ Branch 7 taken 3678336 times.
9341472 for (size_t index = 0; index < buffer_size; ++index) {
86 6898560 trn_result_b64[index] = vreinterpretq_u64(
87 6898560 vrev64q(reinterpret_cast<SrcVectorType>(trn_result_b64[index])));
88 13797120 trn_result_b64[index] = vcombine(vget_high(trn_result_b64[index]),
89 6898560 vget_low(trn_result_b64[index]));
90 6898560 vst1q(&dst_rows.at(index)[0], trn_result_b64[index]);
91 6898560 }
92 2442912 }
93
94 template <typename ScalarType>
95 66 static void scalar_path(Rows<const ScalarType> src_rows,
96 Rows<ScalarType> dst_rows, size_t height,
97 size_t width) {
98
8/8
✓ Branch 0 taken 39 times.
✓ Branch 1 taken 567 times.
✓ Branch 2 taken 9 times.
✓ Branch 3 taken 9 times.
✓ Branch 4 taken 9 times.
✓ Branch 5 taken 9 times.
✓ Branch 6 taken 9 times.
✓ Branch 7 taken 9 times.
660 for (size_t vindex = 0; vindex < height; ++vindex) {
99 594 disable_loop_vectorization();
100
8/8
✓ Branch 0 taken 8820 times.
✓ Branch 1 taken 567 times.
✓ Branch 2 taken 6 times.
✓ Branch 3 taken 9 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 9 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 9 times.
9432 for (size_t hindex = 0; hindex < width; ++hindex) {
101 8838 disable_loop_vectorization();
102 // dst[j][src_height - i - 1] = src[i][j]
103 8838 dst_rows.at(hindex)[height - vindex - 1] = src_rows.at(vindex)[hindex];
104 8838 }
105 594 }
106 66 }
107
108 template <typename ScalarType>
109 474 static kleidicv_error_t rotate(Rectangle rect, Rows<const ScalarType> src_rows,
110 Rows<ScalarType> dst_rows) {
111 474 constexpr size_t num_of_lanes = VecTraits<ScalarType>::num_lanes();
112 22674 auto handle_lane_number_of_rows = [&](size_t vindex) {
113 22200 LoopUnroll2<TryToAvoidTailLoop> horizontal_loop(rect.width(), num_of_lanes);
114
115 2465112 horizontal_loop.unroll_once([&](size_t hindex) {
116 // if the input is big enough handle it tile by tile
117 2442912 vector_path<ScalarType>(
118 2442912 src_rows.at(vindex, hindex),
119 2442912 dst_rows.at(hindex, rect.height() - vindex - num_of_lanes));
120 2442912 });
121
122 // This branch is needed even for TryToAvoidTailLoop
123 22224 horizontal_loop.remaining([&](size_t hindex, size_t final_hindex) {
124 48 scalar_path(src_rows.at(vindex, hindex),
125 24 dst_rows.at(hindex, rect.height() - vindex - num_of_lanes),
126 24 num_of_lanes, final_hindex - hindex);
127 24 });
128 22200 };
129
130 474 LoopUnroll2<TryToAvoidTailLoop> vertical_loop(rect.height(), num_of_lanes);
131
132 474 vertical_loop.unroll_once(handle_lane_number_of_rows);
133
134 516 vertical_loop.remaining([&](size_t vindex, size_t final_vindex) {
135 84 scalar_path(src_rows.at(vindex), dst_rows.at(0, 0), final_vindex - vindex,
136 42 rect.width());
137 42 });
138 474 return KLEIDICV_OK;
139 474 }
140
141 template <typename T>
142 558 static kleidicv_error_t rotate(const void *src_void, size_t src_stride,
143 size_t src_width, size_t src_height,
144 void *dst_void, size_t dst_stride) {
145
6/6
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 141 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 141 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 141 times.
558 MAKE_POINTER_CHECK_ALIGNMENT(const T, src, src_void);
146
6/6
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 138 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 138 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 138 times.
549 MAKE_POINTER_CHECK_ALIGNMENT(T, dst, dst_void);
147
16/16
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 123 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 123 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 132 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 132 times.
✓ Branch 8 taken 6 times.
✓ Branch 9 taken 132 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 132 times.
✓ Branch 12 taken 6 times.
✓ Branch 13 taken 132 times.
✓ Branch 14 taken 6 times.
✓ Branch 15 taken 132 times.
540 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
148
16/16
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 120 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 120 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 126 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 126 times.
✓ Branch 8 taken 6 times.
✓ Branch 9 taken 126 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 126 times.
✓ Branch 12 taken 6 times.
✓ Branch 13 taken 126 times.
✓ Branch 14 taken 6 times.
✓ Branch 15 taken 126 times.
519 CHECK_POINTER_AND_STRIDE(dst, dst_stride, src_width);
149
24/24
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 117 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 114 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 114 times.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 123 times.
✓ Branch 8 taken 3 times.
✓ Branch 9 taken 120 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 120 times.
✓ Branch 12 taken 3 times.
✓ Branch 13 taken 123 times.
✓ Branch 14 taken 3 times.
✓ Branch 15 taken 120 times.
✓ Branch 16 taken 6 times.
✓ Branch 17 taken 120 times.
✓ Branch 18 taken 3 times.
✓ Branch 19 taken 123 times.
✓ Branch 20 taken 3 times.
✓ Branch 21 taken 120 times.
✓ Branch 22 taken 6 times.
✓ Branch 23 taken 120 times.
498 CHECK_IMAGE_SIZE(src_width, src_height);
150
151 474 Rectangle rect{src_width, src_height};
152 474 Rows<T> dst_rows{dst, dst_stride};
153 474 Rows<const T> src_rows{src, src_stride};
154
155 474 return rotate(rect, src_rows, dst_rows);
156 558 }
157
158 KLEIDICV_TARGET_FN_ATTRS
159 567 kleidicv_error_t rotate(const void *src, size_t src_stride, size_t src_width,
160 size_t src_height, void *dst, size_t dst_stride,
161 int angle, size_t element_size) {
162
2/2
✓ Branch 0 taken 9 times.
✓ Branch 1 taken 558 times.
567 if (!rotate_is_implemented(src, dst, angle, element_size)) {
163 9 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
164 }
165
166
4/5
✗ Branch 0 not taken.
✓ Branch 1 taken 126 times.
✓ Branch 2 taken 144 times.
✓ Branch 3 taken 144 times.
✓ Branch 4 taken 144 times.
558 switch (element_size) {
167 case sizeof(uint8_t):
168 252 return rotate<uint8_t>(src, src_stride, src_width, src_height, dst,
169 126 dst_stride);
170 case sizeof(uint16_t):
171 288 return rotate<uint16_t>(src, src_stride, src_width, src_height, dst,
172 144 dst_stride);
173 case sizeof(uint32_t):
174 288 return rotate<uint32_t>(src, src_stride, src_width, src_height, dst,
175 144 dst_stride);
176 case sizeof(uint64_t):
177 288 return rotate<uint64_t>(src, src_stride, src_width, src_height, dst,
178 144 dst_stride);
179 // GCOVR_EXCL_START
180 default:
181 assert(!"element size not implemented");
182 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
183 // GCOVR_EXCL_STOP
184 }
185 567 }
186
187 } // namespace kleidicv::neon
188