| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include <cassert> | ||
| 6 | |||
| 7 | #include "kleidicv/arithmetics/rotate.h" | ||
| 8 | #include "kleidicv/kleidicv.h" | ||
| 9 | #include "kleidicv/neon.h" | ||
| 10 | |||
| 11 | namespace kleidicv::neon { | ||
| 12 | |||
| 13 | template <const size_t BufferSize, const size_t Order, typename DstVectorType, | ||
| 14 | typename SrcType> | ||
| 15 | 6080496 | static void rotate_vectors_recursively(DstVectorType *dst_vectors, | |
| 16 | Rows<const SrcType> src_rows) { | ||
| 17 | // Order is halved at every recursive call; once it reaches 2 the recursion | ||
| 18 | // stops and the input data is read from the source rows. | ||
| 19 | if constexpr (Order == 2) { | ||
| 20 | KLEIDICV_FORCE_LOOP_UNROLL | ||
| 21 |
8/8✓ Branch 0 taken 54336 times.
✓ Branch 1 taken 434688 times.
✓ Branch 2 taken 217296 times.
✓ Branch 3 taken 869184 times.
✓ Branch 4 taken 868704 times.
✓ Branch 5 taken 1737408 times.
✓ Branch 6 taken 3473856 times.
✓ Branch 7 taken 3473856 times.
|
11129328 | for (size_t index = 0; index < BufferSize; index += Order) { |
| 22 | using SrcVectorType = typename VecTraits<SrcType>::VectorType; | ||
| 23 | 6515136 | SrcVectorType src_vector[2]; | |
| 24 | |||
| 25 | 6515136 | src_vector[0] = vld1q(&src_rows.at(index + 0)[0]); | |
| 26 | 6515136 | src_vector[1] = vld1q(&src_rows.at(index + 1)[0]); | |
| 27 | |||
| 28 | // If order is 2 then SrcVectorType is the same as DstVectorType | ||
| 29 | 6515136 | dst_vectors[index + 0] = vtrn1q(src_vector[0], src_vector[1]); | |
| 30 | 6515136 | dst_vectors[index + 1] = vtrn2q(src_vector[0], src_vector[1]); | |
| 31 | 6515136 | } | |
| 32 | } else { | ||
| 33 | // First the input for the current rotate stage, which is the output of | ||
| 34 | // the previous stage, is created. The previous stage rotates | ||
| 35 | // elements half the size of the current stage and its order is also half of | ||
| 36 | // the current one. | ||
| 37 | 1466304 | half_element_width_t<DstVectorType> input[BufferSize]; | |
| 38 | 1466304 | constexpr size_t previous_order = Order / 2; | |
| 39 | |||
| 40 | 1466304 | rotate_vectors_recursively<BufferSize, previous_order>(input, src_rows); | |
| 41 | |||
| 42 | 1466304 | constexpr size_t half_order = Order / 2; | |
| 43 | |||
| 44 | KLEIDICV_FORCE_LOOP_UNROLL | ||
| 45 |
12/12✓ Branch 0 taken 54336 times.
✓ Branch 1 taken 54336 times.
✓ Branch 2 taken 54336 times.
✓ Branch 3 taken 108672 times.
✓ Branch 4 taken 54336 times.
✓ Branch 5 taken 217344 times.
✓ Branch 6 taken 217296 times.
✓ Branch 7 taken 217296 times.
✓ Branch 8 taken 217296 times.
✓ Branch 9 taken 434592 times.
✓ Branch 10 taken 868704 times.
✓ Branch 11 taken 868704 times.
|
3367248 | for (size_t outer_i = 0; outer_i < BufferSize; outer_i += Order) { |
| 46 | KLEIDICV_FORCE_LOOP_UNROLL | ||
| 47 |
12/12✓ Branch 0 taken 434688 times.
✓ Branch 1 taken 54336 times.
✓ Branch 2 taken 434688 times.
✓ Branch 3 taken 108672 times.
✓ Branch 4 taken 434688 times.
✓ Branch 5 taken 217344 times.
✓ Branch 6 taken 869184 times.
✓ Branch 7 taken 217296 times.
✓ Branch 8 taken 869184 times.
✓ Branch 9 taken 434592 times.
✓ Branch 10 taken 1737408 times.
✓ Branch 11 taken 868704 times.
|
6680784 | for (size_t inner_i = 0; inner_i < half_order; ++inner_i) { |
| 48 | 4779840 | dst_vectors[outer_i + inner_i] = | |
| 49 | 9559680 | vtrn1q(reinterpret_cast<DstVectorType>(input[outer_i + inner_i]), | |
| 50 | 4779840 | reinterpret_cast<DstVectorType>( | |
| 51 | 4779840 | input[outer_i + inner_i + half_order])); | |
| 52 | |||
| 53 | 4779840 | dst_vectors[outer_i + half_order + inner_i] = | |
| 54 | 9559680 | vtrn2q(reinterpret_cast<DstVectorType>(input[outer_i + inner_i]), | |
| 55 | 4779840 | reinterpret_cast<DstVectorType>( | |
| 56 | 4779840 | input[outer_i + inner_i + half_order])); | |
| 57 | 4779840 | } | |
| 58 | 1900944 | } | |
| 59 | 1466304 | } | |
| 60 | 6080496 | } | |
| 61 | |||
| 62 | // Rotates one tile of data with vector instructions. The tile's width and | ||
| 63 | // height are the number of Neon lanes for the given type. | ||
| 64 | template <typename ScalarType> | ||
| 65 | 4614192 | static void vector_path(Rows<const ScalarType> src_rows, | |
| 66 | Rows<ScalarType> dst_rows) { | ||
| 67 | 4614192 | constexpr size_t num_of_lanes = VecTraits<ScalarType>::num_lanes(); | |
| 68 | using SrcVectorType = typename VecTraits<ScalarType>::VectorType; | ||
| 69 | |||
| 70 | // The number of vectors read and written is the same as the lane count of | ||
| 71 | // the given element size. | ||
| 72 | 4614192 | constexpr size_t buffer_size = num_of_lanes; | |
| 73 | |||
| 74 | // Last rotate step is always done on 64 bit elements | ||
| 75 | 4614192 | uint64x2_t trn_result_b64[buffer_size]; // NOLINT(runtime/arrays) | |
| 76 | |||
| 77 | // The 64 bit rotate spans through all the vectors, so its "order" is the | ||
| 78 | // same as the number of vectors | ||
| 79 | 4614192 | constexpr size_t rotate_order_b64 = num_of_lanes; | |
| 80 | |||
| 81 | 9228384 | rotate_vectors_recursively<buffer_size, rotate_order_b64>(trn_result_b64, | |
| 82 | 4614192 | src_rows); | |
| 83 | |||
| 84 | KLEIDICV_FORCE_LOOP_UNROLL | ||
| 85 |
8/8✓ Branch 0 taken 54336 times.
✓ Branch 1 taken 869376 times.
✓ Branch 2 taken 217296 times.
✓ Branch 3 taken 1738368 times.
✓ Branch 4 taken 868704 times.
✓ Branch 5 taken 3474816 times.
✓ Branch 6 taken 3473856 times.
✓ Branch 7 taken 6947712 times.
|
17644464 | for (size_t index = 0; index < buffer_size; ++index) { |
| 86 | 13030272 | trn_result_b64[index] = vreinterpretq_u64( | |
| 87 | 13030272 | vrev64q(reinterpret_cast<SrcVectorType>(trn_result_b64[index]))); | |
| 88 | 26060544 | trn_result_b64[index] = vcombine(vget_high(trn_result_b64[index]), | |
| 89 | 13030272 | vget_low(trn_result_b64[index])); | |
| 90 | 13030272 | vst1q(&dst_rows.at(index)[0], trn_result_b64[index]); | |
| 91 | 13030272 | } | |
| 92 | 4614192 | } | |
| 93 | |||
| 94 | template <typename ScalarType> | ||
| 95 | 76 | static void scalar_path(Rows<const ScalarType> src_rows, | |
| 96 | Rows<ScalarType> dst_rows, size_t height, | ||
| 97 | size_t width) { | ||
| 98 |
8/8✓ Branch 0 taken 40 times.
✓ Branch 1 taken 568 times.
✓ Branch 2 taken 12 times.
✓ Branch 3 taken 12 times.
✓ Branch 4 taken 12 times.
✓ Branch 5 taken 12 times.
✓ Branch 6 taken 12 times.
✓ Branch 7 taken 12 times.
|
680 | for (size_t vindex = 0; vindex < height; ++vindex) { |
| 99 | 604 | disable_loop_vectorization(); | |
| 100 |
8/8✓ Branch 0 taken 8820 times.
✓ Branch 1 taken 568 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 12 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 12 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 12 times.
|
9448 | for (size_t hindex = 0; hindex < width; ++hindex) { |
| 101 | 8844 | disable_loop_vectorization(); | |
| 102 | // dst[j][src_height - i - 1] = src[i][j] | ||
| 103 | 8844 | dst_rows.at(hindex)[height - vindex - 1] = src_rows.at(vindex)[hindex]; | |
| 104 | 8844 | } | |
| 105 | 604 | } | |
| 106 | 76 | } | |
| 107 | |||
| 108 | template <typename ScalarType> | ||
| 109 | 632 | static kleidicv_error_t rotate(Rectangle rect, Rows<const ScalarType> src_rows, | |
| 110 | Rows<ScalarType> dst_rows) { | ||
| 111 | 632 | constexpr size_t num_of_lanes = VecTraits<ScalarType>::num_lanes(); | |
| 112 | 37616 | auto handle_lane_number_of_rows = [&](size_t vindex) { | |
| 113 | 36984 | LoopUnroll2<TryToAvoidTailLoop> horizontal_loop(rect.width(), num_of_lanes); | |
| 114 | |||
| 115 | 4651176 | horizontal_loop.unroll_once([&](size_t hindex) { | |
| 116 | // if the input is big enough handle it tile by tile | ||
| 117 | 4614192 | vector_path<ScalarType>( | |
| 118 | 4614192 | src_rows.at(vindex, hindex), | |
| 119 | 4614192 | dst_rows.at(hindex, rect.height() - vindex - num_of_lanes)); | |
| 120 | 4614192 | }); | |
| 121 | |||
| 122 | // This branch is needed even for TryToAvoidTailLoop | ||
| 123 | 37008 | horizontal_loop.remaining([&](size_t hindex, size_t final_hindex) { | |
| 124 | 48 | scalar_path(src_rows.at(vindex, hindex), | |
| 125 | 24 | dst_rows.at(hindex, rect.height() - vindex - num_of_lanes), | |
| 126 | 24 | num_of_lanes, final_hindex - hindex); | |
| 127 | 24 | }); | |
| 128 | 36984 | }; | |
| 129 | |||
| 130 | 632 | LoopUnroll2<TryToAvoidTailLoop> vertical_loop(rect.height(), num_of_lanes); | |
| 131 | |||
| 132 | 632 | vertical_loop.unroll_once(handle_lane_number_of_rows); | |
| 133 | |||
| 134 | 684 | vertical_loop.remaining([&](size_t vindex, size_t final_vindex) { | |
| 135 | 104 | scalar_path(src_rows.at(vindex), dst_rows.at(0, 0), final_vindex - vindex, | |
| 136 | 52 | rect.width()); | |
| 137 | 52 | }); | |
| 138 | 632 | return KLEIDICV_OK; | |
| 139 | 632 | } | |
| 140 | |||
| 141 | template <typename T> | ||
| 142 | 744 | static kleidicv_error_t rotate(const void *src_void, size_t src_stride, | |
| 143 | size_t src_width, size_t src_height, | ||
| 144 | void *dst_void, size_t dst_stride) { | ||
| 145 |
6/6✓ Branch 0 taken 4 times.
✓ Branch 1 taken 188 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 188 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 188 times.
|
744 | MAKE_POINTER_CHECK_ALIGNMENT(const T, src, src_void); |
| 146 |
6/6✓ Branch 0 taken 4 times.
✓ Branch 1 taken 184 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 184 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 184 times.
|
732 | MAKE_POINTER_CHECK_ALIGNMENT(T, dst, dst_void); |
| 147 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 164 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 164 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 176 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 176 times.
✓ Branch 8 taken 8 times.
✓ Branch 9 taken 176 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 176 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 176 times.
✓ Branch 14 taken 8 times.
✓ Branch 15 taken 176 times.
|
720 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
| 148 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 160 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 160 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 168 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 168 times.
✓ Branch 8 taken 8 times.
✓ Branch 9 taken 168 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 168 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 168 times.
✓ Branch 14 taken 8 times.
✓ Branch 15 taken 168 times.
|
692 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, src_width); |
| 149 |
24/24✓ Branch 0 taken 4 times.
✓ Branch 1 taken 156 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 152 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 152 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 164 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 160 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 160 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 164 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 160 times.
✓ Branch 16 taken 8 times.
✓ Branch 17 taken 160 times.
✓ Branch 18 taken 4 times.
✓ Branch 19 taken 164 times.
✓ Branch 20 taken 4 times.
✓ Branch 21 taken 160 times.
✓ Branch 22 taken 8 times.
✓ Branch 23 taken 160 times.
|
664 | CHECK_IMAGE_SIZE(src_width, src_height); |
| 150 | |||
| 151 | 632 | Rectangle rect{src_width, src_height}; | |
| 152 | 632 | Rows<T> dst_rows{dst, dst_stride}; | |
| 153 | 632 | Rows<const T> src_rows{src, src_stride}; | |
| 154 | |||
| 155 | 632 | return rotate(rect, src_rows, dst_rows); | |
| 156 | 744 | } | |
| 157 | |||
| 158 | KLEIDICV_TARGET_FN_ATTRS | ||
| 159 | 756 | kleidicv_error_t rotate(const void *src, size_t src_stride, size_t src_width, | |
| 160 | size_t src_height, void *dst, size_t dst_stride, | ||
| 161 | int angle, size_t element_size) { | ||
| 162 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 744 times.
|
756 | if (!rotate_is_implemented(src, dst, angle, element_size)) { |
| 163 | 12 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 164 | } | ||
| 165 | |||
| 166 |
4/5✗ Branch 0 not taken.
✓ Branch 1 taken 168 times.
✓ Branch 2 taken 192 times.
✓ Branch 3 taken 192 times.
✓ Branch 4 taken 192 times.
|
744 | switch (element_size) { |
| 167 | case sizeof(uint8_t): | ||
| 168 | 336 | return rotate<uint8_t>(src, src_stride, src_width, src_height, dst, | |
| 169 | 168 | dst_stride); | |
| 170 | case sizeof(uint16_t): | ||
| 171 | 384 | return rotate<uint16_t>(src, src_stride, src_width, src_height, dst, | |
| 172 | 192 | dst_stride); | |
| 173 | case sizeof(uint32_t): | ||
| 174 | 384 | return rotate<uint32_t>(src, src_stride, src_width, src_height, dst, | |
| 175 | 192 | dst_stride); | |
| 176 | case sizeof(uint64_t): | ||
| 177 | 384 | return rotate<uint64_t>(src, src_stride, src_width, src_height, dst, | |
| 178 | 192 | dst_stride); | |
| 179 | // GCOVR_EXCL_START | ||
| 180 | default: | ||
| 181 | assert(!"element size not implemented"); | ||
| 182 | − | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 183 | // GCOVR_EXCL_STOP | ||
| 184 | } | ||
| 185 | 756 | } | |
| 186 | |||
| 187 | } // namespace kleidicv::neon | ||
| 188 |