| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include "kleidicv/arithmetics/transpose.h" | ||
| 6 | #include "kleidicv/kleidicv.h" | ||
| 7 | #include "kleidicv/neon.h" | ||
| 8 | |||
| 9 | namespace kleidicv::neon { | ||
| 10 | |||
| 11 | template <const size_t BufferSize, const size_t Order, typename DstVectorType, | ||
| 12 | typename SrcType> | ||
| 13 | 27200 | static void transpose_vectors_recursively(DstVectorType *dst_vectors, | |
| 14 | Rows<const SrcType> src_rows) { | ||
| 15 | // order is halved at every recursive call, once it is 2 the recursion should | ||
| 16 | // stop and the input data needs to be read. | ||
| 17 | if constexpr (Order == 2) { | ||
| 18 | KLEIDICV_FORCE_LOOP_UNROLL | ||
| 19 |
8/8✓ Branch 0 taken 2720 times.
✓ Branch 1 taken 21760 times.
✓ Branch 2 taken 2720 times.
✓ Branch 3 taken 10880 times.
✓ Branch 4 taken 2720 times.
✓ Branch 5 taken 5440 times.
✓ Branch 6 taken 2720 times.
✓ Branch 7 taken 2720 times.
|
51680 | for (size_t index = 0; index < BufferSize; index += Order) { |
| 20 | using SrcVectorType = typename VecTraits<SrcType>::VectorType; | ||
| 21 | 40800 | SrcVectorType src_vector[2]; | |
| 22 | |||
| 23 | 40800 | src_vector[0] = vld1q(&src_rows.at(index + 0)[0]); | |
| 24 | 40800 | src_vector[1] = vld1q(&src_rows.at(index + 1)[0]); | |
| 25 | |||
| 26 | // If order is 2 then SrcVectorType is the same as DstVectorType | ||
| 27 | 40800 | dst_vectors[index + 0] = vtrn1q(src_vector[0], src_vector[1]); | |
| 28 | 40800 | dst_vectors[index + 1] = vtrn2q(src_vector[0], src_vector[1]); | |
| 29 | 40800 | } | |
| 30 | } else { | ||
| 31 | // First we need to create the input for the current transpose stage, which | ||
| 32 | // is the output of the previous stage. The previous stage transposes | ||
| 33 | // elements half the size of the current stage and its order is also half of | ||
| 34 | // the current one. | ||
| 35 | 16320 | half_element_width_t<DstVectorType> input[BufferSize]; | |
| 36 | 16320 | constexpr size_t previous_order = Order / 2; | |
| 37 | |||
| 38 | 16320 | transpose_vectors_recursively<BufferSize, previous_order>(input, src_rows); | |
| 39 | |||
| 40 | 16320 | constexpr size_t half_order = Order / 2; | |
| 41 | |||
| 42 | KLEIDICV_FORCE_LOOP_UNROLL | ||
| 43 |
12/12✓ Branch 0 taken 2720 times.
✓ Branch 1 taken 2720 times.
✓ Branch 2 taken 2720 times.
✓ Branch 3 taken 5440 times.
✓ Branch 4 taken 2720 times.
✓ Branch 5 taken 10880 times.
✓ Branch 6 taken 2720 times.
✓ Branch 7 taken 2720 times.
✓ Branch 8 taken 2720 times.
✓ Branch 9 taken 5440 times.
✓ Branch 10 taken 2720 times.
✓ Branch 11 taken 2720 times.
|
46240 | for (size_t outer_i = 0; outer_i < BufferSize; outer_i += Order) { |
| 44 | KLEIDICV_FORCE_LOOP_UNROLL | ||
| 45 |
12/12✓ Branch 0 taken 21760 times.
✓ Branch 1 taken 2720 times.
✓ Branch 2 taken 21760 times.
✓ Branch 3 taken 5440 times.
✓ Branch 4 taken 21760 times.
✓ Branch 5 taken 10880 times.
✓ Branch 6 taken 10880 times.
✓ Branch 7 taken 2720 times.
✓ Branch 8 taken 10880 times.
✓ Branch 9 taken 5440 times.
✓ Branch 10 taken 5440 times.
✓ Branch 11 taken 2720 times.
|
122400 | for (size_t inner_i = 0; inner_i < half_order; ++inner_i) { |
| 46 | 92480 | dst_vectors[outer_i + inner_i] = | |
| 47 | 184960 | vtrn1q(reinterpret_cast<DstVectorType>(input[outer_i + inner_i]), | |
| 48 | 92480 | reinterpret_cast<DstVectorType>( | |
| 49 | 92480 | input[outer_i + inner_i + half_order])); | |
| 50 | |||
| 51 | 92480 | dst_vectors[outer_i + half_order + inner_i] = | |
| 52 | 184960 | vtrn2q(reinterpret_cast<DstVectorType>(input[outer_i + inner_i]), | |
| 53 | 92480 | reinterpret_cast<DstVectorType>( | |
| 54 | 92480 | input[outer_i + inner_i + half_order])); | |
| 55 | 92480 | } | |
| 56 | 29920 | } | |
| 57 | 16320 | } | |
| 58 | 27200 | } | |
| 59 | |||
| 60 | // Transposes one tile of data with vector instructions. The tile's width and | ||
| 61 | // height are the number of NEON lanes for the given type. | ||
| 62 | template <typename ScalarType> | ||
| 63 | 10880 | static void vector_path(Rows<const ScalarType> src_rows, | |
| 64 | Rows<ScalarType> dst_rows) { | ||
| 65 | 10880 | constexpr size_t num_of_lanes = VecTraits<ScalarType>::num_lanes(); | |
| 66 | |||
| 67 | // The number of vectors read and written is the same as the lane count of the | ||
| 68 | // given element size | ||
| 69 | 10880 | constexpr size_t buffer_size = num_of_lanes; | |
| 70 | |||
| 71 | // Last transpose step is always done on 64 bit elements | ||
| 72 | 10880 | uint64x2_t trn_result_b64[buffer_size]; // NOLINT(runtime/arrays) | |
| 73 | |||
| 74 | // The 64 bit transpose spans through all the vectors, so its "order" is the | ||
| 75 | // same as the number of vectors | ||
| 76 | 10880 | constexpr size_t transpose_order_b64 = num_of_lanes; | |
| 77 | |||
| 78 | 10880 | transpose_vectors_recursively<buffer_size, transpose_order_b64>( | |
| 79 | 10880 | trn_result_b64, src_rows); | |
| 80 | |||
| 81 | KLEIDICV_FORCE_LOOP_UNROLL | ||
| 82 |
8/8✓ Branch 0 taken 2720 times.
✓ Branch 1 taken 43520 times.
✓ Branch 2 taken 2720 times.
✓ Branch 3 taken 21760 times.
✓ Branch 4 taken 2720 times.
✓ Branch 5 taken 10880 times.
✓ Branch 6 taken 2720 times.
✓ Branch 7 taken 5440 times.
|
92480 | for (size_t index = 0; index < buffer_size; ++index) { |
| 83 | 81600 | vst1q(&dst_rows.at(index)[0], trn_result_b64[index]); | |
| 84 | 81600 | } | |
| 85 | 10880 | } | |
| 86 | |||
| 87 | template <typename ScalarType> | ||
| 88 | 76 | static void scalar_path(Rows<const ScalarType> src_rows, | |
| 89 | Rows<ScalarType> dst_rows, size_t height, | ||
| 90 | size_t width) { | ||
| 91 |
8/8✓ Branch 0 taken 16 times.
✓ Branch 1 taken 192 times.
✓ Branch 2 taken 20 times.
✓ Branch 3 taken 100 times.
✓ Branch 4 taken 20 times.
✓ Branch 5 taken 52 times.
✓ Branch 6 taken 20 times.
✓ Branch 7 taken 28 times.
|
448 | for (size_t vindex = 0; vindex < height; ++vindex) { |
| 92 | 372 | disable_loop_vectorization(); | |
| 93 |
8/8✓ Branch 0 taken 2940 times.
✓ Branch 1 taken 192 times.
✓ Branch 2 taken 704 times.
✓ Branch 3 taken 100 times.
✓ Branch 4 taken 160 times.
✓ Branch 5 taken 52 times.
✓ Branch 6 taken 32 times.
✓ Branch 7 taken 28 times.
|
4208 | for (size_t hindex = 0; hindex < width; ++hindex) { |
| 94 | 3836 | disable_loop_vectorization(); | |
| 95 | 3836 | dst_rows.at(hindex)[vindex] = src_rows.at(vindex)[hindex]; | |
| 96 | 3836 | } | |
| 97 | 372 | } | |
| 98 | 76 | } | |
| 99 | |||
| 100 | template <typename ScalarType> | ||
| 101 | 204 | static kleidicv_error_t transpose(Rectangle rect, | |
| 102 | Rows<const ScalarType> src_rows, | ||
| 103 | Rows<ScalarType> dst_rows) { | ||
| 104 | 204 | constexpr size_t num_of_lanes = VecTraits<ScalarType>::num_lanes(); | |
| 105 | |||
| 106 | 1132 | auto handle_lane_number_of_rows = [&](size_t vindex) { | |
| 107 | 928 | LoopUnroll2<TryToAvoidTailLoop> horizontal_loop(rect.width(), num_of_lanes); | |
| 108 | |||
| 109 | 8576 | horizontal_loop.unroll_once([&](size_t hindex) { | |
| 110 | // if the input is big enough handle it tile by tile | ||
| 111 | 15296 | vector_path<ScalarType>(src_rows.at(vindex, hindex), | |
| 112 | 7648 | dst_rows.at(hindex, vindex)); | |
| 113 | 7648 | }); | |
| 114 | |||
| 115 | 960 | horizontal_loop.remaining([&](size_t hindex, size_t final_hindex) { | |
| 116 | 64 | scalar_path(src_rows.at(vindex, hindex), dst_rows.at(hindex, vindex), | |
| 117 | 32 | num_of_lanes, final_hindex - hindex); | |
| 118 | 32 | }); | |
| 119 | 928 | }; | |
| 120 | |||
| 121 | 204 | LoopUnroll2<TryToAvoidTailLoop> vertical_loop(rect.height(), num_of_lanes); | |
| 122 | |||
| 123 | 204 | vertical_loop.unroll_once(handle_lane_number_of_rows); | |
| 124 | |||
| 125 | 248 | vertical_loop.remaining([&](size_t hindex, size_t final_hindex) { | |
| 126 | 88 | scalar_path(src_rows.at(hindex), dst_rows.at(0, hindex), | |
| 127 | 44 | final_hindex - hindex, rect.width()); | |
| 128 | 44 | }); | |
| 129 | 204 | return KLEIDICV_OK; | |
| 130 | 204 | } | |
| 131 | |||
| 132 | template <typename ScalarType> | ||
| 133 | 96 | static kleidicv_error_t transpose(Rectangle rect, Rows<ScalarType> data_rows) { | |
| 134 | 96 | constexpr size_t num_of_lanes = VecTraits<ScalarType>::num_lanes(); | |
| 135 | |||
| 136 | // rect.width() needs to be equal to rect.height() | ||
| 137 | 96 | LoopUnroll2 outer_loop(rect.width(), num_of_lanes); | |
| 138 | |||
| 139 | 512 | outer_loop.unroll_once([&](size_t vindex) { | |
| 140 | // Handle tiles on the diagonal line | ||
| 141 | 832 | vector_path<ScalarType>(data_rows.at(vindex, vindex), | |
| 142 | 416 | data_rows.at(vindex, vindex)); | |
| 143 | |||
| 144 | // Handle the top right half | ||
| 145 |
8/8✓ Branch 0 taken 8 times.
✓ Branch 1 taken 96 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 96 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 96 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 96 times.
|
416 | if (rect.width() > (vindex + num_of_lanes)) { |
| 146 | // Indexes are running through only the top right half | ||
| 147 | 384 | LoopUnroll2 inner_loop(vindex + num_of_lanes, rect.width(), num_of_lanes); | |
| 148 | |||
| 149 | 1792 | inner_loop.unroll_once([&](size_t hindex) { | |
| 150 | // Allocate temporary memory for one tile | ||
| 151 | 1408 | ScalarType tmp[num_of_lanes * num_of_lanes]; // NOLINT(runtime/arrays) | |
| 152 | 1408 | Rows<ScalarType> tmp_rows{tmp, num_of_lanes * sizeof(ScalarType)}; | |
| 153 | |||
| 154 | // Transpose a tile from the top right area, save the result | ||
| 155 | // into temporary memory | ||
| 156 | 1408 | vector_path<ScalarType>(data_rows.at(vindex, hindex), tmp_rows); | |
| 157 | // Transpose its mirror tile from the left bottom area, save the | ||
| 158 | // result to its final space | ||
| 159 | 2816 | vector_path<ScalarType>(data_rows.at(hindex, vindex), | |
| 160 | 1408 | data_rows.at(vindex, hindex)); | |
| 161 | // Copy the temporary result to its final destination | ||
| 162 | 2816 | Rows<const ScalarType> const_tmp_rows{ | |
| 163 | 1408 | tmp, num_of_lanes * sizeof(ScalarType)}; | |
| 164 | 1408 | CopyNonOverlappingRows<ScalarType>::copy_rows( | |
| 165 | 1408 | Rectangle{num_of_lanes, num_of_lanes}, const_tmp_rows, | |
| 166 | 1408 | data_rows.at(hindex, vindex)); | |
| 167 | 1408 | }); | |
| 168 | |||
| 169 | 640 | inner_loop.remaining([&](size_t hindex, size_t final_hindex) { | |
| 170 | // As this is the unroll_once path of the outer_loop there is | ||
| 171 | // num_of_lanes worth of data in the vertical direction | ||
| 172 |
8/8✓ Branch 0 taken 64 times.
✓ Branch 1 taken 1024 times.
✓ Branch 2 taken 64 times.
✓ Branch 3 taken 512 times.
✓ Branch 4 taken 64 times.
✓ Branch 5 taken 256 times.
✓ Branch 6 taken 64 times.
✓ Branch 7 taken 128 times.
|
2176 | for (size_t i = vindex; i < vindex + num_of_lanes; ++i) { |
| 173 | 1920 | disable_loop_vectorization(); | |
| 174 |
8/8✓ Branch 0 taken 15360 times.
✓ Branch 1 taken 1024 times.
✓ Branch 2 taken 3584 times.
✓ Branch 3 taken 512 times.
✓ Branch 4 taken 768 times.
✓ Branch 5 taken 256 times.
✓ Branch 6 taken 128 times.
✓ Branch 7 taken 128 times.
|
21760 | for (size_t j = hindex; j < final_hindex; ++j) { |
| 175 | 19840 | disable_loop_vectorization(); | |
| 176 | 19840 | std::swap(data_rows.at(i)[j], data_rows.at(j)[i]); | |
| 177 | 19840 | } | |
| 178 | 1920 | } | |
| 179 | 256 | }); | |
| 180 | 384 | } | |
| 181 | 416 | }); | |
| 182 | |||
| 183 | 160 | outer_loop.remaining([&](size_t vindex, size_t final_vindex) { | |
| 184 |
8/8✓ Branch 0 taken 16 times.
✓ Branch 1 taken 240 times.
✓ Branch 2 taken 16 times.
✓ Branch 3 taken 112 times.
✓ Branch 4 taken 16 times.
✓ Branch 5 taken 48 times.
✓ Branch 6 taken 16 times.
✓ Branch 7 taken 16 times.
|
480 | for (size_t i = vindex; i < final_vindex; ++i) { |
| 185 | 416 | disable_loop_vectorization(); | |
| 186 | // Only the top right half pixels need to be indexed | ||
| 187 |
7/8✓ Branch 0 taken 1680 times.
✓ Branch 1 taken 240 times.
✓ Branch 2 taken 336 times.
✓ Branch 3 taken 112 times.
✓ Branch 4 taken 48 times.
✓ Branch 5 taken 48 times.
✗ Branch 6 not taken.
✓ Branch 7 taken 16 times.
|
2480 | for (size_t j = i + 1; j < final_vindex; ++j) { |
| 188 | 2064 | disable_loop_vectorization(); | |
| 189 | 2064 | std::swap(data_rows.at(i)[j], data_rows.at(j)[i]); | |
| 190 | 2064 | } | |
| 191 | 416 | } | |
| 192 | 64 | }); | |
| 193 | 96 | return KLEIDICV_OK; | |
| 194 | 96 | } | |
| 195 | |||
| 196 | template <typename T> | ||
| 197 | 428 | static kleidicv_error_t transpose(const void *src_void, size_t src_stride, | |
| 198 | void *dst_void, size_t dst_stride, | ||
| 199 | size_t src_width, size_t src_height) { | ||
| 200 |
6/6✓ Branch 0 taken 4 times.
✓ Branch 1 taken 108 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 108 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 108 times.
|
428 | MAKE_POINTER_CHECK_ALIGNMENT(const T, src, src_void); |
| 201 |
6/6✓ Branch 0 taken 4 times.
✓ Branch 1 taken 104 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 104 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 104 times.
|
416 | MAKE_POINTER_CHECK_ALIGNMENT(T, dst, dst_void); |
| 202 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 88 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 88 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 96 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 96 times.
✓ Branch 8 taken 8 times.
✓ Branch 9 taken 96 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 96 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 96 times.
✓ Branch 14 taken 8 times.
✓ Branch 15 taken 96 times.
|
404 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
| 203 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 84 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 84 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 88 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 88 times.
✓ Branch 8 taken 8 times.
✓ Branch 9 taken 88 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 88 times.
✓ Branch 12 taken 8 times.
✓ Branch 13 taken 88 times.
✓ Branch 14 taken 8 times.
✓ Branch 15 taken 88 times.
|
376 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, src_width); |
| 204 |
24/24✓ Branch 0 taken 4 times.
✓ Branch 1 taken 80 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 76 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 76 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 84 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 80 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 80 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 84 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 80 times.
✓ Branch 16 taken 8 times.
✓ Branch 17 taken 80 times.
✓ Branch 18 taken 4 times.
✓ Branch 19 taken 84 times.
✓ Branch 20 taken 4 times.
✓ Branch 21 taken 80 times.
✓ Branch 22 taken 8 times.
✓ Branch 23 taken 80 times.
|
348 | CHECK_IMAGE_SIZE(src_width, src_height); |
| 205 | |||
| 206 | 316 | Rectangle rect{src_width, src_height}; | |
| 207 | 316 | Rows<T> dst_rows{dst, dst_stride}; | |
| 208 | |||
| 209 |
8/8✓ Branch 0 taken 48 times.
✓ Branch 1 taken 28 times.
✓ Branch 2 taken 52 times.
✓ Branch 3 taken 28 times.
✓ Branch 4 taken 52 times.
✓ Branch 5 taken 28 times.
✓ Branch 6 taken 52 times.
✓ Branch 7 taken 28 times.
|
316 | if (src == dst) { |
| 210 |
8/8✓ Branch 0 taken 4 times.
✓ Branch 1 taken 24 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 24 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 24 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 24 times.
|
112 | if (src_width != src_height) { |
| 211 | // In-place transpose is only implemented if width and height are the same | ||
| 212 | 16 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 213 | } | ||
| 214 | 96 | return transpose(rect, dst_rows); | |
| 215 | } | ||
| 216 | 204 | Rows<const T> src_rows{src, src_stride}; | |
| 217 | 204 | return transpose(rect, src_rows, dst_rows); | |
| 218 | 428 | } | |
| 219 | |||
| 220 | KLEIDICV_TARGET_FN_ATTRS | ||
| 221 | 432 | kleidicv_error_t transpose(const void *src, size_t src_stride, void *dst, | |
| 222 | size_t dst_stride, size_t src_width, | ||
| 223 | size_t src_height, size_t element_size) { | ||
| 224 |
5/5✓ Branch 0 taken 112 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 92 times.
✓ Branch 3 taken 112 times.
✓ Branch 4 taken 112 times.
|
432 | switch (element_size) { |
| 225 | case sizeof(uint8_t): | ||
| 226 | 184 | return transpose<uint8_t>(src, src_stride, dst, dst_stride, src_width, | |
| 227 | 92 | src_height); | |
| 228 | case sizeof(uint16_t): | ||
| 229 | 224 | return transpose<uint16_t>(src, src_stride, dst, dst_stride, src_width, | |
| 230 | 112 | src_height); | |
| 231 | case sizeof(uint32_t): | ||
| 232 | 224 | return transpose<uint32_t>(src, src_stride, dst, dst_stride, src_width, | |
| 233 | 112 | src_height); | |
| 234 | case sizeof(uint64_t): | ||
| 235 | 224 | return transpose<uint64_t>(src, src_stride, dst, dst_stride, src_width, | |
| 236 | 112 | src_height); | |
| 237 | default: | ||
| 238 | 4 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 239 | } | ||
| 240 | 432 | } | |
| 241 | |||
| 242 | } // namespace kleidicv::neon | ||
| 243 |