| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include <cmath> | ||
| 6 | |||
| 7 | #include "kleidicv/conversions/float_conversion.h" | ||
| 8 | #include "kleidicv/neon.h" | ||
| 9 | |||
| 10 | namespace kleidicv::neon { | ||
| 11 | |||
| 12 | template <typename InputType, typename OutputType> | ||
| 13 | class float_conversion_operation; | ||
| 14 | |||
| 15 | template <typename OutputType> | ||
| 16 | class float_conversion_operation<float, OutputType> { | ||
| 17 | public: | ||
| 18 | using SrcVecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<float>; | ||
| 19 | using SrcVectorType = typename SrcVecTraits::VectorType; | ||
| 20 | using IntermediateVecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits< | ||
| 21 | std::conditional_t<std::is_signed_v<OutputType>, int32_t, uint32_t>>; | ||
| 22 | using IntermediateVectorType = typename IntermediateVecTraits::VectorType; | ||
| 23 | |||
| 24 | 256 | void process_row(size_t width, Columns<const float> src, | |
| 25 | Columns<OutputType> dst) { | ||
| 26 | 512 | LoopUnroll{width, SrcVecTraits::num_lanes()} | |
| 27 | 2158 | .unroll_twice([&](size_t step) { | |
| 28 | 1902 | SrcVectorType src_vector1 = vld1q_f32(&src[0]); | |
| 29 | 3804 | SrcVectorType src_vector2 = | |
| 30 | 1902 | vld1q_f32(&src[SrcVecTraits::num_lanes()]); | |
| 31 | 3804 | IntermediateVectorType result_vector1 = | |
| 32 | 1902 | vector_path<OutputType>(src_vector1); | |
| 33 | 3804 | IntermediateVectorType result_vector2 = | |
| 34 | 1902 | vector_path<OutputType>(src_vector2); | |
| 35 | 3804 | vst1(&dst[0], vqmovn(vcombine(vqmovn(result_vector1), | |
| 36 | 1902 | vqmovn(result_vector2)))); | |
| 37 | 1902 | src += ptrdiff_t(step); | |
| 38 | 1902 | dst += ptrdiff_t(step); | |
| 39 | 1902 | }) | |
| 40 | 408 | .remaining([&](size_t length, size_t) { | |
| 41 |
4/4✓ Branch 0 taken 76 times.
✓ Branch 1 taken 315 times.
✓ Branch 2 taken 76 times.
✓ Branch 3 taken 315 times.
|
782 | for (size_t index = 0; index < length; ++index) { |
| 42 | 630 | disable_loop_vectorization(); | |
| 43 | 630 | float f = std::nearbyint(src[ptrdiff_t(index)]); | |
| 44 |
4/4✓ Branch 0 taken 23 times.
✓ Branch 1 taken 292 times.
✓ Branch 2 taken 23 times.
✓ Branch 3 taken 292 times.
|
630 | if (f > std::numeric_limits<OutputType>::max()) { |
| 45 | 46 | f = std::numeric_limits<OutputType>::max(); | |
| 46 |
4/4✓ Branch 0 taken 159 times.
✓ Branch 1 taken 133 times.
✓ Branch 2 taken 157 times.
✓ Branch 3 taken 135 times.
|
630 | } else if (f < std::numeric_limits<OutputType>::lowest()) { |
| 47 | 268 | f = std::numeric_limits<OutputType>::lowest(); | |
| 48 | 268 | } | |
| 49 | 630 | dst[index] = static_cast<OutputType>(f); | |
| 50 | 630 | } | |
| 51 | 152 | }); | |
| 52 | 256 | } | |
| 53 | |||
| 54 | private: | ||
| 55 | template < | ||
| 56 | typename O, | ||
| 57 | std::enable_if_t<std::is_integral_v<O> && std::is_signed_v<O>, int> = 0> | ||
| 58 | 1902 | IntermediateVectorType vector_path(SrcVectorType src) { | |
| 59 | 1902 | IntermediateVectorType result = vcvtnq_s32_f32(src); | |
| 60 | 3804 | return result; | |
| 61 | 1902 | } | |
| 62 | |||
| 63 | template < | ||
| 64 | typename O, | ||
| 65 | std::enable_if_t<std::is_integral_v<O> && !std::is_signed_v<O>, int> = 0> | ||
| 66 | 1902 | IntermediateVectorType vector_path(SrcVectorType src) { | |
| 67 | 1902 | IntermediateVectorType result = vcvtnq_u32_f32(src); | |
| 68 | 3804 | return result; | |
| 69 | 1902 | } | |
| 70 | }; // end of class float_conversion_operation<float, OutputType> | ||
| 71 | |||
| 72 | template <typename InputType, typename OutputType> | ||
| 73 | 160 | kleidicv_error_t float_conversion(const InputType* src, size_t src_stride, | |
| 74 | OutputType* dst, size_t dst_stride, | ||
| 75 | size_t width, size_t height) { | ||
| 76 |
8/8✓ Branch 0 taken 1 times.
✓ Branch 1 taken 79 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 79 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 79 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 79 times.
|
160 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
| 77 |
8/8✓ Branch 0 taken 1 times.
✓ Branch 1 taken 78 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 78 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 78 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 78 times.
|
158 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 78 |
12/12✓ Branch 0 taken 1 times.
✓ Branch 1 taken 77 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 75 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 75 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 77 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 75 times.
✓ Branch 10 taken 3 times.
✓ Branch 11 taken 75 times.
|
156 | CHECK_IMAGE_SIZE(width, height); |
| 79 | |||
| 80 | 150 | float_conversion_operation<InputType, OutputType> operation; | |
| 81 | 150 | Rectangle rect{width, height}; | |
| 82 | 150 | Rows<const InputType> src_rows{src, src_stride}; | |
| 83 | 150 | Rows<OutputType> dst_rows{dst, dst_stride}; | |
| 84 | 150 | zip_rows(operation, rect, src_rows, dst_rows); | |
| 85 | |||
| 86 | 150 | return KLEIDICV_OK; | |
| 87 | 160 | } | |
| 88 | |||
| 89 | 80 | kleidicv_error_t f32_to_s8(const float* src, size_t src_stride, int8_t* dst, | |
| 90 | size_t dst_stride, size_t width, size_t height) { | ||
| 91 | 80 | return float_conversion(src, src_stride, dst, dst_stride, width, height); | |
| 92 | } | ||
| 93 | |||
| 94 | 80 | kleidicv_error_t f32_to_u8(const float* src, size_t src_stride, uint8_t* dst, | |
| 95 | size_t dst_stride, size_t width, size_t height) { | ||
| 96 | 80 | return float_conversion(src, src_stride, dst, dst_stride, width, height); | |
| 97 | } | ||
| 98 | |||
| 99 | KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t | ||
| 100 | 80 | s8_to_f32(const int8_t* src, size_t src_stride, float* dst, size_t dst_stride, | |
| 101 | size_t width, size_t height) { | ||
| 102 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 79 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 79 times.
|
80 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
| 103 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 78 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 78 times.
|
79 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 104 |
6/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 77 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 75 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 75 times.
|
78 | CHECK_IMAGE_SIZE(width, height); |
| 105 | |||
| 106 | // Indices used with the TBL instruction to widen from 8-bit to 32-bit in a | ||
| 107 | // single instruction. | ||
| 108 | 150 | const uint8x16_t index0 = vcombine_u8(vcreate_u8(0x01ffffff00ffffffULL), | |
| 109 | 75 | vcreate_u8(0x03ffffff02ffffffULL)); | |
| 110 | 150 | const uint8x16_t index1 = vcombine_u8(vcreate_u8(0x05ffffff04ffffffULL), | |
| 111 | 75 | vcreate_u8(0x07ffffff06ffffffULL)); | |
| 112 | 150 | const uint8x16_t index2 = vcombine_u8(vcreate_u8(0x09ffffff08ffffffULL), | |
| 113 | 75 | vcreate_u8(0x0bffffff0affffffULL)); | |
| 114 | 150 | const uint8x16_t index3 = vcombine_u8(vcreate_u8(0x0dffffff0cffffffULL), | |
| 115 | 75 | vcreate_u8(0x0fffffff0effffffULL)); | |
| 116 |
2/2✓ Branch 0 taken 1495 times.
✓ Branch 1 taken 75 times.
|
1570 | for (size_t y = 0; y != height; ++y) { |
| 117 | 1495 | size_t x = 0; | |
| 118 |
2/2✓ Branch 0 taken 156 times.
✓ Branch 1 taken 1495 times.
|
1651 | for (; x + 16 <= width; x += 16) { |
| 119 | 156 | int8x16_t input = vld1q(src + x); | |
| 120 | // Widen from 8-bit to 32-bit and shift right 24 bits instead of | ||
| 121 | // sign-extending. | ||
| 122 | 156 | int32x4_t a = vreinterpretq_s32_s8(vqtbl1q_s8(input, index0)); | |
| 123 | 156 | int32x4_t b = vreinterpretq_s32_s8(vqtbl1q_s8(input, index1)); | |
| 124 | 156 | int32x4_t c = vreinterpretq_s32_s8(vqtbl1q_s8(input, index2)); | |
| 125 | 156 | int32x4_t d = vreinterpretq_s32_s8(vqtbl1q_s8(input, index3)); | |
| 126 | // Convert to float and divide by 2^24. | ||
| 127 | |||
| 128 | 156 | float32x4x4_t output = { | |
| 129 | 624 | vcvtq_n_f32_s32(a, 24), | |
| 130 | 156 | vcvtq_n_f32_s32(b, 24), | |
| 131 | 156 | vcvtq_n_f32_s32(c, 24), | |
| 132 | 156 | vcvtq_n_f32_s32(d, 24), | |
| 133 | }; | ||
| 134 | 156 | neon::VecTraits<float>::store(output, dst + x); | |
| 135 | 156 | } | |
| 136 |
2/2✓ Branch 0 taken 5400 times.
✓ Branch 1 taken 1495 times.
|
6895 | for (; x != width; ++x) { |
| 137 | 5400 | disable_loop_vectorization(); | |
| 138 | 5400 | dst[x] = src[x]; | |
| 139 | 5400 | } | |
| 140 | |||
| 141 | 1495 | src += static_cast<ptrdiff_t>(src_stride / sizeof(*src)); | |
| 142 | 1495 | dst += static_cast<ptrdiff_t>(dst_stride / sizeof(*dst)); | |
| 143 | 1495 | } | |
| 144 | 75 | return KLEIDICV_OK; | |
| 145 | 80 | } | |
| 146 | |||
| 147 | KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t | ||
| 148 | 80 | u8_to_f32(const uint8_t* src, size_t src_stride, float* dst, size_t dst_stride, | |
| 149 | size_t width, size_t height) { | ||
| 150 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 79 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 79 times.
|
80 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
| 151 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 78 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 78 times.
|
79 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 152 |
6/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 77 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 75 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 75 times.
|
78 | CHECK_IMAGE_SIZE(width, height); |
| 153 | |||
| 154 | // Indices used with the TBL instruction to widen from 8-bit to 32-bit in a | ||
| 155 | // single instruction. | ||
| 156 | 150 | const uint8x16_t index0 = vcombine_u8(vcreate_u8(0xffffff01ffffff00ULL), | |
| 157 | 75 | vcreate_u8(0xffffff03ffffff02ULL)); | |
| 158 | 150 | const uint8x16_t index1 = vcombine_u8(vcreate_u8(0xffffff05ffffff04ULL), | |
| 159 | 75 | vcreate_u8(0xffffff07ffffff06ULL)); | |
| 160 | 150 | const uint8x16_t index2 = vcombine_u8(vcreate_u8(0xffffff09ffffff08ULL), | |
| 161 | 75 | vcreate_u8(0xffffff0bffffff0aULL)); | |
| 162 | 150 | const uint8x16_t index3 = vcombine_u8(vcreate_u8(0xffffff0dffffff0cULL), | |
| 163 | 75 | vcreate_u8(0xffffff0fffffff0eULL)); | |
| 164 |
2/2✓ Branch 0 taken 1495 times.
✓ Branch 1 taken 75 times.
|
1570 | for (size_t y = 0; y != height; ++y) { |
| 165 | 1495 | size_t x = 0; | |
| 166 |
2/2✓ Branch 0 taken 156 times.
✓ Branch 1 taken 1495 times.
|
1651 | for (; x + 16 <= width; x += 16) { |
| 167 | 156 | uint8x16_t input = vld1q(src + x); | |
| 168 | // Widen from 8-bit to 32-bit | ||
| 169 | 156 | uint32x4_t a = vreinterpretq_u32_u8(vqtbl1q_u8(input, index0)); | |
| 170 | 156 | uint32x4_t b = vreinterpretq_u32_u8(vqtbl1q_u8(input, index1)); | |
| 171 | 156 | uint32x4_t c = vreinterpretq_u32_u8(vqtbl1q_u8(input, index2)); | |
| 172 | 156 | uint32x4_t d = vreinterpretq_u32_u8(vqtbl1q_u8(input, index3)); | |
| 173 | |||
| 174 | 156 | float32x4x4_t output = { | |
| 175 | 624 | vcvtq_f32_u32(a), | |
| 176 | 156 | vcvtq_f32_u32(b), | |
| 177 | 156 | vcvtq_f32_u32(c), | |
| 178 | 156 | vcvtq_f32_u32(d), | |
| 179 | }; | ||
| 180 | 156 | neon::VecTraits<float>::store(output, dst + x); | |
| 181 | 156 | } | |
| 182 |
2/2✓ Branch 0 taken 5400 times.
✓ Branch 1 taken 1495 times.
|
6895 | for (; x != width; ++x) { |
| 183 | 5400 | disable_loop_vectorization(); | |
| 184 | 5400 | dst[x] = src[x]; | |
| 185 | 5400 | } | |
| 186 | |||
| 187 | 1495 | src += static_cast<ptrdiff_t>(src_stride / sizeof(*src)); | |
| 188 | 1495 | dst += static_cast<ptrdiff_t>(dst_stride / sizeof(*dst)); | |
| 189 | 1495 | } | |
| 190 | 75 | return KLEIDICV_OK; | |
| 191 | 80 | } | |
| 192 | |||
| 193 | } // namespace kleidicv::neon | ||
| 194 |