Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #include <cmath> | ||
6 | |||
7 | #include "kleidicv/conversions/float_conversion.h" | ||
8 | #include "kleidicv/neon.h" | ||
9 | |||
10 | namespace kleidicv::neon { | ||
11 | |||
12 | template <typename InputType, typename OutputType> | ||
13 | class float_conversion_operation; | ||
14 | |||
15 | template <typename OutputType> | ||
16 | class float_conversion_operation<float, OutputType> { | ||
17 | public: | ||
18 | using SrcVecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<float>; | ||
19 | using SrcVectorType = typename SrcVecTraits::VectorType; | ||
20 | using IntermediateVecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits< | ||
21 | std::conditional_t<std::is_signed_v<OutputType>, int32_t, uint32_t>>; | ||
22 | using IntermediateVectorType = typename IntermediateVecTraits::VectorType; | ||
23 | |||
24 | 256 | void process_row(size_t width, Columns<const float> src, | |
25 | Columns<OutputType> dst) { | ||
26 | 512 | LoopUnroll{width, SrcVecTraits::num_lanes()} | |
27 | 2158 | .unroll_twice([&](size_t step) { | |
28 | 1902 | SrcVectorType src_vector1 = vld1q_f32(&src[0]); | |
29 | 3804 | SrcVectorType src_vector2 = | |
30 | 1902 | vld1q_f32(&src[SrcVecTraits::num_lanes()]); | |
31 | 3804 | IntermediateVectorType result_vector1 = | |
32 | 1902 | vector_path<OutputType>(src_vector1); | |
33 | 3804 | IntermediateVectorType result_vector2 = | |
34 | 1902 | vector_path<OutputType>(src_vector2); | |
35 | 3804 | vst1(&dst[0], vqmovn(vcombine(vqmovn(result_vector1), | |
36 | 1902 | vqmovn(result_vector2)))); | |
37 | 1902 | src += ptrdiff_t(step); | |
38 | 1902 | dst += ptrdiff_t(step); | |
39 | 1902 | }) | |
40 | 408 | .remaining([&](size_t length, size_t) { | |
41 |
4/4✓ Branch 0 taken 76 times.
✓ Branch 1 taken 315 times.
✓ Branch 2 taken 76 times.
✓ Branch 3 taken 315 times.
|
782 | for (size_t index = 0; index < length; ++index) { |
42 | 630 | disable_loop_vectorization(); | |
43 | 630 | float f = std::nearbyint(src[ptrdiff_t(index)]); | |
44 |
4/4✓ Branch 0 taken 31 times.
✓ Branch 1 taken 284 times.
✓ Branch 2 taken 31 times.
✓ Branch 3 taken 284 times.
|
630 | if (f > std::numeric_limits<OutputType>::max()) { |
45 | 62 | f = std::numeric_limits<OutputType>::max(); | |
46 |
4/4✓ Branch 0 taken 153 times.
✓ Branch 1 taken 131 times.
✓ Branch 2 taken 151 times.
✓ Branch 3 taken 133 times.
|
630 | } else if (f < std::numeric_limits<OutputType>::lowest()) { |
47 | 264 | f = std::numeric_limits<OutputType>::lowest(); | |
48 | 264 | } | |
49 | 630 | dst[index] = static_cast<OutputType>(f); | |
50 | 630 | } | |
51 | 152 | }); | |
52 | 256 | } | |
53 | |||
54 | private: | ||
55 | template < | ||
56 | typename O, | ||
57 | std::enable_if_t<std::is_integral_v<O> && std::is_signed_v<O>, int> = 0> | ||
58 | 1902 | IntermediateVectorType vector_path(SrcVectorType src) { | |
59 | 1902 | IntermediateVectorType result = vcvtnq_s32_f32(src); | |
60 | 3804 | return result; | |
61 | 1902 | } | |
62 | |||
63 | template < | ||
64 | typename O, | ||
65 | std::enable_if_t<std::is_integral_v<O> && !std::is_signed_v<O>, int> = 0> | ||
66 | 1902 | IntermediateVectorType vector_path(SrcVectorType src) { | |
67 | 1902 | IntermediateVectorType result = vcvtnq_u32_f32(src); | |
68 | 3804 | return result; | |
69 | 1902 | } | |
70 | }; // end of class float_conversion_operation<float, OutputType> | ||
71 | |||
72 | template <typename InputType, typename OutputType> | ||
73 | 160 | kleidicv_error_t float_conversion(const InputType* src, size_t src_stride, | |
74 | OutputType* dst, size_t dst_stride, | ||
75 | size_t width, size_t height) { | ||
76 |
8/8✓ Branch 0 taken 1 times.
✓ Branch 1 taken 79 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 79 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 79 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 79 times.
|
160 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
77 |
8/8✓ Branch 0 taken 1 times.
✓ Branch 1 taken 78 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 78 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 78 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 78 times.
|
158 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
78 |
12/12✓ Branch 0 taken 1 times.
✓ Branch 1 taken 77 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 75 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 75 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 77 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 75 times.
✓ Branch 10 taken 3 times.
✓ Branch 11 taken 75 times.
|
156 | CHECK_IMAGE_SIZE(width, height); |
79 | |||
80 | 150 | float_conversion_operation<InputType, OutputType> operation; | |
81 | 150 | Rectangle rect{width, height}; | |
82 | 150 | Rows<const InputType> src_rows{src, src_stride}; | |
83 | 150 | Rows<OutputType> dst_rows{dst, dst_stride}; | |
84 | 150 | zip_rows(operation, rect, src_rows, dst_rows); | |
85 | |||
86 | 150 | return KLEIDICV_OK; | |
87 | 160 | } | |
88 | |||
89 | 80 | kleidicv_error_t f32_to_s8(const float* src, size_t src_stride, int8_t* dst, | |
90 | size_t dst_stride, size_t width, size_t height) { | ||
91 | 80 | return float_conversion(src, src_stride, dst, dst_stride, width, height); | |
92 | } | ||
93 | |||
94 | 80 | kleidicv_error_t f32_to_u8(const float* src, size_t src_stride, uint8_t* dst, | |
95 | size_t dst_stride, size_t width, size_t height) { | ||
96 | 80 | return float_conversion(src, src_stride, dst, dst_stride, width, height); | |
97 | } | ||
98 | |||
99 | KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t | ||
100 | 80 | s8_to_f32(const int8_t* src, size_t src_stride, float* dst, size_t dst_stride, | |
101 | size_t width, size_t height) { | ||
102 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 79 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 79 times.
|
80 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
103 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 78 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 78 times.
|
79 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
104 |
6/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 77 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 75 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 75 times.
|
78 | CHECK_IMAGE_SIZE(width, height); |
105 | |||
106 | // Indices used with the TBL instruction to widen from 8-bit to 32-bit in a | ||
107 | // single instruction. | ||
108 | 150 | const uint8x16_t index0 = vcombine_u8(vcreate_u8(0x01ffffff00ffffffULL), | |
109 | 75 | vcreate_u8(0x03ffffff02ffffffULL)); | |
110 | 150 | const uint8x16_t index1 = vcombine_u8(vcreate_u8(0x05ffffff04ffffffULL), | |
111 | 75 | vcreate_u8(0x07ffffff06ffffffULL)); | |
112 | 150 | const uint8x16_t index2 = vcombine_u8(vcreate_u8(0x09ffffff08ffffffULL), | |
113 | 75 | vcreate_u8(0x0bffffff0affffffULL)); | |
114 | 150 | const uint8x16_t index3 = vcombine_u8(vcreate_u8(0x0dffffff0cffffffULL), | |
115 | 75 | vcreate_u8(0x0fffffff0effffffULL)); | |
116 |
2/2✓ Branch 0 taken 1495 times.
✓ Branch 1 taken 75 times.
|
1570 | for (size_t y = 0; y != height; ++y) { |
117 | 1495 | size_t x = 0; | |
118 |
2/2✓ Branch 0 taken 156 times.
✓ Branch 1 taken 1495 times.
|
1651 | for (; x + 16 <= width; x += 16) { |
119 | 156 | int8x16_t input = vld1q(src + x); | |
120 | // Widen from 8-bit to 32-bit, placing each byte in the top byte of its | ||
121 | // lane (i.e. shifted left 24 bits) instead of sign-extending. | ||
122 | 156 | int32x4_t a = vreinterpretq_s32_s8(vqtbl1q_s8(input, index0)); | |
123 | 156 | int32x4_t b = vreinterpretq_s32_s8(vqtbl1q_s8(input, index1)); | |
124 | 156 | int32x4_t c = vreinterpretq_s32_s8(vqtbl1q_s8(input, index2)); | |
125 | 156 | int32x4_t d = vreinterpretq_s32_s8(vqtbl1q_s8(input, index3)); | |
126 | // Convert to float and divide by 2^24. | ||
127 | |||
128 | 156 | float32x4x4_t output = { | |
129 | 624 | vcvtq_n_f32_s32(a, 24), | |
130 | 156 | vcvtq_n_f32_s32(b, 24), | |
131 | 156 | vcvtq_n_f32_s32(c, 24), | |
132 | 156 | vcvtq_n_f32_s32(d, 24), | |
133 | }; | ||
134 | 156 | neon::VecTraits<float>::store(output, dst + x); | |
135 | 156 | } | |
136 |
2/2✓ Branch 0 taken 5400 times.
✓ Branch 1 taken 1495 times.
|
6895 | for (; x != width; ++x) { |
137 | 5400 | disable_loop_vectorization(); | |
138 | 5400 | dst[x] = src[x]; | |
139 | 5400 | } | |
140 | |||
141 | 1495 | src += static_cast<ptrdiff_t>(src_stride / sizeof(*src)); | |
142 | 1495 | dst += static_cast<ptrdiff_t>(dst_stride / sizeof(*dst)); | |
143 | 1495 | } | |
144 | 75 | return KLEIDICV_OK; | |
145 | 80 | } | |
146 | |||
147 | KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t | ||
148 | 80 | u8_to_f32(const uint8_t* src, size_t src_stride, float* dst, size_t dst_stride, | |
149 | size_t width, size_t height) { | ||
150 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 79 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 79 times.
|
80 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
151 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 78 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 78 times.
|
79 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
152 |
6/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 77 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 75 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 75 times.
|
78 | CHECK_IMAGE_SIZE(width, height); |
153 | |||
154 | // Indices used with the TBL instruction to widen from 8-bit to 32-bit in a | ||
155 | // single instruction. | ||
156 | 150 | const uint8x16_t index0 = vcombine_u8(vcreate_u8(0xffffff01ffffff00ULL), | |
157 | 75 | vcreate_u8(0xffffff03ffffff02ULL)); | |
158 | 150 | const uint8x16_t index1 = vcombine_u8(vcreate_u8(0xffffff05ffffff04ULL), | |
159 | 75 | vcreate_u8(0xffffff07ffffff06ULL)); | |
160 | 150 | const uint8x16_t index2 = vcombine_u8(vcreate_u8(0xffffff09ffffff08ULL), | |
161 | 75 | vcreate_u8(0xffffff0bffffff0aULL)); | |
162 | 150 | const uint8x16_t index3 = vcombine_u8(vcreate_u8(0xffffff0dffffff0cULL), | |
163 | 75 | vcreate_u8(0xffffff0fffffff0eULL)); | |
164 |
2/2✓ Branch 0 taken 1495 times.
✓ Branch 1 taken 75 times.
|
1570 | for (size_t y = 0; y != height; ++y) { |
165 | 1495 | size_t x = 0; | |
166 |
2/2✓ Branch 0 taken 156 times.
✓ Branch 1 taken 1495 times.
|
1651 | for (; x + 16 <= width; x += 16) { |
167 | 156 | uint8x16_t input = vld1q(src + x); | |
168 | // Widen from 8-bit to 32-bit | ||
169 | 156 | uint32x4_t a = vreinterpretq_u32_u8(vqtbl1q_u8(input, index0)); | |
170 | 156 | uint32x4_t b = vreinterpretq_u32_u8(vqtbl1q_u8(input, index1)); | |
171 | 156 | uint32x4_t c = vreinterpretq_u32_u8(vqtbl1q_u8(input, index2)); | |
172 | 156 | uint32x4_t d = vreinterpretq_u32_u8(vqtbl1q_u8(input, index3)); | |
173 | |||
174 | 156 | float32x4x4_t output = { | |
175 | 624 | vcvtq_f32_u32(a), | |
176 | 156 | vcvtq_f32_u32(b), | |
177 | 156 | vcvtq_f32_u32(c), | |
178 | 156 | vcvtq_f32_u32(d), | |
179 | }; | ||
180 | 156 | neon::VecTraits<float>::store(output, dst + x); | |
181 | 156 | } | |
182 |
2/2✓ Branch 0 taken 5400 times.
✓ Branch 1 taken 1495 times.
|
6895 | for (; x != width; ++x) { |
183 | 5400 | disable_loop_vectorization(); | |
184 | 5400 | dst[x] = src[x]; | |
185 | 5400 | } | |
186 | |||
187 | 1495 | src += static_cast<ptrdiff_t>(src_stride / sizeof(*src)); | |
188 | 1495 | dst += static_cast<ptrdiff_t>(dst_stride / sizeof(*dst)); | |
189 | 1495 | } | |
190 | 75 | return KLEIDICV_OK; | |
191 | 80 | } | |
192 | |||
193 | } // namespace kleidicv::neon | ||
194 |