Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #ifndef KLEIDICV_FLOAT_CONV_SC_H | ||
6 | #define KLEIDICV_FLOAT_CONV_SC_H | ||
7 | |||
8 | #include <limits> | ||
9 | #include <type_traits> | ||
10 | |||
11 | #include "kleidicv/kleidicv.h" | ||
12 | #include "kleidicv/sve2.h" | ||
13 | |||
14 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
15 | |||
16 | template <typename InputType, typename OutputType> | ||
17 | class float_conversion_operation; | ||
18 | |||
19 | template <typename OutputType> | ||
20 | class float_conversion_operation<float, OutputType> { | ||
21 | public: | ||
22 | using SrcVecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<float>; | ||
23 | using SrcVectorType = typename SrcVecTraits::VectorType; | ||
24 | using IntermediateVecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits< | ||
25 | std::conditional_t<std::is_signed_v<OutputType>, int32_t, uint32_t>>; | ||
26 | using IntermediateVectorType = typename IntermediateVecTraits::VectorType; | ||
27 | using DstVecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<OutputType>; | ||
28 | using DstVectorType = typename DstVecTraits::VectorType; | ||
29 | |||
30 | 300 | explicit float_conversion_operation(svuint8_t& index) KLEIDICV_STREAMING | |
31 | 300 | : index_(index) { | |
32 | // Index generation to reorder converted values by tbl instruction | ||
33 | 300 | auto index0 = svindex_u8(0, 4); | |
34 | 300 | auto index1 = svindex_u8(1, 4); | |
35 | 300 | auto index2 = svindex_u8(2, 4); | |
36 | 300 | auto index3 = svindex_u8(3, 4); | |
37 | |||
38 | 300 | svbool_t pg = svwhilelt_b8(uint64_t(0), svcntb() / 4); | |
39 | |||
40 | 300 | index_ = svsplice(pg, index3, svdup_u8(0)); | |
41 | 300 | index_ = svsplice(pg, index2, index_); | |
42 | 300 | index_ = svsplice(pg, index1, index_); | |
43 | 300 | index_ = svsplice(pg, index0, index_); | |
44 | 300 | } | |
45 | |||
46 | 474 | void process_row(size_t width, Columns<const float> src, | |
47 | Columns<OutputType> dst) KLEIDICV_STREAMING { | ||
48 | 948 | LoopUnroll{width, SrcVecTraits::num_lanes()} | |
49 | 1604 | .unroll_n_times<4>([&](size_t step) KLEIDICV_STREAMING { | |
50 | 1130 | svbool_t pg = DstVecTraits::svptrue(); | |
51 | 1130 | SrcVectorType src_v0 = svld1(pg, &src[0]); | |
52 | 1130 | SrcVectorType src_v1 = svld1_vnum(pg, &src[0], 1); | |
53 | 1130 | SrcVectorType src_v2 = svld1_vnum(pg, &src[0], 2); | |
54 | 1130 | SrcVectorType src_v3 = svld1_vnum(pg, &src[0], 3); | |
55 | 1130 | DstVectorType res0 = vector_path(pg, src_v0, src_v1, src_v2, src_v3); | |
56 | 1130 | svst1(pg, &dst[0], res0); | |
57 | 1130 | src += ptrdiff_t(step); | |
58 | 1130 | dst += ptrdiff_t(step); | |
59 | 1130 | }) | |
60 | 824 | .remaining([&](size_t length, size_t) KLEIDICV_STREAMING { | |
61 | 350 | size_t index = 0; | |
62 | 350 | svbool_t pg = SrcVecTraits::svwhilelt(index, length); | |
63 |
4/4✓ Branch 0 taken 298 times.
✓ Branch 1 taken 175 times.
✓ Branch 2 taken 298 times.
✓ Branch 3 taken 175 times.
|
946 | while (svptest_first(SrcVecTraits::svptrue(), pg)) { |
64 | 596 | SrcVectorType src_vector = svld1(pg, &src[ptrdiff_t(index)]); | |
65 | 1192 | IntermediateVectorType result_vector = | |
66 | 596 | remaining_path<OutputType>(pg, src_vector); | |
67 | 596 | svst1b(pg, &dst[ptrdiff_t(index)], result_vector); | |
68 | // Update loop counter and calculate the next governing predicate. | ||
69 | 596 | index += SrcVecTraits::num_lanes(); | |
70 | 596 | pg = SrcVecTraits::svwhilelt(index, length); | |
71 | 596 | } | |
72 | 350 | }); | |
73 | 474 | } | |
74 | |||
75 | private: | ||
76 | template < | ||
77 | typename O, | ||
78 | std::enable_if_t<std::is_integral_v<O> && std::is_signed_v<O>, int> = 0> | ||
79 | 2260 | decltype(auto) convert(svbool_t full_pg, | |
80 | SrcVectorType in) KLEIDICV_STREAMING { | ||
81 | 2260 | return svcvt_s32_f32_x(full_pg, in); | |
82 | } | ||
83 | |||
84 | template < | ||
85 | typename O, | ||
86 | std::enable_if_t<std::is_integral_v<O> && !std::is_signed_v<O>, int> = 0> | ||
87 | 2260 | decltype(auto) convert(svbool_t full_pg, | |
88 | SrcVectorType in) KLEIDICV_STREAMING { | ||
89 | 2260 | return svcvt_u32_f32_x(full_pg, in); | |
90 | } | ||
91 | |||
92 | 1130 | DstVectorType vector_path(svbool_t full_pg, SrcVectorType fsrc0, | |
93 | SrcVectorType fsrc1, SrcVectorType fsrc2, | ||
94 | SrcVectorType fsrc3) KLEIDICV_STREAMING { | ||
95 | 1130 | fsrc0 = svrinti_f32_x(full_pg, fsrc0); | |
96 | 1130 | fsrc1 = svrinti_f32_x(full_pg, fsrc1); | |
97 | 1130 | fsrc2 = svrinti_f32_x(full_pg, fsrc2); | |
98 | 1130 | fsrc3 = svrinti_f32_x(full_pg, fsrc3); | |
99 | |||
100 | 1130 | auto _32bit_res0 = convert<OutputType>(full_pg, fsrc0); | |
101 | 1130 | auto _32bit_res1 = convert<OutputType>(full_pg, fsrc1); | |
102 | 1130 | auto _32bit_res2 = convert<OutputType>(full_pg, fsrc2); | |
103 | 1130 | auto _32bit_res3 = convert<OutputType>(full_pg, fsrc3); | |
104 | |||
105 | 1130 | auto _16bit_res0 = svqxtnb(_32bit_res0); | |
106 | 1130 | _16bit_res0 = svqxtnt(_16bit_res0, _32bit_res2); | |
107 | |||
108 | 1130 | auto _16bit_res1 = svqxtnb(_32bit_res1); | |
109 | 1130 | _16bit_res1 = svqxtnt(_16bit_res1, _32bit_res3); | |
110 | |||
111 | 1130 | auto _8bit_res = svqxtnb(_16bit_res0); | |
112 | 1130 | _8bit_res = svqxtnt(_8bit_res, _16bit_res1); | |
113 | |||
114 | 2260 | return svtbl(_8bit_res, index_); | |
115 | 1130 | } | |
116 | |||
117 | template < | ||
118 | typename O, | ||
119 | std::enable_if_t<std::is_integral_v<O> && std::is_signed_v<O>, int> = 0> | ||
120 | 298 | IntermediateVectorType remaining_path(svbool_t& pg, | |
121 | SrcVectorType src) KLEIDICV_STREAMING { | ||
122 | 298 | constexpr float min_val = std::numeric_limits<O>::lowest(); | |
123 | 298 | constexpr float max_val = std::numeric_limits<O>::max(); | |
124 | |||
125 | 298 | src = svrinti_f32_x(pg, src); | |
126 | |||
127 | 298 | svbool_t less = svcmplt_n_f32(pg, src, min_val); | |
128 | 298 | src = svdup_n_f32_m(src, less, min_val); | |
129 | |||
130 | 298 | svbool_t greater = svcmpgt_n_f32(pg, src, max_val); | |
131 | 298 | src = svdup_n_f32_m(src, greater, max_val); | |
132 | |||
133 | 596 | return svcvt_s32_f32_x(pg, src); | |
134 | 298 | } | |
135 | |||
136 | template < | ||
137 | typename O, | ||
138 | std::enable_if_t<std::is_integral_v<O> && !std::is_signed_v<O>, int> = 0> | ||
139 | 298 | IntermediateVectorType remaining_path(svbool_t& pg, | |
140 | SrcVectorType src) KLEIDICV_STREAMING { | ||
141 | 298 | constexpr float max_val = std::numeric_limits<O>::max(); | |
142 | |||
143 | 298 | src = svrinti_f32_x(pg, src); | |
144 | |||
145 | 298 | svbool_t greater = svcmpgt_n_f32(pg, src, max_val); | |
146 | 298 | src = svdup_n_f32_m(src, greater, max_val); | |
147 | |||
148 | 596 | return svcvt_u32_f32_x(pg, src); | |
149 | 298 | } | |
150 | |||
151 | svuint8_t& index_; | ||
152 | }; // end of class float_conversion_operation<float, OutputType> | ||
153 | |||
154 | template <typename InputType> | ||
155 | class float_conversion_operation<InputType, float> { | ||
156 | public: | ||
157 | using VecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<float>; | ||
158 | using VectorType = typename VecTraits::VectorType; | ||
159 | |||
160 | 300 | explicit float_conversion_operation(svuint8_t&) {} | |
161 | |||
162 | 474 | void process_row(size_t width, Columns<const InputType> src, | |
163 | Columns<float> dst) KLEIDICV_STREAMING { | ||
164 | 948 | LoopUnroll{width, VecTraits::num_lanes()} | |
165 | 2818 | .unroll_twice([&](size_t step) KLEIDICV_STREAMING { | |
166 | 2344 | svbool_t pg = VecTraits::svptrue(); | |
167 | 2344 | auto src_vect1 = load_src(pg, &src[0], 0); | |
168 | 2344 | auto src_vect2 = load_src(pg, &src[0], 1); | |
169 | |||
170 | 2344 | VectorType dst_vector1 = vector_path(pg, src_vect1); | |
171 | 2344 | VectorType dst_vector2 = vector_path(pg, src_vect2); | |
172 | 2344 | svst1(pg, &dst[0], dst_vector1); | |
173 | 2344 | svst1_vnum(pg, &dst[0], 1, dst_vector2); | |
174 | 2344 | src += ptrdiff_t(step); | |
175 | 2344 | dst += ptrdiff_t(step); | |
176 | 2344 | }) | |
177 | 792 | .remaining([&](size_t length, size_t) KLEIDICV_STREAMING { | |
178 | 318 | size_t index = 0; | |
179 | 318 | svbool_t pg = VecTraits::svwhilelt(index, length); | |
180 |
4/4✓ Branch 0 taken 207 times.
✓ Branch 1 taken 159 times.
✓ Branch 2 taken 207 times.
✓ Branch 3 taken 159 times.
|
732 | while (svptest_first(VecTraits::svptrue(), pg)) { |
181 | 414 | auto src_vect = load_src(pg, &src[ptrdiff_t(index)], 0); | |
182 | 414 | VectorType dst_vector = vector_path(pg, src_vect); | |
183 | 414 | svst1(pg, &dst[ptrdiff_t(index)], dst_vector); | |
184 | // Update loop counter and calculate the next governing predicate. | ||
185 | 414 | index += VecTraits::num_lanes(); | |
186 | 414 | pg = VecTraits::svwhilelt(index, length); | |
187 | 414 | } | |
188 | 318 | }); | |
189 | 474 | } | |
190 | |||
191 | private: | ||
192 | template <typename I, std::enable_if_t<std::is_same_v<I, svint32_t>, int> = 0> | ||
193 | 2551 | VectorType vector_path(svbool_t& pg, I src_vector) KLEIDICV_STREAMING { | |
194 | 2551 | return svcvt_f32_s32_x(pg, src_vector); | |
195 | } | ||
196 | template <typename I, | ||
197 | std::enable_if_t<std::is_same_v<I, svuint32_t>, int> = 0> | ||
198 | 2551 | VectorType vector_path(svbool_t& pg, I src_vector) KLEIDICV_STREAMING { | |
199 | 2551 | return svcvt_f32_u32_x(pg, src_vector); | |
200 | } | ||
201 | |||
202 | template < | ||
203 | typename I, | ||
204 | std::enable_if_t<std::is_integral_v<I> && std::is_signed_v<I>, int> = 0> | ||
205 | 2551 | svint32_t load_src(svbool_t& pg, const I* src, | |
206 | size_t vnum) KLEIDICV_STREAMING { | ||
207 | 2551 | svint32_t src_vect = svld1sb_vnum_s32(pg, src, vnum); | |
208 | 5102 | return src_vect; | |
209 | 2551 | } | |
210 | |||
211 | template < | ||
212 | typename I, | ||
213 | std::enable_if_t<std::is_integral_v<I> && !std::is_signed_v<I>, int> = 0> | ||
214 | 2551 | svuint32_t load_src(svbool_t& pg, const I* src, | |
215 | size_t vnum) KLEIDICV_STREAMING { | ||
216 | 2551 | svuint32_t src_vect = svld1ub_vnum_u32(pg, src, vnum); | |
217 | 5102 | return src_vect; | |
218 | 2551 | } | |
219 | }; // end of class float_conversion_operation<InputType, float> | ||
220 | |||
221 | template <typename InputType, typename OutputType> | ||
222 | 640 | static kleidicv_error_t float_conversion_sc(const InputType* src, | |
223 | size_t src_stride, OutputType* dst, | ||
224 | size_t dst_stride, size_t width, | ||
225 | size_t height) KLEIDICV_STREAMING { | ||
226 |
16/16✓ Branch 0 taken 2 times.
✓ Branch 1 taken 158 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 158 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 158 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 158 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 158 times.
✓ Branch 10 taken 2 times.
✓ Branch 11 taken 158 times.
✓ Branch 12 taken 2 times.
✓ Branch 13 taken 158 times.
✓ Branch 14 taken 2 times.
✓ Branch 15 taken 158 times.
|
640 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
227 |
16/16✓ Branch 0 taken 2 times.
✓ Branch 1 taken 156 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 156 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 156 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 156 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 156 times.
✓ Branch 10 taken 2 times.
✓ Branch 11 taken 156 times.
✓ Branch 12 taken 2 times.
✓ Branch 13 taken 156 times.
✓ Branch 14 taken 2 times.
✓ Branch 15 taken 156 times.
|
632 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
228 |
24/24✓ Branch 0 taken 2 times.
✓ Branch 1 taken 154 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 150 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 150 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 154 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 150 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 150 times.
✓ Branch 12 taken 2 times.
✓ Branch 13 taken 154 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 150 times.
✓ Branch 16 taken 6 times.
✓ Branch 17 taken 150 times.
✓ Branch 18 taken 2 times.
✓ Branch 19 taken 154 times.
✓ Branch 20 taken 4 times.
✓ Branch 21 taken 150 times.
✓ Branch 22 taken 6 times.
✓ Branch 23 taken 150 times.
|
624 | CHECK_IMAGE_SIZE(width, height); |
229 | |||
230 | 600 | svuint8_t index; | |
231 | 600 | float_conversion_operation<InputType, OutputType> operation{index}; | |
232 | 600 | Rectangle rect{width, height}; | |
233 | 600 | Rows<const InputType> src_rows{src, src_stride}; | |
234 | 600 | Rows<OutputType> dst_rows{dst, dst_stride}; | |
235 | 600 | zip_rows(operation, rect, src_rows, dst_rows); | |
236 | |||
237 | 600 | return KLEIDICV_OK; | |
238 | 640 | } | |
239 | |||
240 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
241 | |||
242 | #endif // KLEIDICV_FLOAT_CONV_SC_H | ||
243 |