Line | Branch |    Exec | Source
   1 |        |         | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
   2 |        |         | //
   3 |        |         | // SPDX-License-Identifier: Apache-2.0
   4 |        |         |
   5 |        |         | #include <cassert>
   6 |        |         | #include <cmath>
   7 |        |         | #include <cstddef>
   8 |        |         | #include <cstdint>
   9 |        |         |
  10 |        |         | #include "kleidicv/sve2.h"
  11 |        |         | #include "kleidicv/types.h"
  12 |        |         | #include "transform_common.h"
  13 |        |         |
  14 |        |         | namespace kleidicv::sve2 {
  15 |        |         |
  16 |        |         | template <typename ScalarType, bool IsLarge>
  17 |        | 1686734 | svuint32_t inline load_xy(svbool_t pg, svuint32_t x, svuint32_t y,
  18 |        |         |                           svuint32_t sv_src_stride,
  19 |        |         |                           Rows<const ScalarType> &src_rows) {
  20 |        |         |   if constexpr (IsLarge) {
  21 |        |    7380 |     svbool_t pg_b = pg;
  22 |        |    7380 |     svbool_t pg_t = svtrn2_b32(pg, svpfalse());
  23 |        |         |
  24 |        |         |     // Calculate offsets from coordinates (y * stride + x)
  25 |        |         |     // To avoid losing precision, the final offsets should be in 64 bits
  26 |        |    7380 |     svuint64_t result_b, result_t;
  27 |        |         |     if constexpr (std::is_same<ScalarType, uint8_t>::value) {
  28 |        |    6180 |       svuint64_t offsets_b = svmlalb(svmovlb(x), y, sv_src_stride);
  29 |        |    6180 |       svuint64_t offsets_t = svmlalt(svmovlt(x), y, sv_src_stride);
  30 |        |         |       // Copy pixels from source
  31 |        |    6180 |       result_b = svld1ub_gather_offset_u64(pg_b, &src_rows[0], offsets_b);
  32 |        |    6180 |       result_t = svld1ub_gather_offset_u64(pg_t, &src_rows[0], offsets_t);
  33 |        |    6180 |     }
  34 |        |         |     if constexpr (std::is_same<ScalarType, uint16_t>::value) {
  35 |        |         |       // Multiply x with sizeof(uint16_t)
  36 |        |    1200 |       svuint64_t offsets_b = svmlalb(svshllb(x, 1), y, sv_src_stride);
  37 |        |    1200 |       svuint64_t offsets_t = svmlalt(svshllt(x, 1), y, sv_src_stride);
  38 |        |         |       // Copy pixels from source
  39 |        |    1200 |       result_b = svld1uh_gather_offset_u64(pg_b, &src_rows[0], offsets_b);
  40 |        |    1200 |       result_t = svld1uh_gather_offset_u64(pg_t, &src_rows[0], offsets_t);
  41 |        |    1200 |     }
  42 |        |   22140 |     return svtrn1_u32(svreinterpret_u32_u64(result_b),
  43 |        |    7380 |                       svreinterpret_u32_u64(result_t));
  44 |        |    7380 |   } else {
  45 |        |         |     if constexpr (std::is_same<ScalarType, uint8_t>::value) {
  46 |        | 1613158 |       svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride);
  47 |        | 3226316 |       return svld1ub_gather_offset_u32(pg, &src_rows[0], offsets);
  48 |        | 1613158 |     } else {
  49 |        |         |       // Multiply by sizeof(uint16_t)
  50 |        |   66196 |       x = svlsl_x(pg, x, 1);
  51 |        |   66196 |       svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride);
  52 |        |  132392 |       return svld1uh_gather_offset_u32(pg, &src_rows[0], offsets);
  53 |        |   66196 |     }
  54 |        |         |   }
  55 |        |         | }
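
Per active lane, load_xy above reduces to a byte offset of y * stride + x * sizeof(ScalarType) followed by a widening gather load. The following scalar sketch only illustrates that per-lane arithmetic; load_xy_scalar and its parameters are hypothetical names, not part of the library.

#include <cstddef>
#include <cstdint>
#include <cstring>

// Scalar model of one lane of load_xy: compute the byte offset of pixel
// (x, y) and zero-extend a single pixel to 32 bits, as the svld1ub/svld1uh
// gathers do above.
template <typename ScalarType>
uint32_t load_xy_scalar(const ScalarType *src, size_t src_stride_bytes,
                        uint32_t x, uint32_t y) {
  // 64-bit arithmetic mirrors the IsLarge branch, where a 32-bit offset
  // could overflow for large images.
  uint64_t offset = static_cast<uint64_t>(y) * src_stride_bytes +
                    static_cast<uint64_t>(x) * sizeof(ScalarType);
  ScalarType pixel;
  std::memcpy(&pixel, reinterpret_cast<const uint8_t *>(src) + offset,
              sizeof(ScalarType));
  return pixel;  // zero-extended to 32 bits
}
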
  56 |        |         |
  57 |        |         | template <typename ScalarType, bool IsLarge>
  58 |        |  143792 | svuint32_t inline load_xy_2ch(svbool_t pg, svuint32_t x, svuint32_t y,
  59 |        |         |                               svuint32_t sv_src_stride,
  60 |        |         |                               Rows<const ScalarType> &src_rows,
  61 |        |         |                               [[maybe_unused]] svuint8_t load_table) {
  62 |        |         |   if constexpr (IsLarge) {
  63 |        |    3600 |     svbool_t pg_b = pg;
  64 |        |    3600 |     svbool_t pg_t = svtrn2_b32(pg, svpfalse());
  65 |        |         |
  66 |        |         |     // Calculate offsets from coordinates (y * stride + x)
  67 |        |         |     // To avoid losing precision, the final offsets should be in 64 bits
  68 |        |         |     if constexpr (std::is_same<ScalarType, uint8_t>::value) {
  69 |        |         |       // Multiply x with the number of channels
  70 |        |    2400 |       svuint64_t offsets_b = svmlalb(svshllb(x, 1), y, sv_src_stride);
  71 |        |    2400 |       svuint64_t offsets_t = svmlalt(svshllt(x, 1), y, sv_src_stride);
  72 |        |         |       // Copy pixels from source
  73 |        |    4800 |       svuint64_t b = svld1uh_gather_offset_u64(
  74 |        |    2400 |           pg_b, reinterpret_cast<const uint16_t *>(&src_rows[0]), offsets_b);
  75 |        |    4800 |       svuint64_t t = svld1uh_gather_offset_u64(
  76 |        |    2400 |           pg_t, reinterpret_cast<const uint16_t *>(&src_rows[0]), offsets_t);
  77 |        |    4800 |       svuint32_t r32 =
  78 |        |    2400 |           svtrn1_u32(svreinterpret_u32_u64(b), svreinterpret_u32_u64(t));
  79 |        |    4800 |       return svreinterpret_u32_u8(
  80 |        |    2400 |           svtbl_u8(svreinterpret_u8_u32(r32), load_table));
  81 |        |    2400 |     }
  82 |        |         |     if constexpr (std::is_same<ScalarType, uint16_t>::value) {
  83 |        |         |       // Multiply x with the number of channels and sizeof(uint16_t)
  84 |        |    1200 |       svuint64_t offsets_b = svmlalb(svshllb(x, 2), y, sv_src_stride);
  85 |        |    1200 |       svuint64_t offsets_t = svmlalt(svshllt(x, 2), y, sv_src_stride);
  86 |        |         |       // Copy pixels from source
  87 |        |    2400 |       svuint64_t result_b = svld1uw_gather_offset_u64(
  88 |        |    1200 |           pg_b, reinterpret_cast<const uint32_t *>(&src_rows[0]), offsets_b);
  89 |        |    2400 |       svuint64_t result_t = svld1uw_gather_offset_u64(
  90 |        |    1200 |           pg_t, reinterpret_cast<const uint32_t *>(&src_rows[0]), offsets_t);
  91 |        |    3600 |       return svtrn1_u32(svreinterpret_u32_u64(result_b),
  92 |        |    1200 |                         svreinterpret_u32_u64(result_t));
  93 |        |    1200 |     }
  94 |        |    3600 |   } else {
  95 |        |         |     // Multiply x with the number of channels and sizeof(ScalarType)
  96 |        |         |     // This shifting formula is only correct for 8 and 16 bits
  97 |        |  140192 |     x = svlsl_n_u32_x(pg, x, sizeof(ScalarType));
  98 |        |  140192 |     svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride);
  99 |        |         |     if constexpr (std::is_same<ScalarType, uint8_t>::value) {
 100 |        |  145592 |       svuint32_t r32 = svld1uh_gather_offset_u32(
 101 |        |   72796 |           pg, reinterpret_cast<const uint16_t *>(&src_rows[0]), offsets);
 102 |        |  145592 |       return svreinterpret_u32_u8(
 103 |        |   72796 |           svtbl_u8(svreinterpret_u8_u32(r32), load_table));
 104 |        |   72796 |     }
 105 |        |         |     if constexpr (std::is_same<ScalarType, uint16_t>::value) {
 106 |        |  134792 |       return svld1_gather_u32offset_u32(
 107 |        |   67396 |           pg, reinterpret_cast<const uint32_t *>(&src_rows[0]), offsets);
 108 |        |         |     }
 109 |        |  140192 |   }
 110 |        |         | }
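
In the two-channel variant each pixel pair is fetched as one wider element, so the byte offset is y * stride + x * 2 * sizeof(ScalarType); the left shift by sizeof(ScalarType) in the non-IsLarge branch is exactly that multiplication for 8- and 16-bit types. A small illustrative check follows; offset_2ch is a hypothetical helper, not library code.

#include <cstdint>

// Byte offset of the two-channel pixel (x, y).
template <typename ScalarType>
uint64_t offset_2ch(uint32_t x, uint32_t y, uint32_t src_stride_bytes) {
  return static_cast<uint64_t>(y) * src_stride_bytes +
         static_cast<uint64_t>(x) * 2u * sizeof(ScalarType);
}

// The shift amount used above equals the channel-and-size multiply for the
// two supported element types.
static_assert((5u << sizeof(uint8_t)) == 5u * 2u * sizeof(uint8_t),
              "x << 1 is x * 2 bytes per two-channel uint8_t pixel");
static_assert((5u << sizeof(uint16_t)) == 5u * 2u * sizeof(uint16_t),
              "x << 2 is x * 4 bytes per two-channel uint16_t pixel");
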
 111 |        |         |
 112 |        |         | template <typename ScalarType, bool IsLarge, size_t Channels>
 113 |        |  247686 | svuint32_t inline calculate_linear_replicated_border(
 114 |        |         |     svbool_t pg, svfloat32x2_t coords, svfloat32_t xmaxf, svfloat32_t ymaxf,
 115 |        |         |     svuint32_t sv_src_stride, Rows<const ScalarType> &src_rows,
 116 |        |         |     svuint8_t load_table_2ch) {
 117 |        |  247686 |   svbool_t pg_all32 = svptrue_b32();
 118 |        |         |
 119 |        | 1238430 |   auto load_source = [&](svuint32_t x, svuint32_t y) {
 120 |        |         |     if constexpr (Channels == 1) {
 121 |        |  920824 |       return load_xy<ScalarType, IsLarge>(pg, x, y, sv_src_stride, src_rows);
 122 |        |         |     }
 123 |        |         |     if constexpr (Channels == 2) {
 124 |        |  139840 |       return load_xy_2ch<ScalarType, IsLarge>(pg, x, y, sv_src_stride, src_rows,
 125 |        |   69920 |                                               load_table_2ch);
 126 |        |         |     }
 127 |        |         |   };
 128 |        |         |
 129 |        |  247686 |   svfloat32_t xf = svget2(coords, 0);
 130 |        |  247686 |   svfloat32_t yf = svget2(coords, 1);
 131 |        |         |   // Take the integer part, clamp it to within the dimensions of the
 132 |        |         |   // source image (negative values are already saturated to 0)
 133 |        |  247686 |   svuint32_t x0 = svcvt_u32_f32_x(pg_all32, svmin_x(pg_all32, xf, xmaxf));
 134 |        |  247686 |   svuint32_t y0 = svcvt_u32_f32_x(pg_all32, svmin_x(pg_all32, yf, ymaxf));
 135 |        |         |
 136 |        |         |   // Get fractional part, or 0 if out of range
 137 |        |  495372 |   svbool_t x_in_range = svand_z(pg_all32, svcmpge_n_f32(pg_all32, xf, 0.F),
 138 |        |  247686 |                                 svcmplt_f32(pg_all32, xf, xmaxf));
 139 |        |  495372 |   svbool_t y_in_range = svand_z(pg_all32, svcmpge_n_f32(pg_all32, yf, 0.F),
 140 |        |  247686 |                                 svcmplt_f32(pg_all32, yf, ymaxf));
 141 |        |  495372 |   svfloat32_t xfrac =
 142 |        |  495372 |       svsel_f32(x_in_range, svsub_f32_x(pg_all32, xf, svrintm_x(pg_all32, xf)),
 143 |        |  247686 |                 svdup_n_f32(0.F));
 144 |        |  495372 |   svfloat32_t yfrac =
 145 |        |  495372 |       svsel_f32(y_in_range, svsub_f32_x(pg_all32, yf, svrintm_x(pg_all32, yf)),
 146 |        |  247686 |                 svdup_n_f32(0.F));
 147 |        |         |
 148 |        |         |   // x1 = x0 + 1, except if it's already xmax or out of range
 149 |        |  247686 |   svuint32_t x1 = svsel_u32(x_in_range, svadd_n_u32_x(pg_all32, x0, 1), x0);
 150 |        |  247686 |   svuint32_t y1 = svsel_u32(y_in_range, svadd_n_u32_x(pg_all32, y0, 1), y0);
 151 |        |         |
 152 |        |  512852 |   auto lerp_2d = [&](svuint32_t ai, svuint32_t bi, svuint32_t ci,
 153 |        |         |                      svuint32_t di) {
 154 |        |  265166 |     svfloat32_t a = svcvt_f32_u32_x(pg_all32, ai);
 155 |        |  265166 |     svfloat32_t b = svcvt_f32_u32_x(pg_all32, bi);
 156 |        |  530332 |     svfloat32_t line0 =
 157 |        |  265166 |         svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac);
 158 |        |  265166 |     svfloat32_t c = svcvt_f32_u32_x(pg_all32, ci);
 159 |        |  265166 |     svfloat32_t d = svcvt_f32_u32_x(pg_all32, di);
 160 |        |  530332 |     svfloat32_t line1 =
 161 |        |  265166 |         svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac);
 162 |        |  530332 |     svfloat32_t result = svmla_f32_x(
 163 |        |  265166 |         pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac);
 164 |        |  530332 |     return svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F));
 165 |        |  265166 |   };
 166 |        |         |
 167 |        |         |   // Calculate offsets from coordinates (y * stride + x)
 168 |        |         |   // a: top left, b: top right, c: bottom left, d: bottom right
 169 |        |  247686 |   svuint32_t a = load_source(x0, y0);
 170 |        |  247686 |   svuint32_t b = load_source(x1, y0);
 171 |        |  247686 |   svuint32_t c = load_source(x0, y1);
 172 |        |  247686 |   svuint32_t d = load_source(x1, y1);
 173 |        |         |   if constexpr (Channels == 1) {
 174 |        |  460412 |     return lerp_2d(a, b, c, d);
 175 |        |         |   }
 176 |        |         |   if constexpr (Channels == 2) {
 177 |        |         |     // Channel 0
 178 |        |   34960 |     svuint32_t res32_0 = lerp_2d(
 179 |        |   17480 |         svmovlb(svreinterpret_u16_u32(a)), svmovlb(svreinterpret_u16_u32(b)),
 180 |        |   17480 |         svmovlb(svreinterpret_u16_u32(c)), svmovlb(svreinterpret_u16_u32(d)));
 181 |        |         |     // Channel 1
 182 |        |   34960 |     svuint32_t res32_1 = lerp_2d(
 183 |        |   17480 |         svmovlt(svreinterpret_u16_u32(a)), svmovlt(svreinterpret_u16_u32(b)),
 184 |        |   17480 |         svmovlt(svreinterpret_u16_u32(c)), svmovlt(svreinterpret_u16_u32(d)));
 185 |        |         |
 186 |        |   52440 |     return svreinterpret_u32_u16(svtrn1_u16(svreinterpret_u16_u32(res32_0),
 187 |        |   17480 |                                             svreinterpret_u16_u32(res32_1)));
 188 |        |   17480 |   }
 189 |        |  247686 | }
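
calculate_linear_replicated_border clamps the integer sample coordinates to the image, forces the fractional weights to zero for out-of-range inputs so that border pixels are simply replicated, and blends the four neighbours with two horizontal lerps and one vertical lerp before rounding. A minimal scalar sketch of the single-channel case, assuming a non-empty uint8_t image; bilinear_replicate_scalar and its parameters are illustrative names only.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

inline uint32_t bilinear_replicate_scalar(const uint8_t *src, size_t stride,
                                          size_t width, size_t height,
                                          float xf, float yf) {
  float xmax = static_cast<float>(width - 1);
  float ymax = static_cast<float>(height - 1);
  // Integer part, clamped to the image (negative inputs saturate to 0).
  uint32_t x0 = static_cast<uint32_t>(std::min(std::max(xf, 0.0F), xmax));
  uint32_t y0 = static_cast<uint32_t>(std::min(std::max(yf, 0.0F), ymax));
  // Fractional weights are zeroed outside the image, so the border pixel is
  // replicated instead of being blended with a missing neighbour.
  bool x_in_range = xf >= 0.0F && xf < xmax;
  bool y_in_range = yf >= 0.0F && yf < ymax;
  float xfrac = x_in_range ? xf - std::floor(xf) : 0.0F;
  float yfrac = y_in_range ? yf - std::floor(yf) : 0.0F;
  uint32_t x1 = x_in_range ? x0 + 1 : x0;
  uint32_t y1 = y_in_range ? y0 + 1 : y0;
  auto at = [&](uint32_t x, uint32_t y) {
    return static_cast<float>(src[static_cast<size_t>(y) * stride + x]);
  };
  // Two horizontal lerps, one vertical lerp, round to nearest (see lerp_2d).
  float line0 = at(x0, y0) + (at(x1, y0) - at(x0, y0)) * xfrac;
  float line1 = at(x0, y1) + (at(x1, y1) - at(x0, y1)) * xfrac;
  float result = line0 + (line1 - line0) * yfrac;
  return static_cast<uint32_t>(result + 0.5F);
}
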
 190 |        |         |
 191 |        |         | template <typename ScalarType, bool IsLarge>
 192 |        |  473152 | svuint32_t get_pixels_or_border(svbool_t pg, svuint32_t x, svuint32_t y,
 193 |        |         |                                 svuint32_t sv_border, svuint32_t sv_xmax,
 194 |        |         |                                 svuint32_t sv_ymax, svuint32_t sv_src_stride,
 195 |        |         |                                 Rows<const ScalarType> &src_rows) {
 196 |        |  946304 |   svbool_t in_range =
 197 |        |  473152 |       svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax));
 198 |        |  946304 |   svuint32_t result =
 199 |        |  473152 |       load_xy<ScalarType, IsLarge>(in_range, x, y, sv_src_stride, src_rows);
 200 |        |         |   // Select between source pixels and border colour
 201 |        |  946304 |   return svsel_u32(in_range, result, sv_border);
 202 |        |  473152 | }
 203 |        |         |
 204 |        |         | template <typename ScalarType, bool IsLarge>
 205 |        |   69920 | svuint32_t get_pixels_or_border_2ch(svbool_t pg, svuint32_t x, svuint32_t y,
 206 |        |         |                                     svuint32_t sv_border, svuint32_t sv_xmax,
 207 |        |         |                                     svuint32_t sv_ymax,
 208 |        |         |                                     svuint32_t sv_src_stride,
 209 |        |         |                                     Rows<const ScalarType> &src_rows,
 210 |        |         |                                     svuint8_t load_table) {
 211 |        |  139840 |   svbool_t in_range =
 212 |        |   69920 |       svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax));
 213 |        |  139840 |   svuint32_t result = load_xy_2ch<ScalarType, IsLarge>(
 214 |        |   69920 |       in_range, x, y, sv_src_stride, src_rows, load_table);
 215 |        |         |   // Select between source pixels and border colour
 216 |        |  139840 |   return svsel_u32(in_range, result, sv_border);
 217 |        |   69920 | }
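
Both helpers above rely on unsigned comparison for the range test: a negative coordinate reinterpreted as uint32_t becomes a very large value, fails x <= xmax, and the border colour is selected for that lane. A scalar equivalent of the per-lane decision; pixel_or_border_scalar is an illustrative name only.

#include <cstddef>
#include <cstdint>

inline uint32_t pixel_or_border_scalar(const uint8_t *src, size_t stride,
                                       uint32_t x, uint32_t y, uint32_t xmax,
                                       uint32_t ymax, uint32_t border) {
  // Unsigned compare: out-of-range (including "negative") coordinates fail.
  bool in_range = (x <= xmax) && (y <= ymax);
  return in_range ? src[static_cast<size_t>(y) * stride + x] : border;
}
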
 218 |        |         |
 219 |        |         | template <typename ScalarType, bool IsLarge, size_t Channels>
 220 |        |  135768 | svuint32_t inline calculate_linear_constant_border(
 221 |        |         |     svbool_t pg, svfloat32x2_t coords, svuint32_t sv_border, svuint32_t sv_xmax,
 222 |        |         |     svuint32_t sv_ymax, svuint32_t sv_src_stride,
 223 |        |         |     Rows<const ScalarType> &src_rows, svuint8_t load_table_2ch) {
 224 |        |  135768 |   svbool_t pg_all32 = svptrue_b32();
 225 |        |         |
 226 |        |  678840 |   auto load_source = [&](svuint32_t x, svuint32_t y) {
 227 |        |         |     if constexpr (Channels == 1) {
 228 |        |  473152 |       return get_pixels_or_border<ScalarType, IsLarge>(
 229 |        |  473152 |           pg, x, y, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows);
 230 |        |         |     }
 231 |        |         |     if constexpr (Channels == 2) {
 232 |        |   69920 |       return get_pixels_or_border_2ch<ScalarType, IsLarge>(
 233 |        |   69920 |           pg, x, y, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows,
 234 |        |   69920 |           load_table_2ch);
 235 |        |         |     }
 236 |        |         |   };
 237 |        |         |
 238 |        |         |   // Convert coordinates to integers, truncating towards minus infinity.
 239 |        |         |   // Negative numbers will become large positive numbers.
 240 |        |         |   // Since the source width and height is known to be <=2^24 these large
 241 |        |         |   // positive numbers will always be treated as outside the source image
 242 |        |         |   // bounds.
 243 |        |  135768 |   svuint32_t x0, y0, x1, y1;
 244 |        |  135768 |   svfloat32_t xfrac, yfrac;
 245 |        |         |   {
 246 |        |  135768 |     svfloat32_t xf = svget2(coords, 0);
 247 |        |  135768 |     svfloat32_t yf = svget2(coords, 1);
 248 |        |  135768 |     svfloat32_t xf0 = svrintm_f32_x(pg, xf);
 249 |        |  135768 |     svfloat32_t yf0 = svrintm_f32_x(pg, yf);
 250 |        |  135768 |     x0 = svreinterpret_u32_s32(svcvt_s32_f32_x(pg, xf0));
 251 |        |  135768 |     y0 = svreinterpret_u32_s32(svcvt_s32_f32_x(pg, yf0));
 252 |        |  135768 |     x1 = svadd_u32_x(pg, x0, svdup_n_u32(1));
 253 |        |  135768 |     y1 = svadd_u32_x(pg, y0, svdup_n_u32(1));
 254 |        |         |
 255 |        |  135768 |     xfrac = svsub_f32_x(pg, xf, xf0);
 256 |        |  135768 |     yfrac = svsub_f32_x(pg, yf, yf0);
 257 |        |  135768 |   }
 258 |        |         |
 259 |        |  289016 |   auto lerp_2d = [&](svuint32_t ai, svuint32_t bi, svuint32_t ci,
 260 |        |         |                      svuint32_t di) {
 261 |        |  153248 |     svfloat32_t a = svcvt_f32_u32_x(pg_all32, ai);
 262 |        |  153248 |     svfloat32_t b = svcvt_f32_u32_x(pg_all32, bi);
 263 |        |  306496 |     svfloat32_t line0 =
 264 |        |  153248 |         svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac);
 265 |        |  153248 |     svfloat32_t c = svcvt_f32_u32_x(pg_all32, ci);
 266 |        |  153248 |     svfloat32_t d = svcvt_f32_u32_x(pg_all32, di);
 267 |        |  306496 |     svfloat32_t line1 =
 268 |        |  153248 |         svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac);
 269 |        |  306496 |     svfloat32_t result = svmla_f32_x(
 270 |        |  153248 |         pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac);
 271 |        |  306496 |     return svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F));
 272 |        |  153248 |   };
 273 |        |         |
 274 |        |         |   // Calculate offsets from coordinates (y * stride + x)
 275 |        |         |   // a: top left, b: top right, c: bottom left, d: bottom right
 276 |        |  135768 |   svuint32_t a = load_source(x0, y0);
 277 |        |  135768 |   svuint32_t b = load_source(x1, y0);
 278 |        |  135768 |   svuint32_t c = load_source(x0, y1);
 279 |        |  135768 |   svuint32_t d = load_source(x1, y1);
 280 |        |         |   if constexpr (Channels == 1) {
 281 |        |  236576 |     return lerp_2d(a, b, c, d);
 282 |        |         |   }
 283 |        |         |   if constexpr (Channels == 2) {
 284 |        |         |     // Channel 0
 285 |        |   34960 |     svuint32_t res32_0 = lerp_2d(
 286 |        |   17480 |         svmovlb(svreinterpret_u16_u32(a)), svmovlb(svreinterpret_u16_u32(b)),
 287 |        |   17480 |         svmovlb(svreinterpret_u16_u32(c)), svmovlb(svreinterpret_u16_u32(d)));
 288 |        |         |     // Channel 1
 289 |        |   34960 |     svuint32_t res32_1 = lerp_2d(
 290 |        |   17480 |         svmovlt(svreinterpret_u16_u32(a)), svmovlt(svreinterpret_u16_u32(b)),
 291 |        |   17480 |         svmovlt(svreinterpret_u16_u32(c)), svmovlt(svreinterpret_u16_u32(d)));
 292 |        |         |
 293 |        |   52440 |     return svreinterpret_u32_u16(svtrn1_u16(svreinterpret_u16_u32(res32_0),
 294 |        |   17480 |                                             svreinterpret_u16_u32(res32_1)));
 295 |        |   17480 |   }
 296 |        |  135768 | }
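
The constant-border path floors both coordinates to signed integers, reinterprets them as unsigned so that out-of-range neighbours fail the bounds test inside load_source (the source dimensions are at most 2^24, so wrapped negatives can never alias a valid coordinate), and then applies the same two-stage lerp with rounding. A scalar sketch of the single-channel case; bilinear_constant_scalar is an illustrative name only.

#include <cmath>
#include <cstddef>
#include <cstdint>

inline uint32_t bilinear_constant_scalar(const uint8_t *src, size_t stride,
                                         uint32_t xmax, uint32_t ymax,
                                         uint32_t border, float xf, float yf) {
  // Truncate towards minus infinity; negative coordinates wrap to large
  // unsigned values and always fail the bounds test in sample() below.
  uint32_t x0 = static_cast<uint32_t>(static_cast<int32_t>(std::floor(xf)));
  uint32_t y0 = static_cast<uint32_t>(static_cast<int32_t>(std::floor(yf)));
  uint32_t x1 = x0 + 1;
  uint32_t y1 = y0 + 1;
  float xfrac = xf - std::floor(xf);
  float yfrac = yf - std::floor(yf);
  // Out-of-range neighbours contribute the constant border colour.
  auto sample = [&](uint32_t x, uint32_t y) -> float {
    bool in_range = (x <= xmax) && (y <= ymax);
    return static_cast<float>(
        in_range ? src[static_cast<size_t>(y) * stride + x] : border);
  };
  // Same blend as lerp_2d: two horizontal lerps, one vertical, then round.
  float line0 = sample(x0, y0) + (sample(x1, y0) - sample(x0, y0)) * xfrac;
  float line1 = sample(x0, y1) + (sample(x1, y1) - sample(x0, y1)) * xfrac;
  float result = line0 + (line1 - line0) * yfrac;
  return static_cast<uint32_t>(result + 0.5F);
}
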
 297 |        |         |
 298 |        |         | }  // namespace kleidicv::sve2
 299 |        |         |