KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/transform/transform_sve2.h
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 171 171 100.0%
Functions: 0 0 -%
Branches: 0 0 -%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cassert>
6 #include <cmath>
7 #include <cstddef>
8 #include <cstdint>
9
10 #include "kleidicv/sve2.h"
11 #include "kleidicv/types.h"
12 #include "transform_common.h"
13
14 namespace kleidicv::sve2 {
15
// Gathers one single-channel source pixel per active lane at coordinates
// (x, y), returning the values zero-extended to 32 bits per lane.
//
// ScalarType: uint8_t or uint16_t (the only instantiated types; in the
//             IsLarge path any other type would leave the result
//             uninitialized).
// IsLarge:    when true, the byte offsets (y * stride + x * sizeof) are
//             computed and gathered in 64 bits so they cannot overflow
//             32 bits on large images; the two 64-bit half-results are then
//             narrowed back into a single 32-bit vector.
// pg:         per-lane predicate; inactive lanes are not loaded.
template <typename ScalarType, bool IsLarge>
svuint32_t inline load_xy(svbool_t pg, svuint32_t x, svuint32_t y,
                          svuint32_t sv_src_stride,
                          Rows<const ScalarType> &src_rows) {
  if constexpr (IsLarge) {
    // Predicates for the bottom (even-lane) and top (odd-lane) halves of the
    // widened 64-bit gathers. NOTE(review): svtrn2 with an all-false second
    // operand moves the odd 32-bit predicate bits into position for the
    // 64-bit top-half gather — assumed correct per the ACLE predicate
    // layout; confirm against the intrinsics reference.
    svbool_t pg_b = pg;
    svbool_t pg_t = svtrn2_b32(pg, svpfalse());

    // Calculate offsets from coordinates (y * stride + x)
    // To avoid losing precision, the final offsets should be in 64 bits
    svuint64_t result_b, result_t;
    if constexpr (std::is_same<ScalarType, uint8_t>::value) {
      // Widening multiply-accumulate: bottom/top 32-bit lanes of (x, y)
      // produce 64-bit offsets (x is already a byte offset for uint8_t).
      svuint64_t offsets_b = svmlalb(svmovlb(x), y, sv_src_stride);
      svuint64_t offsets_t = svmlalt(svmovlt(x), y, sv_src_stride);
      // Copy pixels from source
      result_b = svld1ub_gather_offset_u64(pg_b, &src_rows[0], offsets_b);
      result_t = svld1ub_gather_offset_u64(pg_t, &src_rows[0], offsets_t);
    }
    if constexpr (std::is_same<ScalarType, uint16_t>::value) {
      // Multiply x with sizeof(uint16_t)
      svuint64_t offsets_b = svmlalb(svshllb(x, 1), y, sv_src_stride);
      svuint64_t offsets_t = svmlalt(svshllt(x, 1), y, sv_src_stride);
      // Copy pixels from source
      result_b = svld1uh_gather_offset_u64(pg_b, &src_rows[0], offsets_b);
      result_t = svld1uh_gather_offset_u64(pg_t, &src_rows[0], offsets_t);
    }
    // Re-interleave the low 32 bits of the bottom/top 64-bit results into
    // one 32-bit result vector (even lanes from result_b, odd from result_t).
    return svtrn1_u32(svreinterpret_u32_u64(result_b),
                      svreinterpret_u32_u64(result_t));
  } else {
    if constexpr (std::is_same<ScalarType, uint8_t>::value) {
      // 32-bit offsets are sufficient: offset = y * stride + x.
      svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride);
      return svld1ub_gather_offset_u32(pg, &src_rows[0], offsets);
    } else {
      // Multiply by sizeof(uint16_t)
      x = svlsl_x(pg, x, 1);
      svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride);
      return svld1uh_gather_offset_u32(pg, &src_rows[0], offsets);
    }
  }
}
56
// Gathers one two-channel source pixel per active lane at coordinates
// (x, y). Both channels of a pixel are fetched with a single wider gather:
// a 16-bit load for uint8_t sources and a 32-bit load for uint16_t sources.
//
// ScalarType: uint8_t or uint16_t (the only instantiated types; the IsLarge
//             path has no return for other types).
// IsLarge:    when true, offsets are computed and gathered in 64 bits to
//             avoid 32-bit overflow on large images, then narrowed back.
// load_table: byte-permutation table applied (uint8_t sources only) to the
//             gathered bytes via svtbl; the channel layout it produces is
//             defined by the caller that builds the table.
template <typename ScalarType, bool IsLarge>
svuint32_t inline load_xy_2ch(svbool_t pg, svuint32_t x, svuint32_t y,
                              svuint32_t sv_src_stride,
                              Rows<const ScalarType> &src_rows,
                              [[maybe_unused]] svuint8_t load_table) {
  if constexpr (IsLarge) {
    // Bottom/top-half predicates for the widened 64-bit gathers (see
    // load_xy for the svtrn2 predicate trick).
    svbool_t pg_b = pg;
    svbool_t pg_t = svtrn2_b32(pg, svpfalse());

    // Calculate offsets from coordinates (y * stride + x)
    // To avoid losing precision, the final offsets should be in 64 bits
    if constexpr (std::is_same<ScalarType, uint8_t>::value) {
      // Multiply x with the number of channels
      svuint64_t offsets_b = svmlalb(svshllb(x, 1), y, sv_src_stride);
      svuint64_t offsets_t = svmlalt(svshllt(x, 1), y, sv_src_stride);
      // Copy pixels from source; each 16-bit load grabs both 8-bit channels
      // of one pixel at once.
      svuint64_t b = svld1uh_gather_offset_u64(
          pg_b, reinterpret_cast<const uint16_t *>(&src_rows[0]), offsets_b);
      svuint64_t t = svld1uh_gather_offset_u64(
          pg_t, reinterpret_cast<const uint16_t *>(&src_rows[0]), offsets_t);
      // Merge bottom/top halves back into a 32-bit vector.
      svuint32_t r32 =
          svtrn1_u32(svreinterpret_u32_u64(b), svreinterpret_u32_u64(t));
      // Rearrange channel bytes into the caller-defined layout.
      return svreinterpret_u32_u8(
          svtbl_u8(svreinterpret_u8_u32(r32), load_table));
    }
    if constexpr (std::is_same<ScalarType, uint16_t>::value) {
      // Multiply x with the number of channels and sizeof(uint16_t)
      svuint64_t offsets_b = svmlalb(svshllb(x, 2), y, sv_src_stride);
      svuint64_t offsets_t = svmlalt(svshllt(x, 2), y, sv_src_stride);
      // Copy pixels from source; each 32-bit load grabs both 16-bit channels
      // of one pixel at once.
      svuint64_t result_b = svld1uw_gather_offset_u64(
          pg_b, reinterpret_cast<const uint32_t *>(&src_rows[0]), offsets_b);
      svuint64_t result_t = svld1uw_gather_offset_u64(
          pg_t, reinterpret_cast<const uint32_t *>(&src_rows[0]), offsets_t);
      return svtrn1_u32(svreinterpret_u32_u64(result_b),
                        svreinterpret_u32_u64(result_t));
    }
  } else {
    // Multiply x with the number of channels and sizeof(ScalarType)
    // This shifting formula is only correct for 8 and 16 bits
    x = svlsl_n_u32_x(pg, x, sizeof(ScalarType));
    svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride);
    if constexpr (std::is_same<ScalarType, uint8_t>::value) {
      svuint32_t r32 = svld1uh_gather_offset_u32(
          pg, reinterpret_cast<const uint16_t *>(&src_rows[0]), offsets);
      // Rearrange channel bytes into the caller-defined layout.
      return svreinterpret_u32_u8(
          svtbl_u8(svreinterpret_u8_u32(r32), load_table));
    }
    if constexpr (std::is_same<ScalarType, uint16_t>::value) {
      return svld1_gather_u32offset_u32(
          pg, reinterpret_cast<const uint32_t *>(&src_rows[0]), offsets);
    }
  }
}
111
// Bilinear (linear) interpolation at fractional coordinates with a
// replicated border: coordinates outside the source image are clamped to
// the nearest edge pixel.
//
// coords:        (x, y) float coordinates, one pair per lane.
// xmaxf / ymaxf: maximum valid x / y coordinate as floats.
// Channels:      1 or 2; for 2 channels each 32-bit lane holds two packed
//                16-bit channel values and the two channels are
//                interpolated separately.
// Returns the interpolated pixel values, rounded to nearest, one per lane.
template <typename ScalarType, bool IsLarge, size_t Channels>
svuint32_t inline calculate_linear_replicated_border(
    svbool_t pg, svfloat32x2_t coords, svfloat32_t xmaxf, svfloat32_t ymaxf,
    svuint32_t sv_src_stride, Rows<const ScalarType> &src_rows,
    svuint8_t load_table_2ch) {
  svbool_t pg_all32 = svptrue_b32();

  // Dispatches to the single- or two-channel gather helper.
  auto load_source = [&](svuint32_t x, svuint32_t y) {
    if constexpr (Channels == 1) {
      return load_xy<ScalarType, IsLarge>(pg, x, y, sv_src_stride, src_rows);
    }
    if constexpr (Channels == 2) {
      return load_xy_2ch<ScalarType, IsLarge>(pg, x, y, sv_src_stride, src_rows,
                                              load_table_2ch);
    }
  };

  svfloat32_t xf = svget2(coords, 0);
  svfloat32_t yf = svget2(coords, 1);
  // Take the integer part, clamp it to within the dimensions of the
  // source image (negative values are already saturated to 0)
  svuint32_t x0 = svcvt_u32_f32_x(pg_all32, svmin_x(pg_all32, xf, xmaxf));
  svuint32_t y0 = svcvt_u32_f32_x(pg_all32, svmin_x(pg_all32, yf, ymaxf));

  // Get fractional part, or 0 if out of range
  // (zero fraction makes both interpolation endpoints equal, which is what
  // replicates the edge pixel outside the image).
  svbool_t x_in_range = svand_z(pg_all32, svcmpge_n_f32(pg_all32, xf, 0.F),
                                svcmplt_f32(pg_all32, xf, xmaxf));
  svbool_t y_in_range = svand_z(pg_all32, svcmpge_n_f32(pg_all32, yf, 0.F),
                                svcmplt_f32(pg_all32, yf, ymaxf));
  svfloat32_t xfrac =
      svsel_f32(x_in_range, svsub_f32_x(pg_all32, xf, svrintm_x(pg_all32, xf)),
                svdup_n_f32(0.F));
  svfloat32_t yfrac =
      svsel_f32(y_in_range, svsub_f32_x(pg_all32, yf, svrintm_x(pg_all32, yf)),
                svdup_n_f32(0.F));

  // x1 = x0 + 1, except if it's already xmax or out of range
  svuint32_t x1 = svsel_u32(x_in_range, svadd_n_u32_x(pg_all32, x0, 1), x0);
  svuint32_t y1 = svsel_u32(y_in_range, svadd_n_u32_x(pg_all32, y0, 1), y0);

  // Bilinear interpolation of the four corner values (a,b on row y0 and
  // c,d on row y1) using xfrac/yfrac; +0.5 rounds the non-negative result
  // to nearest on conversion back to integer.
  auto lerp_2d = [&](svuint32_t ai, svuint32_t bi, svuint32_t ci,
                     svuint32_t di) {
    svfloat32_t a = svcvt_f32_u32_x(pg_all32, ai);
    svfloat32_t b = svcvt_f32_u32_x(pg_all32, bi);
    svfloat32_t line0 =
        svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac);
    svfloat32_t c = svcvt_f32_u32_x(pg_all32, ci);
    svfloat32_t d = svcvt_f32_u32_x(pg_all32, di);
    svfloat32_t line1 =
        svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac);
    svfloat32_t result = svmla_f32_x(
        pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac);
    return svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F));
  };

  // Calculate offsets from coordinates (y * stride + x)
  // a: top left, b: top right, c: bottom left, d: bottom right
  svuint32_t a = load_source(x0, y0);
  svuint32_t b = load_source(x1, y0);
  svuint32_t c = load_source(x0, y1);
  svuint32_t d = load_source(x1, y1);
  if constexpr (Channels == 1) {
    return lerp_2d(a, b, c, d);
  }
  if constexpr (Channels == 2) {
    // Each 32-bit lane packs two 16-bit channels; svmovlb/svmovlt unpack
    // the even (channel 0) and odd (channel 1) halves for separate
    // interpolation.
    // Channel 0
    svuint32_t res32_0 = lerp_2d(
        svmovlb(svreinterpret_u16_u32(a)), svmovlb(svreinterpret_u16_u32(b)),
        svmovlb(svreinterpret_u16_u32(c)), svmovlb(svreinterpret_u16_u32(d)));
    // Channel 1
    svuint32_t res32_1 = lerp_2d(
        svmovlt(svreinterpret_u16_u32(a)), svmovlt(svreinterpret_u16_u32(b)),
        svmovlt(svreinterpret_u16_u32(c)), svmovlt(svreinterpret_u16_u32(d)));

    // Re-interleave the two channels back into packed 16-bit pairs.
    return svreinterpret_u32_u16(svtrn1_u16(svreinterpret_u16_u32(res32_0),
                                            svreinterpret_u16_u32(res32_1)));
  }
}
190
191 template <typename ScalarType, bool IsLarge>
192 473152 svuint32_t get_pixels_or_border(svbool_t pg, svuint32_t x, svuint32_t y,
193 svuint32_t sv_border, svuint32_t sv_xmax,
194 svuint32_t sv_ymax, svuint32_t sv_src_stride,
195 Rows<const ScalarType> &src_rows) {
196 946304 svbool_t in_range =
197 473152 svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax));
198 946304 svuint32_t result =
199 473152 load_xy<ScalarType, IsLarge>(in_range, x, y, sv_src_stride, src_rows);
200 // Select between source pixels and border colour
201 946304 return svsel_u32(in_range, result, sv_border);
202 473152 }
203
204 template <typename ScalarType, bool IsLarge>
205 69920 svuint32_t get_pixels_or_border_2ch(svbool_t pg, svuint32_t x, svuint32_t y,
206 svuint32_t sv_border, svuint32_t sv_xmax,
207 svuint32_t sv_ymax,
208 svuint32_t sv_src_stride,
209 Rows<const ScalarType> &src_rows,
210 svuint8_t load_table) {
211 139840 svbool_t in_range =
212 69920 svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax));
213 139840 svuint32_t result = load_xy_2ch<ScalarType, IsLarge>(
214 69920 in_range, x, y, sv_src_stride, src_rows, load_table);
215 // Select between source pixels and border colour
216 139840 return svsel_u32(in_range, result, sv_border);
217 69920 }
218
// Bilinear (linear) interpolation at fractional coordinates with a
// constant-colour border: samples that fall outside the source image read
// sv_border instead of source pixels.
//
// coords:        (x, y) float coordinates, one pair per lane.
// sv_border:     border colour, already widened/packed to the lane format.
// sv_xmax/ymax:  maximum valid x / y coordinate as unsigned integers.
// Channels:      1 or 2; for 2 channels each 32-bit lane holds two packed
//                16-bit channel values interpolated separately.
// Returns the interpolated pixel values, rounded to nearest, one per lane.
template <typename ScalarType, bool IsLarge, size_t Channels>
svuint32_t inline calculate_linear_constant_border(
    svbool_t pg, svfloat32x2_t coords, svuint32_t sv_border, svuint32_t sv_xmax,
    svuint32_t sv_ymax, svuint32_t sv_src_stride,
    Rows<const ScalarType> &src_rows, svuint8_t load_table_2ch) {
  svbool_t pg_all32 = svptrue_b32();

  // Dispatches to the single- or two-channel bounds-checked gather helper;
  // out-of-bounds lanes come back as the border colour.
  auto load_source = [&](svuint32_t x, svuint32_t y) {
    if constexpr (Channels == 1) {
      return get_pixels_or_border<ScalarType, IsLarge>(
          pg, x, y, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows);
    }
    if constexpr (Channels == 2) {
      return get_pixels_or_border_2ch<ScalarType, IsLarge>(
          pg, x, y, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows,
          load_table_2ch);
    }
  };

  // Convert coordinates to integers, truncating towards minus infinity.
  // Negative numbers will become large positive numbers.
  // Since the source width and height is known to be <=2^24 these large
  // positive numbers will always be treated as outside the source image
  // bounds.
  svuint32_t x0, y0, x1, y1;
  svfloat32_t xfrac, yfrac;
  {
    svfloat32_t xf = svget2(coords, 0);
    svfloat32_t yf = svget2(coords, 1);
    // Floor to get the top-left corner of the 2x2 interpolation window.
    svfloat32_t xf0 = svrintm_f32_x(pg, xf);
    svfloat32_t yf0 = svrintm_f32_x(pg, yf);
    x0 = svreinterpret_u32_s32(svcvt_s32_f32_x(pg, xf0));
    y0 = svreinterpret_u32_s32(svcvt_s32_f32_x(pg, yf0));
    x1 = svadd_u32_x(pg, x0, svdup_n_u32(1));
    y1 = svadd_u32_x(pg, y0, svdup_n_u32(1));

    // Fractional position inside the 2x2 window; always in [0, 1).
    xfrac = svsub_f32_x(pg, xf, xf0);
    yfrac = svsub_f32_x(pg, yf, yf0);
  }

  // Bilinear interpolation of the four corner values (a,b on row y0 and
  // c,d on row y1) using xfrac/yfrac; +0.5 rounds the non-negative result
  // to nearest on conversion back to integer.
  auto lerp_2d = [&](svuint32_t ai, svuint32_t bi, svuint32_t ci,
                     svuint32_t di) {
    svfloat32_t a = svcvt_f32_u32_x(pg_all32, ai);
    svfloat32_t b = svcvt_f32_u32_x(pg_all32, bi);
    svfloat32_t line0 =
        svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac);
    svfloat32_t c = svcvt_f32_u32_x(pg_all32, ci);
    svfloat32_t d = svcvt_f32_u32_x(pg_all32, di);
    svfloat32_t line1 =
        svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac);
    svfloat32_t result = svmla_f32_x(
        pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac);
    return svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F));
  };

  // Calculate offsets from coordinates (y * stride + x)
  // a: top left, b: top right, c: bottom left, d: bottom right
  svuint32_t a = load_source(x0, y0);
  svuint32_t b = load_source(x1, y0);
  svuint32_t c = load_source(x0, y1);
  svuint32_t d = load_source(x1, y1);
  if constexpr (Channels == 1) {
    return lerp_2d(a, b, c, d);
  }
  if constexpr (Channels == 2) {
    // Each 32-bit lane packs two 16-bit channels; svmovlb/svmovlt unpack
    // the even (channel 0) and odd (channel 1) halves for separate
    // interpolation.
    // Channel 0
    svuint32_t res32_0 = lerp_2d(
        svmovlb(svreinterpret_u16_u32(a)), svmovlb(svreinterpret_u16_u32(b)),
        svmovlb(svreinterpret_u16_u32(c)), svmovlb(svreinterpret_u16_u32(d)));
    // Channel 1
    svuint32_t res32_1 = lerp_2d(
        svmovlt(svreinterpret_u16_u32(a)), svmovlt(svreinterpret_u16_u32(b)),
        svmovlt(svreinterpret_u16_u32(c)), svmovlt(svreinterpret_u16_u32(d)));

    // Re-interleave the two channels back into packed 16-bit pairs.
    return svreinterpret_u32_u16(svtrn1_u16(svreinterpret_u16_u32(res32_0),
                                            svreinterpret_u16_u32(res32_1)));
  }
}
297
298 } // namespace kleidicv::sve2
299