KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/transform/warp_perspective_sve2.cpp
Date: 2026-01-20 20:58:59
Exec Total Coverage
Lines: 162 162 100.0%
Functions: 71 141 50.4%
Branches: 54 72 75.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cassert>
6 #include <cmath>
7
8 #include "kleidicv/ctypes.h"
9 #include "kleidicv/sve2.h"
10 #include "kleidicv/types.h"
11 #include "transform_sve2.h"
12
13 namespace kleidicv::sve2 {
14
15 // Template for WarpPerspective transformation.
16 // Destination pixels are filled from the source, by taking pixels using the
17 // transformed coordinates that are calculated as follows:
18 //
19 //              [ T0, T1, T2 ]   [ x ]
20 // (x',y',w') = [ T3, T4, T5 ] * [ y ]
21 //              [ T6, T7, T8 ]   [ 1 ]
22 // then
23 //
24 // xt = x' / w'
25 // yt = y' / w'
26 //
27 // or putting it together:
28 //
29 // xt = (T0*x + T1*y + T2) / (T6*x + T7*y + T8)
30 // yt = (T3*x + T4*y + T5) / (T6*x + T7*y + T8)
31 //
32
33 template <typename ScalarType, bool IsLarge,
34 kleidicv_interpolation_type_t Inter, kleidicv_border_type_t Border,
35 size_t Channels>
36 1650 void transform_operation(Rows<const ScalarType> src_rows, size_t src_width,
37 size_t src_height, const ScalarType *border_value,
38 Rows<ScalarType> dst_rows, size_t dst_width,
39 size_t y_begin, size_t y_end,
40 const float transform[9]) {
41 1650 svbool_t pg_all32 = svptrue_b32();
42 1650 svuint32_t sv_xmax = svdup_n_u32(src_width - 1);
43 1650 svuint32_t sv_ymax = svdup_n_u32(src_height - 1);
44 1650 svuint32_t sv_src_stride = svdup_n_u32(src_rows.stride());
45 1650 svuint32_t sv_border;
46 // sv_border is only used if the border type is constant.
47 // If the border type is not constant then border_value is permitted to be
48 // null and must not be read.
49 if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
50 444 sv_border = svdup_n_u32(border_value[0]);
51 }
52
53 1650 svfloat32_t xmaxf = svdup_n_f32(static_cast<float>(src_width - 1));
54 1650 svfloat32_t ymaxf = svdup_n_f32(static_cast<float>(src_height - 1));
55
56 1650 const size_t kStep = VecTraits<float>::num_lanes();
57
58 1650 svfloat32_t sv_0123 = svcvt_f32_u32_z(pg_all32, svindex_u32(0, 1));
59 1650 svfloat32_t T0 = svdup_n_f32(transform[0]);
60 1650 svfloat32_t T3 = svdup_n_f32(transform[3]);
61 1650 svfloat32_t T6 = svdup_n_f32(transform[6]);
62 1650 svfloat32_t tx0, ty0, tw0;
63
64 605802 auto calc_coords = [&](svbool_t, size_t x) {
65 604152 svfloat32_t vx = svadd_n_f32_x(pg_all32, sv_0123, static_cast<float>(x));
66 // Calculate half-transformed values from the first few pixel values,
67 // plus Tn*x, similarly to the one above
68 // Calculate inverse weight because division is expensive
69 1208304 svfloat32_t iw =
70 604152 svdiv_f32_x(pg_all32, svdup_n_f32(1.F), svmla_x(pg_all32, tw0, vx, T6));
71 604152 svfloat32_t tx = svmla_x(pg_all32, tx0, vx, T0);
72 604152 svfloat32_t ty = svmla_x(pg_all32, ty0, vx, T3);
73
74 // Calculate coordinates into the source image
75 1812456 return svcreate2(svmul_f32_x(pg_all32, tx, iw),
76 604152 svmul_f32_x(pg_all32, ty, iw));
77 604152 };
78
79 291362 auto calculate_nearest_coordinates = [&](svbool_t pg32, size_t x) {
80 289712 svfloat32x2_t coords = calc_coords(pg32, x);
81 289712 svfloat32_t xf = svget2(coords, 0);
82 289712 svfloat32_t yf = svget2(coords, 1);
83
84 289712 svuint32_t xi, yi;
85 if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
86 // Round to the nearest integer
87 92942 xi = svreinterpret_u32_s32(
88 92942 svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, xf)));
89 92942 yi = svreinterpret_u32_s32(
90 92942 svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, yf)));
91 } else {
92 // Round to the nearest integer, clamp it to within the dimensions of the
93 // source image (negative values are already saturated to 0)
94 393540 xi = svmin_x(pg_all32,
95 196770 svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, xf, 0.5F)),
96 196770 sv_xmax);
97 393540 yi = svmin_x(pg_all32,
98 196770 svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, yf, 0.5F)),
99 196770 sv_ymax);
100 }
101 579424 return svcreate2(xi, yi);
102 289712 };
103
104 94592 auto get_pixels_or_border = [&](svbool_t pg, svuint32_t x, svuint32_t y) {
105 185884 svbool_t in_range =
106 92942 svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax));
107 185884 svuint32_t result =
108 92942 load_xy<ScalarType, IsLarge>(in_range, x, y, sv_src_stride, src_rows);
109 // Select between source pixels and border color
110 185884 return svsel_u32(in_range, result, sv_border);
111 92942 };
112
113 69414 auto vector_path_nearest_4x = [&](size_t x, Columns<ScalarType> dst) {
114 338820 auto load_source = [&](svuint32x2_t coords) {
115 271056 svuint32_t x = svget2(coords, 0);
116 271056 svuint32_t y = svget2(coords, 1);
117 if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
118 178784 return get_pixels_or_border(pg_all32, x, y);
119 } else {
120 544992 return load_xy<ScalarType, IsLarge>(pg_all32, x, y, sv_src_stride,
121 181664 src_rows);
122 }
123 271056 };
124 67764 ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)];
125 135528 svuint32_t res32_0 =
126 67764 load_source(calculate_nearest_coordinates(pg_all32, x));
127 67764 x += kStep;
128 135528 svuint32_t res32_1 =
129 67764 load_source(calculate_nearest_coordinates(pg_all32, x));
130 135528 svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0),
131 67764 svreinterpret_u16_u32(res32_1));
132 67764 x += kStep;
133 67764 res32_0 = load_source(calculate_nearest_coordinates(pg_all32, x));
134 67764 x += kStep;
135 67764 res32_1 = load_source(calculate_nearest_coordinates(pg_all32, x));
136 135528 svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0),
137 67764 svreinterpret_u16_u32(res32_1));
138 135528 svuint8_t result =
139 67764 svuzp1_u8(svreinterpret_u8_u16(result0), svreinterpret_u8_u16(result1));
140 67764 svst1(svptrue_b8(), p_dst, result);
141 67764 };
142
143 20306 auto vector_path_nearest_tail = [&](size_t x, size_t x_max,
144 Columns<ScalarType> dst) {
145 18656 size_t length = x_max - x;
146 18656 svbool_t pg32 = svwhilelt_b32(0ULL, length);
147
148 18656 svuint32x2_t coords = calculate_nearest_coordinates(pg32, x);
149 18656 svuint32_t xi = svget2(coords, 0);
150 18656 svuint32_t yi = svget2(coords, 1);
151
152 18656 svuint32_t result;
153 if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
154 3550 result = get_pixels_or_border(pg32, xi, yi);
155 } else {
156 15106 result =
157 15106 load_xy<ScalarType, IsLarge>(pg32, xi, yi, sv_src_stride, src_rows);
158 }
159 18656 svst1b_u32(pg32, &dst[static_cast<ptrdiff_t>(x)], result);
160 18656 };
161
162 // WarpPerspective does not implement 2 channels, so this is dummy
163 1650 svuint8_t dummy_load_table_2ch{};
164
165 316090 auto calculate_linear = [&](svbool_t pg, uint32_t x) {
166 314440 svfloat32x2_t coords = calc_coords(pg, x);
167 if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) {
168 426544 return calculate_linear_replicated_border<ScalarType, IsLarge, 1>(
169 213272 pg, coords, xmaxf, ymaxf, sv_src_stride, src_rows,
170 213272 dummy_load_table_2ch);
171 } else {
172 static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT);
173 202336 return calculate_linear_constant_border<ScalarType, IsLarge, 1>(
174 101168 pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows,
175 101168 dummy_load_table_2ch);
176 }
177 314440 };
178
179 18702 auto process_row = [&](size_t y) {
180 17052 float fy = static_cast<float>(y);
181 // Calculate half-transformed values at the first pixel (numerators)
182 // tw = T6*x + T7*y + T8
183 // tx = (T0*x + T1*y + T2) / tw
184 // ty = (T3*x + T4*y + T5) / tw
185 17052 tx0 = svdup_n_f32(fmaf(transform[1], fy, transform[2]));
186 17052 ty0 = svdup_n_f32(fmaf(transform[4], fy, transform[5]));
187 17052 tw0 = svdup_n_f32(fmaf(transform[7], fy, transform[8]));
188
189 17052 Columns<ScalarType> dst = dst_rows.as_columns();
190 17052 LoopUnroll2 loop{dst_width, kStep};
191 if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) {
192 73864 loop.unroll_four_times([&](size_t x) { vector_path_nearest_4x(x, dst); });
193 6100 loop.unroll_once(
194 19726 [&](size_t x) { vector_path_nearest_tail(x, x + kStep, dst); });
195 11130 loop.remaining([&](size_t x, size_t length) {
196 5030 vector_path_nearest_tail(x, length, dst);
197 5030 });
198 } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) {
199 83622 loop.unroll_four_times([&](size_t x) {
200 72670 ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)];
201 72670 svuint32_t res0 = calculate_linear(pg_all32, x);
202 72670 x += kStep;
203 72670 svuint32_t res1 = calculate_linear(pg_all32, x);
204 145340 svuint16_t result16_0 = svuzp1_u16(svreinterpret_u16_u32(res0),
205 72670 svreinterpret_u16_u32(res1));
206 72670 x += kStep;
207 72670 res0 = calculate_linear(pg_all32, x);
208 72670 x += kStep;
209 72670 res1 = calculate_linear(pg_all32, x);
210 145340 svuint16_t result16_1 = svuzp1_u16(svreinterpret_u16_u32(res0),
211 72670 svreinterpret_u16_u32(res1));
212 145340 svst1_u8(svptrue_b8(), p_dst,
213 145340 svuzp1_u8(svreinterpret_u8_u16(result16_0),
214 72670 svreinterpret_u8_u16(result16_1)));
215 72670 });
216 24542 loop.unroll_once([&](size_t x) {
217 13590 ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)];
218 13590 svuint32_t result = calculate_linear(pg_all32, x);
219 13590 svst1b_u32(pg_all32, p_dst, result);
220 13590 });
221 21122 loop.remaining([&](size_t x, size_t x_max) {
222 10170 ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)];
223 10170 svbool_t pg32 = svwhilelt_b32_u64(x, x_max);
224 10170 svuint32_t result = calculate_linear(pg32, x);
225 10170 svst1b_u32(pg32, p_dst, result);
226 10170 });
227 } else {
228 static_assert(Inter == KLEIDICV_INTERPOLATION_NEAREST ||
229 Inter == KLEIDICV_INTERPOLATION_LINEAR,
230 ": Unknown interpolation type!");
231 }
232 17052 };
233
234
16/32
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 84 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 42 times.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 84 times.
✗ Branch 10 not taken.
✗ Branch 11 not taken.
✓ Branch 12 taken 2 times.
✓ Branch 13 taken 42 times.
✗ Branch 14 not taken.
✗ Branch 15 not taken.
✓ Branch 16 taken 602 times.
✓ Branch 17 taken 4954 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
✓ Branch 20 taken 222 times.
✓ Branch 21 taken 1020 times.
✗ Branch 22 not taken.
✗ Branch 23 not taken.
✓ Branch 24 taken 596 times.
✓ Branch 25 taken 8192 times.
✗ Branch 26 not taken.
✗ Branch 27 not taken.
✓ Branch 28 taken 218 times.
✓ Branch 29 taken 2634 times.
✗ Branch 30 not taken.
✗ Branch 31 not taken.
18702 for (size_t y = y_begin; y < y_end; ++y) {
235 17052 process_row(y);
236 17052 ++dst_rows;
237 17052 }
238 1650 }
239
240 template <typename T>
241 1676 KLEIDICV_LOCALLY_STREAMING kleidicv_error_t warp_perspective_stripe(
242 const T *src, size_t src_stride, size_t src_width, size_t src_height,
243 T *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
244 size_t y_begin, size_t y_end, const float transform[9], size_t channels,
245 kleidicv_interpolation_type_t interpolation,
246 kleidicv_border_type_t border_type, const T *border_value) {
247
4/4
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1674 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 1674 times.
1676 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
248
4/4
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1672 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 1672 times.
1674 CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
249
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1670 times.
1672 CHECK_POINTERS(transform);
250
6/6
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1668 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 1666 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 1666 times.
1670 CHECK_IMAGE_SIZE(src_width, src_height);
251
6/6
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1664 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 1662 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 1662 times.
1666 CHECK_IMAGE_SIZE(dst_width, dst_height);
252
4/4
✓ Branch 0 taken 446 times.
✓ Branch 1 taken 1216 times.
✓ Branch 2 taken 444 times.
✓ Branch 3 taken 2 times.
1662 if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) {
253 2 return KLEIDICV_ERROR_NULL_POINTER;
254 }
255
256 // Calculating in float32_t will only be precise until 24 bits, and
257 // multiplication can only be done with 32x32 bits
258 // Empty source image is not supported
259
4/4
✓ Branch 0 taken 1658 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1656 times.
✓ Branch 3 taken 2 times.
1660 if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) ||
260
4/4
✓ Branch 0 taken 1654 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1652 times.
✓ Branch 3 taken 2 times.
1656 dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) ||
261
4/6
✓ Branch 0 taken 1650 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1650 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 1650 times.
1652 src_stride >= (1ULL << 32) || src_width == 0 || src_height == 0) {
262 10 return KLEIDICV_ERROR_RANGE;
263 }
264
265 1650 Rows<const T> src_rows{src, src_stride, channels};
266 1650 Rows<T> dst_rows{dst, dst_stride, channels};
267 1650 Rectangle rect{dst_width, dst_height};
268
269 1650 dst_rows += y_begin;
270
271 1650 transform_operation<T>(is_image_large(src_rows, src_height), interpolation,
272 border_type, channels, src_rows, src_width, src_height,
273 border_value, dst_rows, dst_width, y_begin, y_end,
274 transform);
275
276 1650 return KLEIDICV_OK;
277 1676 }
278
279 #define KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE(type) \
280 template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t \
281 warp_perspective_stripe<type>( \
282 const type *src, size_t src_stride, size_t src_width, size_t src_height, \
283 type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \
284 size_t y_begin, size_t y_end, const float transformation[9], \
285 size_t channels, kleidicv_interpolation_type_t interpolation, \
286 kleidicv_border_type_t border_type, const type *border_value)
287
288 KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE(uint8_t);
289
290 } // namespace kleidicv::sve2
291