KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/transform/warp_perspective_sve2.cpp
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 162 162 100.0%
Functions: 71 141 50.4%
Branches: 54 72 75.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cassert>
6 #include <cmath>
7
8 #include "kleidicv/ctypes.h"
9 #include "kleidicv/sve2.h"
10 #include "kleidicv/types.h"
11 #include "transform_sve2.h"
12
13 namespace kleidicv::sve2 {
14
15 // Template for WarpPerspective transformation.
16 // Destination pixels are filled from the source, by taking pixels using the
17 // transformed coordinates that are calculated as follows:
18 //
19 //              [ T0, T1, T2 ]   [ x ]
20 // (x',y',w') = [ T3, T4, T5 ] * [ y ]
21 //              [ T6, T7, T8 ]   [ 1 ]
22 // then
23 //
24 // xt = x' / w'
25 // yt = y' / w'
26 //
27 // or putting it together:
28 //
29 // xt = (T0*x + T1*y + T2) / (T6*x + T7*y + T8)
30 // yt = (T3*x + T4*y + T5) / (T6*x + T7*y + T8)
31 //
32
33 template <typename ScalarType, bool IsLarge,
34 kleidicv_interpolation_type_t Inter, kleidicv_border_type_t Border,
35 size_t Channels>
36 1578 void transform_operation(Rows<const ScalarType> src_rows, size_t src_width,
37 size_t src_height, const ScalarType *border_value,
38 Rows<ScalarType> dst_rows, size_t dst_width,
39 size_t y_begin, size_t y_end,
40 const float transform[9]) {
41 1578 svbool_t pg_all32 = svptrue_b32();
42 1578 svuint32_t sv_xmax = svdup_n_u32(src_width - 1);
43 1578 svuint32_t sv_ymax = svdup_n_u32(src_height - 1);
44 1578 svuint32_t sv_src_stride = svdup_n_u32(src_rows.stride());
45 1578 svuint32_t sv_border;
46 // sv_border is only used if the border type is constant.
47 // If the border type is not constant then border_value is permitted to be
48 // null and must not be read.
49 if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
50 444 sv_border = svdup_n_u32(border_value[0]);
51 }
52
53 1578 svfloat32_t xmaxf = svdup_n_f32(static_cast<float>(src_width - 1));
54 1578 svfloat32_t ymaxf = svdup_n_f32(static_cast<float>(src_height - 1));
55
56 1578 const size_t kStep = VecTraits<float>::num_lanes();
57
58 1578 svfloat32_t sv_0123 = svcvt_f32_u32_z(pg_all32, svindex_u32(0, 1));
59 1578 svfloat32_t T0 = svdup_n_f32(transform[0]);
60 1578 svfloat32_t T3 = svdup_n_f32(transform[3]);
61 1578 svfloat32_t T6 = svdup_n_f32(transform[6]);
62 1578 svfloat32_t tx0, ty0, tw0;
63
64 605358 auto calc_coords = [&](svbool_t, size_t x) {
65 603780 svfloat32_t vx = svadd_n_f32_x(pg_all32, sv_0123, static_cast<float>(x));
66 // Finish the transform for these lanes by adding Tn*x to the per-row
67 // values (tx0, ty0, tw0) that were calculated for the current row.
68 // Calculate the inverse of w' once, because division is expensive
69 1207560 svfloat32_t iw =
70 603780 svdiv_f32_x(pg_all32, svdup_n_f32(1.F), svmla_x(pg_all32, tw0, vx, T6));
71 603780 svfloat32_t tx = svmla_x(pg_all32, tx0, vx, T0);
72 603780 svfloat32_t ty = svmla_x(pg_all32, ty0, vx, T3);
73
74 // Calculate coordinates into the source image
75 1811340 return svcreate2(svmul_f32_x(pg_all32, tx, iw),
76 603780 svmul_f32_x(pg_all32, ty, iw));
77 603780 };
78
79 291104 auto calculate_nearest_coordinates = [&](svbool_t pg32, size_t x) {
80 289526 svfloat32x2_t coords = calc_coords(pg32, x);
81 289526 svfloat32_t xf = svget2(coords, 0);
82 289526 svfloat32_t yf = svget2(coords, 1);
83
84 289526 svuint32_t xi, yi;
85 if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
86 // Round to the nearest integer
87 92942 xi = svreinterpret_u32_s32(
88 92942 svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, xf)));
89 92942 yi = svreinterpret_u32_s32(
90 92942 svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, yf)));
91 } else {
92 // Round to the nearest integer, clamp it to within the dimensions of the
93 // source image (negative values are already saturated to 0)
94 393168 xi = svmin_x(pg_all32,
95 196584 svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, xf, 0.5F)),
96 196584 sv_xmax);
97 393168 yi = svmin_x(pg_all32,
98 196584 svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, yf, 0.5F)),
99 196584 sv_ymax);
100 }
101 579052 return svcreate2(xi, yi);
102 289526 };
103
104 94520 auto get_pixels_or_border = [&](svbool_t pg, svuint32_t x, svuint32_t y) {
105 185884 svbool_t in_range =
106 92942 svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax));
107 185884 svuint32_t result =
108 92942 load_xy<ScalarType, IsLarge>(in_range, x, y, sv_src_stride, src_rows);
109 // Select between source pixels and border colour
110 185884 return svsel_u32(in_range, result, sv_border);
111 92942 };
112
113 69312 auto vector_path_nearest_4x = [&](size_t x, Columns<ScalarType> dst) {
114 338670 auto load_source = [&](svuint32x2_t coords) {
115 270936 svuint32_t x = svget2(coords, 0);
116 270936 svuint32_t y = svget2(coords, 1);
117 if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
118 178784 return get_pixels_or_border(pg_all32, x, y);
119 } else {
120 544632 return load_xy<ScalarType, IsLarge>(pg_all32, x, y, sv_src_stride,
121 181544 src_rows);
122 }
123 270936 };
124 67734 ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)];
125 135468 svuint32_t res32_0 =
126 67734 load_source(calculate_nearest_coordinates(pg_all32, x));
127 67734 x += kStep;
128 135468 svuint32_t res32_1 =
129 67734 load_source(calculate_nearest_coordinates(pg_all32, x));
130 135468 svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0),
131 67734 svreinterpret_u16_u32(res32_1));
132 67734 x += kStep;
133 67734 res32_0 = load_source(calculate_nearest_coordinates(pg_all32, x));
134 67734 x += kStep;
135 67734 res32_1 = load_source(calculate_nearest_coordinates(pg_all32, x));
136 135468 svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0),
137 67734 svreinterpret_u16_u32(res32_1));
138 135468 svuint8_t result =
139 67734 svuzp1_u8(svreinterpret_u8_u16(result0), svreinterpret_u8_u16(result1));
140 67734 svst1(svptrue_b8(), p_dst, result);
141 67734 };
142
143 20168 auto vector_path_nearest_tail = [&](size_t x, size_t x_max,
144 Columns<ScalarType> dst) {
145 18590 size_t length = x_max - x;
146 18590 svbool_t pg32 = svwhilelt_b32(0ULL, length);
147
148 18590 svuint32x2_t coords = calculate_nearest_coordinates(pg32, x);
149 18590 svuint32_t xi = svget2(coords, 0);
150 18590 svuint32_t yi = svget2(coords, 1);
151
152 18590 svuint32_t result;
153 if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) {
154 3550 result = get_pixels_or_border(pg32, xi, yi);
155 } else {
156 15040 result =
157 15040 load_xy<ScalarType, IsLarge>(pg32, xi, yi, sv_src_stride, src_rows);
158 }
159 18590 svst1b_u32(pg32, &dst[static_cast<ptrdiff_t>(x)], result);
160 18590 };
161
162 // WarpPerspective does not implement 2 channels, so this table is a dummy
163 1578 svuint8_t dummy_load_table_2ch{};
164
165 315832 auto calculate_linear = [&](svbool_t pg, uint32_t x) {
166 314254 svfloat32x2_t coords = calc_coords(pg, x);
167 if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) {
168 426172 return calculate_linear_replicated_border<ScalarType, IsLarge, 1>(
169 213086 pg, coords, xmaxf, ymaxf, sv_src_stride, src_rows,
170 213086 dummy_load_table_2ch);
171 } else {
172 static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT);
173 202336 return calculate_linear_constant_border<ScalarType, IsLarge, 1>(
174 101168 pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows,
175 101168 dummy_load_table_2ch);
176 }
177 314254 };
178
179 18558 auto process_row = [&](size_t y) {
180 16980 float fy = static_cast<float>(y);
181 // Calculate the row-constant (x = 0) parts of the numerators and of w':
182 // tw0 = T7*y + T8, so that tw = T6*x + tw0
183 // tx0 = T1*y + T2, so that tx = (T0*x + tx0) / tw
184 // ty0 = T4*y + T5, so that ty = (T3*x + ty0) / tw
185 16980 tx0 = svdup_n_f32(fmaf(transform[1], fy, transform[2]));
186 16980 ty0 = svdup_n_f32(fmaf(transform[4], fy, transform[5]));
187 16980 tw0 = svdup_n_f32(fmaf(transform[7], fy, transform[8]));
188
189 16980 Columns<ScalarType> dst = dst_rows.as_columns();
190 16980 LoopUnroll2 loop{dst_width, kStep};
191 if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) {
192 73798 loop.unroll_four_times([&](size_t x) { vector_path_nearest_4x(x, dst); });
193 6064 loop.unroll_once(
194 19640 [&](size_t x) { vector_path_nearest_tail(x, x + kStep, dst); });
195 11078 loop.remaining([&](size_t x, size_t length) {
196 5014 vector_path_nearest_tail(x, length, dst);
197 5014 });
198 } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) {
199 83556 loop.unroll_four_times([&](size_t x) {
200 72640 ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)];
201 72640 svuint32_t res0 = calculate_linear(pg_all32, x);
202 72640 x += kStep;
203 72640 svuint32_t res1 = calculate_linear(pg_all32, x);
204 145280 svuint16_t result16_0 = svuzp1_u16(svreinterpret_u16_u32(res0),
205 72640 svreinterpret_u16_u32(res1));
206 72640 x += kStep;
207 72640 res0 = calculate_linear(pg_all32, x);
208 72640 x += kStep;
209 72640 res1 = calculate_linear(pg_all32, x);
210 145280 svuint16_t result16_1 = svuzp1_u16(svreinterpret_u16_u32(res0),
211 72640 svreinterpret_u16_u32(res1));
212 145280 svst1_u8(svptrue_b8(), p_dst,
213 145280 svuzp1_u8(svreinterpret_u8_u16(result16_0),
214 72640 svreinterpret_u8_u16(result16_1)));
215 72640 });
216 24456 loop.unroll_once([&](size_t x) {
217 13540 ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)];
218 13540 svuint32_t result = calculate_linear(pg_all32, x);
219 13540 svst1b_u32(pg_all32, p_dst, result);
220 13540 });
221 21070 loop.remaining([&](size_t x, size_t x_max) {
222 10154 ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)];
223 10154 svbool_t pg32 = svwhilelt_b32_u64(x, x_max);
224 10154 svuint32_t result = calculate_linear(pg32, x);
225 10154 svst1b_u32(pg32, p_dst, result);
226 10154 });
227 } else {
228 static_assert(Inter == KLEIDICV_INTERPOLATION_NEAREST ||
229 Inter == KLEIDICV_INTERPOLATION_LINEAR,
230 ": Unknown interpolation type!");
231 }
232 16980 };
233
234
16/32
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 84 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 42 times.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 84 times.
✗ Branch 10 not taken.
✗ Branch 11 not taken.
✓ Branch 12 taken 2 times.
✓ Branch 13 taken 42 times.
✗ Branch 14 not taken.
✗ Branch 15 not taken.
✓ Branch 16 taken 566 times.
✓ Branch 17 taken 4918 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
✓ Branch 20 taken 222 times.
✓ Branch 21 taken 1020 times.
✗ Branch 22 not taken.
✗ Branch 23 not taken.
✓ Branch 24 taken 560 times.
✓ Branch 25 taken 8156 times.
✗ Branch 26 not taken.
✗ Branch 27 not taken.
✓ Branch 28 taken 218 times.
✓ Branch 29 taken 2634 times.
✗ Branch 30 not taken.
✗ Branch 31 not taken.
18558 for (size_t y = y_begin; y < y_end; ++y) {
235 16980 process_row(y);
236 16980 ++dst_rows;
237 16980 }
238 1578 }
239
240 template <typename T>
241 1604 KLEIDICV_LOCALLY_STREAMING kleidicv_error_t warp_perspective_stripe(
242 const T *src, size_t src_stride, size_t src_width, size_t src_height,
243 T *dst, size_t dst_stride, size_t dst_width, size_t dst_height,
244 size_t y_begin, size_t y_end, const float transform[9], size_t channels,
245 kleidicv_interpolation_type_t interpolation,
246 kleidicv_border_type_t border_type, const T *border_value) {
247
4/4
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1602 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 1602 times.
1604 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
248
4/4
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1600 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 1600 times.
1602 CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
249
2/2
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1598 times.
1600 CHECK_POINTERS(transform);
250
6/6
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1596 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 1594 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 1594 times.
1598 CHECK_IMAGE_SIZE(src_width, src_height);
251
6/6
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1592 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 1590 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 1590 times.
1594 CHECK_IMAGE_SIZE(dst_width, dst_height);
252
4/4
✓ Branch 0 taken 446 times.
✓ Branch 1 taken 1144 times.
✓ Branch 2 taken 444 times.
✓ Branch 3 taken 2 times.
1590 if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) {
253 2 return KLEIDICV_ERROR_NULL_POINTER;
254 }
255
256 // Calculating in float32_t is only precise up to 24 bits, so all image
257 // dimensions must be below 2^24; multiplication can only be done with 32x32
258 // bits, so the source stride must be below 2^32. Empty source image is not supported
259
4/4
✓ Branch 0 taken 1586 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1584 times.
✓ Branch 3 taken 2 times.
1588 if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) ||
260
4/4
✓ Branch 0 taken 1582 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1580 times.
✓ Branch 3 taken 2 times.
1584 dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) ||
261
4/6
✓ Branch 0 taken 1578 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1578 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 1578 times.
1580 src_stride >= (1ULL << 32) || src_width == 0 || src_height == 0) {
262 10 return KLEIDICV_ERROR_RANGE;
263 }
264
265 1578 Rows<const T> src_rows{src, src_stride, channels};
266 1578 Rows<T> dst_rows{dst, dst_stride, channels};
267 1578 Rectangle rect{dst_width, dst_height};
268
269 1578 dst_rows += y_begin;
270
271 1578 transform_operation<T>(is_image_large(src_rows, src_height), interpolation,
272 border_type, channels, src_rows, src_width, src_height,
273 border_value, dst_rows, dst_width, y_begin, y_end,
274 transform);
275
276 1578 return KLEIDICV_OK;
277 1604 }
278
279 #define KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE(type) \
280 template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t \
281 warp_perspective_stripe<type>( \
282 const type *src, size_t src_stride, size_t src_width, size_t src_height, \
283 type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \
284 size_t y_begin, size_t y_end, const float transformation[9], \
285 size_t channels, kleidicv_interpolation_type_t interpolation, \
286 kleidicv_border_type_t border_type, const type *border_value)
287
288 KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE(uint8_t);
289
290 } // namespace kleidicv::sve2
291