Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #include <cassert> | ||
6 | #include <cmath> | ||
7 | |||
8 | #include "kleidicv/ctypes.h" | ||
9 | #include "kleidicv/sve2.h" | ||
10 | #include "kleidicv/types.h" | ||
11 | #include "transform_sve2.h" | ||
12 | |||
13 | namespace kleidicv::sve2 { | ||
14 | |||
15 | // Template for the WarpPerspective transformation of destination rows [y_begin, y_end). | ||
16 | // Each destination pixel (x, y) is filled from the source, by taking pixels using the | ||
17 | // transformed coordinates (xt, yt) that are calculated as follows: | ||
18 | // | ||
19 | // [ T0, T1, T2 ] [ x ] | ||
20 | // (x',y',w') = [ T3, T4, T5 ] * [ y ] | ||
21 | // [ T6, T7, T8 ] [ 1 ] | ||
22 | // then | ||
23 | // | ||
24 | // xt = x' / w' | ||
25 | // yt = y' / w' | ||
26 | // | ||
27 | // or putting it together: | ||
28 | // | ||
29 | // xt = (T0*x + T1*y + T2) / (T6*x + T7*y + T8) | ||
30 | // yt = (T3*x + T4*y + T5) / (T6*x + T7*y + T8) | ||
31 | // where Tn stands for transform[n]. | ||
//
// Template parameters:
//   ScalarType - element type of the source and destination images.
//   IsLarge    - forwarded unchanged to load_xy and to the linear helpers.
//   Inter      - interpolation type; only KLEIDICV_INTERPOLATION_NEAREST and
//                KLEIDICV_INTERPOLATION_LINEAR compile (see the static_assert
//                in process_row).
//   Border     - border type; sv_border is only initialised when it is
//                KLEIDICV_BORDER_TYPE_CONSTANT.
//   Channels   - not referenced in this function body; the linear helpers are
//                always instantiated with a literal 1 as third argument.
//
// Parameters:
//   src_rows, src_width, src_height - source image. src_width - 1 and
//                src_height - 1 are used as the largest valid coordinates, so
//                both dimensions are expected to be non-zero.
//   border_value - only element 0 is read, and only for a constant border.
//   dst_rows   - must already point at destination row y_begin; it is advanced
//                by one row after each processed row.
//   dst_width  - number of pixels produced per destination row.
//   y_begin, y_end - half-open range of destination rows to process.
//   transform  - the nine coefficients T0..T8 of the matrix above.
32 | |||
33 | template <typename ScalarType, bool IsLarge, | ||
34 | kleidicv_interpolation_type_t Inter, kleidicv_border_type_t Border, | ||
35 | size_t Channels> | ||
36 | 1578 | void transform_operation(Rows<const ScalarType> src_rows, size_t src_width, | |
37 | size_t src_height, const ScalarType *border_value, | ||
38 | Rows<ScalarType> dst_rows, size_t dst_width, | ||
39 | size_t y_begin, size_t y_end, | ||
40 | const float transform[9]) { | ||
41 | 1578 | svbool_t pg_all32 = svptrue_b32(); | |
42 | 1578 | svuint32_t sv_xmax = svdup_n_u32(src_width - 1); | |
43 | 1578 | svuint32_t sv_ymax = svdup_n_u32(src_height - 1); | |
44 | 1578 | svuint32_t sv_src_stride = svdup_n_u32(src_rows.stride()); | |
45 | 1578 | svuint32_t sv_border; | |
46 | // sv_border is only used if the border type is constant. | ||
47 | // If the border type is not constant then border_value is permitted to be | ||
48 | // null and must not be read. | ||
49 | if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { | ||
50 | 444 | sv_border = svdup_n_u32(border_value[0]); | |
51 | } | ||
52 | |||
53 | 1578 | svfloat32_t xmaxf = svdup_n_f32(static_cast<float>(src_width - 1)); | |
54 | 1578 | svfloat32_t ymaxf = svdup_n_f32(static_cast<float>(src_height - 1)); | |
55 | |||
56 | 1578 | const size_t kStep = VecTraits<float>::num_lanes(); | |
57 | |||
// Lane indices 0, 1, 2, ... converted to float; calc_coords adds the starting
// x of a vector to them to get the x coordinate of every lane.
58 | 1578 | svfloat32_t sv_0123 = svcvt_f32_u32_z(pg_all32, svindex_u32(0, 1)); | |
59 | 1578 | svfloat32_t T0 = svdup_n_f32(transform[0]); | |
60 | 1578 | svfloat32_t T3 = svdup_n_f32(transform[3]); | |
61 | 1578 | svfloat32_t T6 = svdup_n_f32(transform[6]); | |
// Per-row partial sums; process_row assigns them before calling any of the
// lambdas that read them.
62 | 1578 | svfloat32_t tx0, ty0, tw0; | |
63 | |||
// Returns the transformed (xt, yt) source coordinates for the lanes starting
// at x. The predicate argument is unnamed and unused: all lanes are computed.
64 | 605358 | auto calc_coords = [&](svbool_t, size_t x) { | |
65 | 603780 | svfloat32_t vx = svadd_n_f32_x(pg_all32, sv_0123, static_cast<float>(x)); | |
66 | // Add the per-pixel Tn*x terms to the per-row partial sums (tw0, tx0, ty0) | ||
67 | // that process_row calculated for the current y. | ||
68 | // Calculate the inverse of w' once because division is expensive | ||
69 | 1207560 | svfloat32_t iw = | |
70 | 603780 | svdiv_f32_x(pg_all32, svdup_n_f32(1.F), svmla_x(pg_all32, tw0, vx, T6)); | |
71 | 603780 | svfloat32_t tx = svmla_x(pg_all32, tx0, vx, T0); | |
72 | 603780 | svfloat32_t ty = svmla_x(pg_all32, ty0, vx, T3); | |
73 | |||
74 | // Calculate coordinates into the source image: xt = x' * (1/w'), yt = y' * (1/w') | ||
75 | 1811340 | return svcreate2(svmul_f32_x(pg_all32, tx, iw), | |
76 | 603780 | svmul_f32_x(pg_all32, ty, iw)); | |
77 | 603780 | }; | |
78 | |||
// Converts the transformed coordinates of the lanes starting at x into
// integer (x, y) source indices for nearest-neighbour sampling.
79 | 291104 | auto calculate_nearest_coordinates = [&](svbool_t pg32, size_t x) { | |
80 | 289526 | svfloat32x2_t coords = calc_coords(pg32, x); | |
81 | 289526 | svfloat32_t xf = svget2(coords, 0); | |
82 | 289526 | svfloat32_t yf = svget2(coords, 1); | |
83 | |||
84 | 289526 | svuint32_t xi, yi; | |
85 | if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { | ||
86 | // Round to the nearest integer. Negative results turn into large unsigned values after the reinterpret, so the unsigned comparisons in get_pixels_or_border treat them as out of range | ||
87 | 92942 | xi = svreinterpret_u32_s32( | |
88 | 92942 | svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, xf))); | |
89 | 92942 | yi = svreinterpret_u32_s32( | |
90 | 92942 | svcvt_s32_f32_x(pg_all32, svrinta_f32_x(pg_all32, yf))); | |
91 | } else { | ||
92 | // Round to the nearest integer, clamp it to within the dimensions of the | ||
93 | // source image (negative values are already saturated to 0) | ||
94 | 393168 | xi = svmin_x(pg_all32, | |
95 | 196584 | svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, xf, 0.5F)), | |
96 | 196584 | sv_xmax); | |
97 | 393168 | yi = svmin_x(pg_all32, | |
98 | 196584 | svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, yf, 0.5F)), | |
99 | 196584 | sv_ymax); | |
100 | } | ||
101 | 579052 | return svcreate2(xi, yi); | |
102 | 289526 | }; | |
103 | |||
// Loads the source pixels at (x, y) for the lanes that are active in pg and
// lie inside the source image; every other lane receives the border value.
104 | 94520 | auto get_pixels_or_border = [&](svbool_t pg, svuint32_t x, svuint32_t y) { | |
105 | 185884 | svbool_t in_range = | |
106 | 92942 | svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax)); | |
107 | 185884 | svuint32_t result = | |
108 | 92942 | load_xy<ScalarType, IsLarge>(in_range, x, y, sv_src_stride, src_rows); | |
109 | // Select between source pixels and border colour | ||
110 | 185884 | return svsel_u32(in_range, result, sv_border); | |
111 | 92942 | }; | |
112 | |||
// Processes four full vectors of 32-bit lanes (4 * kStep pixels) starting at
// x. Each svuzp1 keeps the even-indexed elements of its two inputs, so two
// rounds (32 -> 16 -> 8 bits) pack the four results, in pixel order, into one
// vector of bytes that is written with a single unpredicated store.
113 | 69312 | auto vector_path_nearest_4x = [&](size_t x, Columns<ScalarType> dst) { | |
114 | 338670 | auto load_source = [&](svuint32x2_t coords) { | |
115 | 270936 | svuint32_t x = svget2(coords, 0); | |
116 | 270936 | svuint32_t y = svget2(coords, 1); | |
117 | if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { | ||
118 | 178784 | return get_pixels_or_border(pg_all32, x, y); | |
119 | } else { | ||
120 | 544632 | return load_xy<ScalarType, IsLarge>(pg_all32, x, y, sv_src_stride, | |
121 | 181544 | src_rows); | |
122 | } | ||
123 | 270936 | }; | |
124 | 67734 | ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)]; | |
125 | 135468 | svuint32_t res32_0 = | |
126 | 67734 | load_source(calculate_nearest_coordinates(pg_all32, x)); | |
127 | 67734 | x += kStep; | |
128 | 135468 | svuint32_t res32_1 = | |
129 | 67734 | load_source(calculate_nearest_coordinates(pg_all32, x)); | |
130 | 135468 | svuint16_t result0 = svuzp1_u16(svreinterpret_u16_u32(res32_0), | |
131 | 67734 | svreinterpret_u16_u32(res32_1)); | |
132 | 67734 | x += kStep; | |
133 | 67734 | res32_0 = load_source(calculate_nearest_coordinates(pg_all32, x)); | |
134 | 67734 | x += kStep; | |
135 | 67734 | res32_1 = load_source(calculate_nearest_coordinates(pg_all32, x)); | |
136 | 135468 | svuint16_t result1 = svuzp1_u16(svreinterpret_u16_u32(res32_0), | |
137 | 67734 | svreinterpret_u16_u32(res32_1)); | |
138 | 135468 | svuint8_t result = | |
139 | 67734 | svuzp1_u8(svreinterpret_u8_u16(result0), svreinterpret_u8_u16(result1)); | |
140 | 67734 | svst1(svptrue_b8(), p_dst, result); | |
141 | 67734 | }; | |
142 | |||
// Processes at most one vector of pixels in [x, x_max): only the first
// (x_max - x) lanes are active for the load and for the byte-narrowing store.
143 | 20168 | auto vector_path_nearest_tail = [&](size_t x, size_t x_max, | |
144 | Columns<ScalarType> dst) { | ||
145 | 18590 | size_t length = x_max - x; | |
146 | 18590 | svbool_t pg32 = svwhilelt_b32(0ULL, length); | |
147 | |||
148 | 18590 | svuint32x2_t coords = calculate_nearest_coordinates(pg32, x); | |
149 | 18590 | svuint32_t xi = svget2(coords, 0); | |
150 | 18590 | svuint32_t yi = svget2(coords, 1); | |
151 | |||
152 | 18590 | svuint32_t result; | |
153 | if constexpr (Border == KLEIDICV_BORDER_TYPE_CONSTANT) { | ||
154 | 3550 | result = get_pixels_or_border(pg32, xi, yi); | |
155 | } else { | ||
156 | 15040 | result = | |
157 | 15040 | load_xy<ScalarType, IsLarge>(pg32, xi, yi, sv_src_stride, src_rows); | |
158 | } | ||
159 | 18590 | svst1b_u32(pg32, &dst[static_cast<ptrdiff_t>(x)], result); | |
160 | 18590 | }; | |
161 | |||
162 | // WarpPerspective does not implement 2 channels, so this is a dummy | ||
163 | 1578 | svuint8_t dummy_load_table_2ch{}; | |
164 | |||
// Calculates the transformed coordinates for the lanes starting at x and
// delegates to the border-specific linear interpolation helper (the helpers
// are defined outside this listing).
165 | 315832 | auto calculate_linear = [&](svbool_t pg, uint32_t x) { | |
166 | 314254 | svfloat32x2_t coords = calc_coords(pg, x); | |
167 | if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { | ||
168 | 426172 | return calculate_linear_replicated_border<ScalarType, IsLarge, 1>( | |
169 | 213086 | pg, coords, xmaxf, ymaxf, sv_src_stride, src_rows, | |
170 | 213086 | dummy_load_table_2ch); | |
171 | } else { | ||
172 | static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); | ||
173 | 202336 | return calculate_linear_constant_border<ScalarType, IsLarge, 1>( | |
174 | 101168 | pg, coords, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows, | |
175 | 101168 | dummy_load_table_2ch); | |
176 | } | ||
177 | 314254 | }; | |
178 | |||
// Fills the destination row that dst_rows currently points at, using y as the
// destination y coordinate in the transform.
179 | 18558 | auto process_row = [&](size_t y) { | |
180 | 16980 | float fy = static_cast<float>(y); | |
181 | // Calculate half-transformed values at the first pixel (x = 0): the y-dependent parts of the numerators and of the denominator tw | ||
182 | // tw = T6*x + T7*y + T8 | ||
183 | // tx = (T0*x + T1*y + T2) / tw | ||
184 | // ty = (T3*x + T4*y + T5) / tw | ||
185 | 16980 | tx0 = svdup_n_f32(fmaf(transform[1], fy, transform[2])); | |
186 | 16980 | ty0 = svdup_n_f32(fmaf(transform[4], fy, transform[5])); | |
187 | 16980 | tw0 = svdup_n_f32(fmaf(transform[7], fy, transform[8])); | |
188 | |||
189 | 16980 | Columns<ScalarType> dst = dst_rows.as_columns(); | |
190 | 16980 | LoopUnroll2 loop{dst_width, kStep}; | |
191 | if constexpr (Inter == KLEIDICV_INTERPOLATION_NEAREST) { | ||
192 | 73798 | loop.unroll_four_times([&](size_t x) { vector_path_nearest_4x(x, dst); }); | |
193 | 6064 | loop.unroll_once( | |
194 | 19640 | [&](size_t x) { vector_path_nearest_tail(x, x + kStep, dst); }); | |
// NOTE(review): the second callback argument is forwarded as x_max, and the
// linear path below names the same argument x_max, so `length` here appears
// to be an end index rather than a count - confirm against LoopUnroll2.
195 | 11078 | loop.remaining([&](size_t x, size_t length) { | |
196 | 5014 | vector_path_nearest_tail(x, length, dst); | |
197 | 5014 | }); | |
198 | } else if constexpr (Inter == KLEIDICV_INTERPOLATION_LINEAR) { | ||
// Same structure as vector_path_nearest_4x: four vectors of results are
// packed with two rounds of svuzp1 and written with one byte store.
199 | 83556 | loop.unroll_four_times([&](size_t x) { | |
200 | 72640 | ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)]; | |
201 | 72640 | svuint32_t res0 = calculate_linear(pg_all32, x); | |
202 | 72640 | x += kStep; | |
203 | 72640 | svuint32_t res1 = calculate_linear(pg_all32, x); | |
204 | 145280 | svuint16_t result16_0 = svuzp1_u16(svreinterpret_u16_u32(res0), | |
205 | 72640 | svreinterpret_u16_u32(res1)); | |
206 | 72640 | x += kStep; | |
207 | 72640 | res0 = calculate_linear(pg_all32, x); | |
208 | 72640 | x += kStep; | |
209 | 72640 | res1 = calculate_linear(pg_all32, x); | |
210 | 145280 | svuint16_t result16_1 = svuzp1_u16(svreinterpret_u16_u32(res0), | |
211 | 72640 | svreinterpret_u16_u32(res1)); | |
212 | 145280 | svst1_u8(svptrue_b8(), p_dst, | |
213 | 145280 | svuzp1_u8(svreinterpret_u8_u16(result16_0), | |
214 | 72640 | svreinterpret_u8_u16(result16_1))); | |
215 | 72640 | }); | |
216 | 24456 | loop.unroll_once([&](size_t x) { | |
217 | 13540 | ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)]; | |
218 | 13540 | svuint32_t result = calculate_linear(pg_all32, x); | |
219 | 13540 | svst1b_u32(pg_all32, p_dst, result); | |
220 | 13540 | }); | |
// Tail: only the lanes whose x coordinate is below x_max are active.
221 | 21070 | loop.remaining([&](size_t x, size_t x_max) { | |
222 | 10154 | ScalarType *p_dst = &dst[static_cast<ptrdiff_t>(x)]; | |
223 | 10154 | svbool_t pg32 = svwhilelt_b32_u64(x, x_max); | |
224 | 10154 | svuint32_t result = calculate_linear(pg32, x); | |
225 | 10154 | svst1b_u32(pg32, p_dst, result); | |
226 | 10154 | }); | |
227 | } else { | ||
228 | static_assert(Inter == KLEIDICV_INTERPOLATION_NEAREST || | ||
229 | Inter == KLEIDICV_INTERPOLATION_LINEAR, | ||
230 | ": Unknown interpolation type!"); | ||
231 | } | ||
232 | 16980 | }; | |
233 | |||
// Process every requested destination row, stepping dst_rows to the next row
// after each one.
234 |
16/32✓ Branch 0 taken 4 times.
✓ Branch 1 taken 84 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 42 times.
✗ Branch 6 not taken.
✗ Branch 7 not taken.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 84 times.
✗ Branch 10 not taken.
✗ Branch 11 not taken.
✓ Branch 12 taken 2 times.
✓ Branch 13 taken 42 times.
✗ Branch 14 not taken.
✗ Branch 15 not taken.
✓ Branch 16 taken 566 times.
✓ Branch 17 taken 4918 times.
✗ Branch 18 not taken.
✗ Branch 19 not taken.
✓ Branch 20 taken 222 times.
✓ Branch 21 taken 1020 times.
✗ Branch 22 not taken.
✗ Branch 23 not taken.
✓ Branch 24 taken 560 times.
✓ Branch 25 taken 8156 times.
✗ Branch 26 not taken.
✗ Branch 27 not taken.
✓ Branch 28 taken 218 times.
✓ Branch 29 taken 2634 times.
✗ Branch 30 not taken.
✗ Branch 31 not taken.
|
18558 | for (size_t y = y_begin; y < y_end; ++y) { |
235 | 16980 | process_row(y); | |
236 | 16980 | ++dst_rows; | |
237 | 16980 | } | |
238 | 1578 | } | |
239 | |||
// Validates the arguments and applies the perspective warp to the destination
// rows in [y_begin, y_end).
//
// Returns:
//   KLEIDICV_ERROR_NULL_POINTER - border_type is KLEIDICV_BORDER_TYPE_CONSTANT
//                                 and border_value is null.
//   KLEIDICV_ERROR_RANGE        - any image dimension is >= 2^24, src_stride is
//                                 >= 2^32, or the source image is empty.
//   KLEIDICV_OK                 - the rows have been processed.
// The CHECK_* macros (defined outside this listing) run first, in the order
// written below, and can return early with their own error codes.
//
// NOTE(review): y_begin and y_end are not validated against dst_height in this
// function - confirm that callers guarantee y_begin <= y_end <= dst_height.
240 | template <typename T> | ||
241 | 1604 | KLEIDICV_LOCALLY_STREAMING kleidicv_error_t warp_perspective_stripe( | |
242 | const T *src, size_t src_stride, size_t src_width, size_t src_height, | ||
243 | T *dst, size_t dst_stride, size_t dst_width, size_t dst_height, | ||
244 | size_t y_begin, size_t y_end, const float transform[9], size_t channels, | ||
245 | kleidicv_interpolation_type_t interpolation, | ||
246 | kleidicv_border_type_t border_type, const T *border_value) { | ||
247 |
4/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1602 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 1602 times.
|
1604 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
248 |
4/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1600 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 1600 times.
|
1602 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); |
249 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1598 times.
|
1600 | CHECK_POINTERS(transform); |
250 |
6/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1596 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 1594 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 1594 times.
|
1598 | CHECK_IMAGE_SIZE(src_width, src_height); |
251 |
6/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 1592 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 1590 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 1590 times.
|
1594 | CHECK_IMAGE_SIZE(dst_width, dst_height); |
// border_value is only required (and only read) for a constant border.
252 |
4/4✓ Branch 0 taken 446 times.
✓ Branch 1 taken 1144 times.
✓ Branch 2 taken 444 times.
✓ Branch 3 taken 2 times.
|
1590 | if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { |
253 | 2 | return KLEIDICV_ERROR_NULL_POINTER; | |
254 | } | ||
255 | |||
256 | // Calculating in float32_t is only precise up to 24 bits, and | ||
257 | // multiplication can only be done with 32x32 bits. | ||
258 | // An empty source image is not supported. | ||
259 |
4/4✓ Branch 0 taken 1586 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1584 times.
✓ Branch 3 taken 2 times.
|
1588 | if (src_width >= (1ULL << 24) || src_height >= (1ULL << 24) || |
260 |
4/4✓ Branch 0 taken 1582 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1580 times.
✓ Branch 3 taken 2 times.
|
1584 | dst_width >= (1ULL << 24) || dst_height >= (1ULL << 24) || |
261 |
4/6✓ Branch 0 taken 1578 times.
✓ Branch 1 taken 2 times.
✓ Branch 2 taken 1578 times.
✗ Branch 3 not taken.
✗ Branch 4 not taken.
✓ Branch 5 taken 1578 times.
|
1580 | src_stride >= (1ULL << 32) || src_width == 0 || src_height == 0) { |
262 | 10 | return KLEIDICV_ERROR_RANGE; | |
263 | } | ||
264 | |||
265 | 1578 | Rows<const T> src_rows{src, src_stride, channels}; | |
266 | 1578 | Rows<T> dst_rows{dst, dst_stride, channels}; | |
// NOTE(review): rect is not referenced again in this function - it looks
// unused; confirm and remove.
267 | 1578 | Rectangle rect{dst_width, dst_height}; | |
268 | |||
// Start writing at the first destination row of this stripe.
269 | 1578 | dst_rows += y_begin; | |
270 | |||
// This calls a transform_operation overload that takes the runtime options as
// leading arguments; it is not defined in this listing and presumably
// dispatches to the matching template instantiation - confirm in
// transform_sve2.h.
271 | 1578 | transform_operation<T>(is_image_large(src_rows, src_height), interpolation, | |
272 | border_type, channels, src_rows, src_width, src_height, | ||
273 | border_value, dst_rows, dst_width, y_begin, y_end, | ||
274 | transform); | ||
275 | |||
276 | 1578 | return KLEIDICV_OK; | |
277 | 1604 | } | |
278 | |||
// Explicitly instantiates warp_perspective_stripe for the element type `type`,
// with KLEIDICV_TARGET_FN_ATTRS applied to the instantiation.
// Only uint8_t is instantiated below.
279 | #define KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE(type) \ | ||
280 | template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t \ | ||
281 | warp_perspective_stripe<type>( \ | ||
282 | const type *src, size_t src_stride, size_t src_width, size_t src_height, \ | ||
283 | type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ | ||
284 | size_t y_begin, size_t y_end, const float transformation[9], \ | ||
285 | size_t channels, kleidicv_interpolation_type_t interpolation, \ | ||
286 | kleidicv_border_type_t border_type, const type *border_value) | ||
287 | |||
288 | KLEIDICV_INSTANTIATE_WARP_PERSPECTIVE(uint8_t); | ||
289 | |||
290 | } // namespace kleidicv::sve2 | ||
291 |