KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/transform/remap_s16_sve2.cpp
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 146 146 100.0%
Functions: 24 24 100.0%
Branches: 72 72 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cassert>
6 #include <cmath>
7 #include <cstddef>
8 #include <cstdint>
9
10 #include "kleidicv/sve2.h"
11 #include "kleidicv/transform/remap.h"
12 #include "transform_sve2.h"
13
14 namespace kleidicv::sve2 {
15
16 template <typename ScalarType>
17 class RemapS16Replicate {
18 public:
19 using MapVecTraits = VecTraits<int16_t>;
20 using MapVectorType = typename MapVecTraits::VectorType;
21 using MapVector2Type = typename MapVecTraits::Vector2Type;
22
23 268 RemapS16Replicate(Rows<const ScalarType> src_rows, size_t src_width,
24 size_t src_height, svuint16_t& v_src_element_stride,
25 MapVectorType& v_x_max, MapVectorType& v_y_max)
26 268 : src_rows_{src_rows},
27 268 v_src_element_stride_{v_src_element_stride},
28 268 v_xmax_{v_x_max},
29 268 v_ymax_{v_y_max} {
30 268 v_src_element_stride_ = svdup_u16(src_rows.stride() / sizeof(ScalarType));
31 268 v_xmax_ = svdup_s16(static_cast<int16_t>(src_width - 1));
32 268 v_ymax_ = svdup_s16(static_cast<int16_t>(src_height - 1));
33 268 }
34
35 void transform_pixels(svbool_t pg, svuint32_t offsets_b, svbool_t pg_b,
36 svuint32_t offsets_t, svbool_t pg_t,
37 Columns<ScalarType> dst);
38
39 364 void process_row(size_t width, Columns<const int16_t> mapxy,
40 Columns<ScalarType> dst) {
41 364 svuint32_t offsets_b, offsets_t;
42 364 svint16_t svzero = svdup_n_s16(0);
43 11152 auto load_offsets = [&](svbool_t pg) {
44 10788 MapVector2Type xy = svld2_s16(pg, &mapxy[0]);
45 // Clamp coordinates to within the dimensions of the source image
46 21576 svuint16_t x = svreinterpret_u16_s16(
47 10788 svmax_x(pg, svzero, svmin_x(pg, svget2(xy, 0), v_xmax_)));
48 21576 svuint16_t y = svreinterpret_u16_s16(
49 10788 svmax_x(pg, svzero, svmin_x(pg, svget2(xy, 1), v_ymax_)));
50 // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x)
51 10788 offsets_b = svmlalb_u32(svmovlb_u32(x), y, v_src_element_stride_);
52 10788 offsets_t = svmlalt_u32(svmovlt_u32(x), y, v_src_element_stride_);
53 10788 };
54
55 364 svbool_t pg_all16 = MapVecTraits::svptrue();
56 364 svbool_t pg_all32 = svptrue_b32();
57
58 560 auto gather_load_generic_vector_path = [&](svbool_t pg, ptrdiff_t step) {
59 196 load_offsets(pg);
60 196 svbool_t pg_b = svwhilelt_b32(int64_t{0}, (step + 1) / 2);
61 196 svbool_t pg_t = svwhilelt_b32(int64_t{0}, step / 2);
62 196 transform_pixels(pg, offsets_b, pg_b, offsets_t, pg_t, dst);
63 196 mapxy += step;
64 196 dst += step;
65 196 };
66
67 // NOTE: gather load is not available in streaming mode
68 10956 auto gather_load_full_vector_path = [&](ptrdiff_t step) {
69 10592 load_offsets(pg_all16);
70 10592 transform_pixels(pg_all16, offsets_b, pg_all32, offsets_t, pg_all32, dst);
71 10592 mapxy += step;
72 10592 dst += step;
73 10592 };
74
75 364 LoopUnroll loop{width, MapVecTraits::num_lanes()};
76 10956 loop.unroll_once([&](size_t step) {
77 10592 gather_load_full_vector_path(static_cast<ptrdiff_t>(step));
78 10592 });
79 560 loop.remaining([&](size_t length, size_t step) {
80 196 svbool_t pg = MapVecTraits::svwhilelt(step - length, step);
81 196 gather_load_generic_vector_path(pg, static_cast<ptrdiff_t>(length));
82 196 });
83 364 }
84
85 private:
86 Rows<const ScalarType> src_rows_;
87 svuint16_t& v_src_element_stride_;
88 MapVectorType& v_xmax_;
89 MapVectorType& v_ymax_;
90 }; // end of class RemapS16Replicate<ScalarType>
91
92 template <>
93 5634 void RemapS16Replicate<uint8_t>::transform_pixels(
94 svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t,
95 svbool_t pg_t, Columns<uint8_t> dst) {
96 // Copy pixels from source
97 11268 svuint32_t result_b =
98 5634 svld1ub_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b);
99 11268 svuint32_t result_t =
100 5634 svld1ub_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t);
101 11268 svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b),
102 5634 svreinterpret_u16_u32(result_t));
103
104 5634 svst1b_u16(pg, &dst[0], result);
105 5634 }
106
107 template <>
108 5154 void RemapS16Replicate<uint16_t>::transform_pixels(
109 svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t,
110 svbool_t pg_t, Columns<uint16_t> dst) {
111 // Account for the size of the source type when calculating offset
112 5154 offsets_b = svlsl_n_u32_x(pg, offsets_b, 1);
113 5154 offsets_t = svlsl_n_u32_x(pg, offsets_t, 1);
114
115 // Copy pixels from source
116 10308 svuint32_t result_b =
117 5154 svld1uh_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b);
118 10308 svuint32_t result_t =
119 5154 svld1uh_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t);
120 10308 svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b),
121 5154 svreinterpret_u16_u32(result_t));
122
123 5154 svst1_u16(pg, &dst[0], result);
124 5154 }
125
126 template <typename ScalarType>
127 class RemapS16ConstantBorder {
128 public:
129 264 RemapS16ConstantBorder(Rows<const ScalarType> src_rows, size_t src_width,
130 size_t src_height, const ScalarType* border_value,
131 svuint16_t& v_src_element_stride, svuint16_t& v_width,
132 svuint16_t& v_height, svuint16_t& v_border)
133 264 : src_rows_{src_rows},
134 264 v_src_element_stride_{v_src_element_stride},
135 264 v_width_{v_width},
136 264 v_height_{v_height},
137 264 v_border_{v_border} {
138 264 v_src_element_stride_ = svdup_u16(src_rows.stride() / sizeof(ScalarType));
139 264 v_width_ = svdup_u16(static_cast<uint16_t>(src_width));
140 264 v_height_ = svdup_u16(static_cast<uint16_t>(src_height));
141 264 v_border_ = svdup_u16(*border_value);
142 264 }
143
144 void transform_pixels(svbool_t pg, svuint32_t offsets_b, svbool_t pg_b,
145 svuint32_t offsets_t, svbool_t pg_t, ScalarType* dst);
146
147 360 void process_row(size_t width, Columns<const int16_t> mapxy,
148 Columns<ScalarType> dst) {
149
4/4
✓ Branch 0 taken 180 times.
✓ Branch 1 taken 5632 times.
✓ Branch 2 taken 180 times.
✓ Branch 3 taken 5152 times.
11144 for (size_t i = 0; i < width; i += svcnth()) {
150 10784 svbool_t pg = svwhilelt_b16_u64(i, width);
151
152 10784 svint16x2_t xy = svld2_s16(pg, &mapxy[static_cast<ptrdiff_t>(i * 2)]);
153 10784 svuint16_t x = svreinterpret_u16_s16(svget2(xy, 0));
154 10784 svuint16_t y = svreinterpret_u16_s16(svget2(xy, 1));
155
156 // Find whether coordinates are within the image dimensions.
157 21568 svbool_t in_range = svand_b_z(pg, svcmplt_u16(pg, x, v_width_),
158 10784 svcmplt_u16(pg, y, v_height_));
159 10784 svbool_t pg_b = in_range;
160 10784 svbool_t pg_t = svtrn2_b16(in_range, svpfalse());
161
162 // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x)
163 21568 svuint32_t offsets_b =
164 10784 svmlalb_u32(svmovlb_u32(x), y, v_src_element_stride_);
165 21568 svuint32_t offsets_t =
166 10784 svmlalt_u32(svmovlt_u32(x), y, v_src_element_stride_);
167
168 21568 transform_pixels(pg, offsets_b, pg_b, offsets_t, pg_t,
169 10784 &dst[static_cast<ptrdiff_t>(i)]);
170 10784 }
171 360 }
172
173 private:
174 Rows<const ScalarType> src_rows_;
175 svuint16_t& v_src_element_stride_;
176 svuint16_t& v_width_;
177 svuint16_t& v_height_;
178 svuint16_t& v_border_;
179 }; // end of class RemapS16ConstantBorder<ScalarType>
180
181 template <>
182 5632 void RemapS16ConstantBorder<uint8_t>::transform_pixels(
183 svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t,
184 svbool_t pg_t, uint8_t* dst) {
185 // Copy pixels from source
186 11264 svuint32_t result_b =
187 5632 svld1ub_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b);
188 11264 svuint32_t result_t =
189 5632 svld1ub_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t);
190
191 11264 svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b),
192 5632 svreinterpret_u16_u32(result_t));
193
194 5632 svuint16_t result_selected = svsel(pg_b, result, v_border_);
195 5632 svst1b_u16(pg, dst, result_selected);
196 5632 }
197
198 template <>
199 5152 void RemapS16ConstantBorder<uint16_t>::transform_pixels(
200 svbool_t pg, svuint32_t offsets_b, svbool_t pg_b, svuint32_t offsets_t,
201 svbool_t pg_t, uint16_t* dst) {
202 // Account for the size of the source type when calculating offset
203 5152 offsets_b = svlsl_n_u32_x(pg, offsets_b, 1);
204 5152 offsets_t = svlsl_n_u32_x(pg, offsets_t, 1);
205
206 // Copy pixels from source
207 10304 svuint32_t result_b =
208 5152 svld1uh_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b);
209 10304 svuint32_t result_t =
210 5152 svld1uh_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t);
211
212 10304 svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b),
213 5152 svreinterpret_u16_u32(result_t));
214
215 5152 svuint16_t result_selected = svsel(pg_b, result, v_border_);
216 5152 svst1_u16(pg, dst, result_selected);
217 5152 }
218
219 // Most of the complexity comes from parameter checking.
220 // NOLINTBEGIN(readability-function-cognitive-complexity)
221 template <typename T>
222 600 kleidicv_error_t remap_s16(const T* src, size_t src_stride, size_t src_width,
223 size_t src_height, T* dst, size_t dst_stride,
224 size_t dst_width, size_t dst_height, size_t channels,
225 const int16_t* mapxy, size_t mapxy_stride,
226 kleidicv_border_type_t border_type,
227 const T* border_value) {
228
8/8
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 298 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 298 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 298 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 298 times.
600 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
229
8/8
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 296 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 296 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 296 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 296 times.
596 CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
230
8/8
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 294 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 294 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 294 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 294 times.
592 CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height);
231
12/12
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 292 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 288 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 288 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 292 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 288 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 288 times.
588 CHECK_IMAGE_SIZE(src_width, src_height);
232
12/12
✓ Branch 0 taken 2 times.
✓ Branch 1 taken 286 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 284 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 284 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 286 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 284 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 284 times.
576 CHECK_IMAGE_SIZE(dst_width, dst_height);
233
8/8
✓ Branch 0 taken 134 times.
✓ Branch 1 taken 150 times.
✓ Branch 2 taken 132 times.
✓ Branch 3 taken 2 times.
✓ Branch 4 taken 134 times.
✓ Branch 5 taken 150 times.
✓ Branch 6 taken 132 times.
✓ Branch 7 taken 2 times.
568 if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) {
234 4 return KLEIDICV_ERROR_NULL_POINTER;
235 }
236
237
8/8
✓ Branch 0 taken 266 times.
✓ Branch 1 taken 16 times.
✓ Branch 2 taken 266 times.
✓ Branch 3 taken 16 times.
✓ Branch 4 taken 266 times.
✓ Branch 5 taken 16 times.
✓ Branch 6 taken 266 times.
✓ Branch 7 taken 16 times.
1128 if (!remap_s16_is_implemented<T>(src_stride, src_width, src_height, dst_width,
238 564 border_type, channels)) {
239 32 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
240 }
241
242 532 Rows<const T> src_rows{src, src_stride, channels};
243 532 Rows<const int16_t> mapxy_rows{mapxy, mapxy_stride, 2};
244 532 Rows<T> dst_rows{dst, dst_stride, channels};
245 532 svuint16_t sv_src_element_stride;
246 532 Rectangle rect{dst_width, dst_height};
247
4/4
✓ Branch 0 taken 134 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 134 times.
✓ Branch 3 taken 132 times.
532 if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) {
248 264 svuint16_t sv_width, sv_height, sv_border;
249 528 RemapS16ConstantBorder<T> operation{
250 264 src_rows, src_width, src_height, border_value, sv_src_element_stride,
251 sv_width, sv_height, sv_border};
252 264 zip_rows(operation, rect, mapxy_rows, dst_rows);
253 264 } else {
254 assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE);
255 268 svint16_t sv_xmax, sv_ymax;
256 536 RemapS16Replicate<T> operation{src_rows, src_width,
257 268 src_height, sv_src_element_stride,
258 sv_xmax, sv_ymax};
259 268 zip_rows(operation, rect, mapxy_rows, dst_rows);
260 268 }
261 532 return KLEIDICV_OK;
262 600 }
263 // NOLINTEND(readability-function-cognitive-complexity)
264
265 #define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(type) \
266 template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16<type>( \
267 const type* src, size_t src_stride, size_t src_width, size_t src_height, \
268 type* dst, size_t dst_stride, size_t dst_width, size_t dst_height, \
269 size_t channels, const int16_t* mapxy, size_t mapxy_stride, \
270 kleidicv_border_type_t border_type, const type* border_value)
271
272 KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(uint8_t);
273 KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(uint16_t);
274
275 } // namespace kleidicv::sve2
276