Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #include <cassert> | ||
6 | #include <type_traits> | ||
7 | |||
8 | #include "kleidicv/kleidicv.h" | ||
9 | #include "kleidicv/neon.h" | ||
10 | #include "kleidicv/transform/remap.h" | ||
11 | |||
12 | namespace kleidicv::neon { | ||
13 | |||
14 | template <typename ScalarType> | ||
15 | class RemapS16Replicate { | ||
16 | public: | ||
17 | using MapVecTraits = neon::VecTraits<int16_t>; | ||
18 | using MapVectorType = typename MapVecTraits::VectorType; | ||
19 | using MapVector2Type = typename MapVecTraits::Vector2Type; | ||
20 | |||
21 | 134 | RemapS16Replicate(Rows<const ScalarType> src_rows, size_t src_width, | |
22 | size_t src_height) | ||
23 | 134 | : src_rows_{src_rows}, | |
24 | 268 | v_src_element_stride_{vdupq_n_u16( | |
25 | 134 | static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))}, | |
26 | 134 | v_xmax_{vdupq_n_s16(static_cast<int16_t>(src_width - 1))}, | |
27 | 134 | v_ymax_{vdupq_n_s16(static_cast<int16_t>(src_height - 1))} {} | |
28 | |||
29 | 4962 | void transform_pixels(uint32x4_t indices_low, uint32x4_t indices_high, | |
30 | Columns<ScalarType> dst) { | ||
31 | // Copy pixels from source | ||
32 | 4962 | dst[0] = src_rows_[vgetq_lane_u32(indices_low, 0)]; | |
33 | 4962 | dst[1] = src_rows_[vgetq_lane_u32(indices_low, 1)]; | |
34 | 4962 | dst[2] = src_rows_[vgetq_lane_u32(indices_low, 2)]; | |
35 | 4962 | dst[3] = src_rows_[vgetq_lane_u32(indices_low, 3)]; | |
36 | |||
37 | 4962 | dst[4] = src_rows_[vgetq_lane_u32(indices_high, 0)]; | |
38 | 4962 | dst[5] = src_rows_[vgetq_lane_u32(indices_high, 1)]; | |
39 | 4962 | dst[6] = src_rows_[vgetq_lane_u32(indices_high, 2)]; | |
40 | 4962 | dst[7] = src_rows_[vgetq_lane_u32(indices_high, 3)]; | |
41 | 4962 | } | |
42 | |||
43 | 182 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
44 | Columns<ScalarType> dst) { | ||
45 | 5144 | auto vector_path = [&](size_t step) { | |
46 | 4962 | MapVector2Type xy = vld2q_s16(&mapxy[0]); | |
47 | // Clamp coordinates to within the dimensions of the source image | ||
48 | 9924 | uint16x8_t x = vreinterpretq_u16_s16( | |
49 | 4962 | vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[0], v_xmax_))); | |
50 | 9924 | uint16x8_t y = vreinterpretq_u16_s16( | |
51 | 4962 | vmaxq_s16(vdupq_n_s16(0), vminq_s16(xy.val[1], v_ymax_))); | |
52 | // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) | ||
53 | 9924 | uint32x4_t indices_low = | |
54 | 9924 | vmlal_u16(vmovl_u16(vget_low_u16(x)), vget_low_u16(y), | |
55 | 4962 | vget_low_u16(v_src_element_stride_)); | |
56 | 9924 | uint32x4_t indices_high = | |
57 | 4962 | vmlal_high_u16(vmovl_high_u16(x), y, v_src_element_stride_); | |
58 | |||
59 | 4962 | transform_pixels(indices_low, indices_high, dst); | |
60 | |||
61 | 4962 | mapxy += ptrdiff_t(step); | |
62 | 4962 | dst += ptrdiff_t(step); | |
63 | 4962 | }; | |
64 | |||
65 | 182 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
66 | 182 | loop.unroll_once(vector_path); | |
67 | 364 | ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) - | |
68 | 182 | static_cast<ptrdiff_t>(loop.remaining_length()); | |
69 | 182 | mapxy -= back_step; | |
70 | 182 | dst -= back_step; | |
71 | 280 | loop.remaining([&](size_t, size_t step) { vector_path(step); }); | |
72 | 182 | } | |
73 | |||
74 | private: | ||
75 | Rows<const ScalarType> src_rows_; | ||
76 | uint16x8_t v_src_element_stride_; | ||
77 | int16x8_t v_xmax_; | ||
78 | int16x8_t v_ymax_; | ||
79 | }; // end of class RemapS16Replicate<ScalarType> | ||
80 | |||
81 | template <typename ScalarType> | ||
82 | class RemapS16ConstantBorder { | ||
83 | public: | ||
84 | using SrcVecTraits = neon::VecTraits<ScalarType>; | ||
85 | using SrcVecType = typename SrcVecTraits::VectorType; | ||
86 | |||
87 | using MapVecTraits = neon::VecTraits<int16_t>; | ||
88 | using MapVectorType = typename MapVecTraits::VectorType; | ||
89 | using MapVector2Type = typename MapVecTraits::Vector2Type; | ||
90 | |||
91 | 132 | RemapS16ConstantBorder(Rows<const ScalarType> src_rows, size_t src_width, | |
92 | size_t src_height, const ScalarType *border_value) | ||
93 | 132 | : src_rows_{src_rows}, | |
94 | 264 | v_src_element_stride_{vdupq_n_u16( | |
95 | 132 | static_cast<uint16_t>(src_rows_.stride() / sizeof(ScalarType)))}, | |
96 | 132 | v_width_{vdupq_n_u16(static_cast<uint16_t>(src_width))}, | |
97 | 132 | v_height_{vdupq_n_u16(static_cast<uint16_t>(src_height))}, | |
98 | 132 | v_border_{vdupq_n_u16(*border_value)} {} | |
99 | |||
100 | void transform_pixels(uint32x4_t indices_low, uint32x4_t indices_high, | ||
101 | uint16x8_t in_range, Columns<ScalarType> dst); | ||
102 | |||
103 | 180 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
104 | Columns<ScalarType> dst) { | ||
105 | 5140 | auto vector_path = [&](size_t step) { | |
106 | 4960 | MapVector2Type xy = vld2q_s16(&mapxy[0]); | |
107 | |||
108 | 4960 | uint16x8_t x = vreinterpretq_u16_s16(xy.val[0]); | |
109 | 4960 | uint16x8_t y = vreinterpretq_u16_s16(xy.val[1]); | |
110 | |||
111 | // Find whether coordinates are within the image dimensions. | ||
112 | // Negative coordinates are interpreted as large values due to the | ||
113 | // s16->u16 reinterpretation. | ||
114 | 9920 | uint16x8_t in_range = | |
115 | 4960 | vandq_u16(vcltq_u16(x, v_width_), vcltq_u16(y, v_height_)); | |
116 | |||
117 | // Zero out-of-range coordinates. | ||
118 | 4960 | x = vandq_u16(in_range, x); | |
119 | 4960 | y = vandq_u16(in_range, y); | |
120 | |||
121 | // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) | ||
122 | 9920 | uint32x4_t indices_low = | |
123 | 9920 | vmlal_u16(vmovl_u16(vget_low_u16(x)), vget_low_u16(y), | |
124 | 4960 | vget_low_u16(v_src_element_stride_)); | |
125 | 9920 | uint32x4_t indices_high = | |
126 | 4960 | vmlal_high_u16(vmovl_high_u16(x), y, v_src_element_stride_); | |
127 | |||
128 | 4960 | transform_pixels(indices_low, indices_high, in_range, dst); | |
129 | |||
130 | 4960 | mapxy += ptrdiff_t(step); | |
131 | 4960 | dst += ptrdiff_t(step); | |
132 | 4960 | }; | |
133 | |||
134 | 180 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
135 | 180 | loop.unroll_once(vector_path); | |
136 | 360 | ptrdiff_t back_step = static_cast<ptrdiff_t>(loop.step()) - | |
137 | 180 | static_cast<ptrdiff_t>(loop.remaining_length()); | |
138 | 180 | mapxy -= back_step; | |
139 | 180 | dst -= back_step; | |
140 | 278 | loop.remaining([&](size_t, size_t step) { vector_path(step); }); | |
141 | 180 | } | |
142 | |||
143 | private: | ||
144 | Rows<const ScalarType> src_rows_; | ||
145 | uint16x8_t v_src_element_stride_; | ||
146 | uint16x8_t v_width_; | ||
147 | uint16x8_t v_height_; | ||
148 | uint16x8_t v_border_; | ||
149 | }; // end of class RemapS16ConstantBorder<ScalarType> | ||
150 | |||
151 | template <> | ||
152 | 2528 | void RemapS16ConstantBorder<uint8_t>::transform_pixels(uint32x4_t indices_low, | |
153 | uint32x4_t indices_high, | ||
154 | uint16x8_t in_range, | ||
155 | Columns<uint8_t> dst) { | ||
156 | 22752 | uint8x8_t pixels = { | |
157 | 2528 | src_rows_[vgetq_lane_u32(indices_low, 0)], | |
158 | 2528 | src_rows_[vgetq_lane_u32(indices_low, 1)], | |
159 | 2528 | src_rows_[vgetq_lane_u32(indices_low, 2)], | |
160 | 2528 | src_rows_[vgetq_lane_u32(indices_low, 3)], | |
161 | 2528 | src_rows_[vgetq_lane_u32(indices_high, 0)], | |
162 | 2528 | src_rows_[vgetq_lane_u32(indices_high, 1)], | |
163 | 2528 | src_rows_[vgetq_lane_u32(indices_high, 2)], | |
164 | 2528 | src_rows_[vgetq_lane_u32(indices_high, 3)], | |
165 | }; | ||
166 | // Select between source pixels and border colour | ||
167 | 5056 | uint8x8_t pixels_or_border = | |
168 | 2528 | vbsl_u8(vmovn_u16(in_range), pixels, vmovn_u16(v_border_)); | |
169 | |||
170 | 2528 | vst1_u8(&dst[0], pixels_or_border); | |
171 | 2528 | } | |
172 | |||
173 | template <> | ||
174 | 2432 | void RemapS16ConstantBorder<uint16_t>::transform_pixels(uint32x4_t indices_low, | |
175 | uint32x4_t indices_high, | ||
176 | uint16x8_t in_range, | ||
177 | Columns<uint16_t> dst) { | ||
178 | 21888 | uint16x8_t pixels = { | |
179 | 2432 | src_rows_[vgetq_lane_u32(indices_low, 0)], | |
180 | 2432 | src_rows_[vgetq_lane_u32(indices_low, 1)], | |
181 | 2432 | src_rows_[vgetq_lane_u32(indices_low, 2)], | |
182 | 2432 | src_rows_[vgetq_lane_u32(indices_low, 3)], | |
183 | 2432 | src_rows_[vgetq_lane_u32(indices_high, 0)], | |
184 | 2432 | src_rows_[vgetq_lane_u32(indices_high, 1)], | |
185 | 2432 | src_rows_[vgetq_lane_u32(indices_high, 2)], | |
186 | 2432 | src_rows_[vgetq_lane_u32(indices_high, 3)], | |
187 | }; | ||
188 | |||
189 | // Select between source pixels and border colour | ||
190 | 2432 | uint16x8_t pixels_or_border = vbslq_u16(in_range, pixels, v_border_); | |
191 | |||
192 | 2432 | vst1q_u16(&dst[0], pixels_or_border); | |
193 | 2432 | } | |
194 | |||
195 | // Most of the complexity comes from parameter checking. | ||
196 | // NOLINTBEGIN(readability-function-cognitive-complexity) | ||
197 | template <typename T> | ||
198 | 300 | kleidicv_error_t remap_s16(const T *src, size_t src_stride, size_t src_width, | |
199 | size_t src_height, T *dst, size_t dst_stride, | ||
200 | size_t dst_width, size_t dst_height, size_t channels, | ||
201 | const int16_t *mapxy, size_t mapxy_stride, | ||
202 | kleidicv_border_type_t border_type, | ||
203 | [[maybe_unused]] const T *border_value) { | ||
204 |
8/8✓ Branch 0 taken 1 times.
✓ Branch 1 taken 149 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 149 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 149 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 149 times.
|
300 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
205 |
8/8✓ Branch 0 taken 1 times.
✓ Branch 1 taken 148 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 148 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 148 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 148 times.
|
298 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); |
206 |
8/8✓ Branch 0 taken 1 times.
✓ Branch 1 taken 147 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 147 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 147 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 147 times.
|
296 | CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height); |
207 |
12/12✓ Branch 0 taken 1 times.
✓ Branch 1 taken 146 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 144 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 144 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 146 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 144 times.
✓ Branch 10 taken 3 times.
✓ Branch 11 taken 144 times.
|
294 | CHECK_IMAGE_SIZE(src_width, src_height); |
208 |
12/12✓ Branch 0 taken 1 times.
✓ Branch 1 taken 143 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 142 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 142 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 143 times.
✓ Branch 8 taken 1 times.
✓ Branch 9 taken 142 times.
✓ Branch 10 taken 2 times.
✓ Branch 11 taken 142 times.
|
288 | CHECK_IMAGE_SIZE(dst_width, dst_height); |
209 |
8/8✓ Branch 0 taken 67 times.
✓ Branch 1 taken 75 times.
✓ Branch 2 taken 66 times.
✓ Branch 3 taken 1 times.
✓ Branch 4 taken 67 times.
✓ Branch 5 taken 75 times.
✓ Branch 6 taken 66 times.
✓ Branch 7 taken 1 times.
|
284 | if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { |
210 | 2 | return KLEIDICV_ERROR_NULL_POINTER; | |
211 | } | ||
212 | |||
213 |
8/8✓ Branch 0 taken 133 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 133 times.
✓ Branch 3 taken 8 times.
✓ Branch 4 taken 133 times.
✓ Branch 5 taken 8 times.
✓ Branch 6 taken 133 times.
✓ Branch 7 taken 8 times.
|
564 | if (!remap_s16_is_implemented<T>(src_stride, src_width, src_height, dst_width, |
214 | 282 | border_type, channels)) { | |
215 | 16 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
216 | } | ||
217 | |||
218 | 266 | Rows<const T> src_rows{src, src_stride, channels}; | |
219 | 266 | Rows<const int16_t> mapxy_rows{mapxy, mapxy_stride, 2}; | |
220 | 266 | Rows<T> dst_rows{dst, dst_stride, channels}; | |
221 | 266 | Rectangle rect{dst_width, dst_height}; | |
222 |
4/4✓ Branch 0 taken 67 times.
✓ Branch 1 taken 66 times.
✓ Branch 2 taken 67 times.
✓ Branch 3 taken 66 times.
|
266 | if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { |
223 | 264 | RemapS16ConstantBorder<T> operation{src_rows, src_width, src_height, | |
224 | 132 | border_value}; | |
225 | 132 | zip_rows(operation, rect, mapxy_rows, dst_rows); | |
226 | 132 | } else { | |
227 | assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); | ||
228 | 134 | RemapS16Replicate<T> operation{src_rows, src_width, src_height}; | |
229 | 134 | zip_rows(operation, rect, mapxy_rows, dst_rows); | |
230 | 134 | } | |
231 | 266 | return KLEIDICV_OK; | |
232 | 300 | } | |
233 | // NOLINTEND(readability-function-cognitive-complexity) | ||
234 | |||
235 | #define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(type) \ | ||
236 | template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16<type>( \ | ||
237 | const type *src, size_t src_stride, size_t src_width, size_t src_height, \ | ||
238 | type *dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ | ||
239 | size_t channels, const int16_t *mapxy, size_t mapxy_stride, \ | ||
240 | kleidicv_border_type_t border_type, const type *border_value) | ||
241 | |||
242 | KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(uint8_t); | ||
243 | KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16(uint16_t); | ||
244 | |||
245 | } // namespace kleidicv::neon | ||
246 |