Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #include "kleidicv/ctypes.h" | ||
6 | #include "kleidicv/filters/blur_and_downsample.h" | ||
7 | #include "kleidicv/kleidicv.h" | ||
8 | #include "kleidicv/sve2.h" | ||
9 | #include "kleidicv/utils.h" | ||
10 | #include "kleidicv/workspace/blur_and_downsample_ws.h" | ||
11 | #include "kleidicv/workspace/border_5x5.h" | ||
12 | |||
13 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
14 | |||
15 | // Applies Gaussian Blur binomial filter to even rows and columns | ||
16 | // | ||
17 | // [ 1, 4, 6, 4, 1 ] [ 1 ] | ||
18 | // [ 4, 16, 24, 16, 4 ] [ 4 ] | ||
19 | // F = 1/256 * [ 6, 24, 36, 24, 6 ] = 1/256 * [ 6 ] * [ 1, 4, 6, 4, 1 ] | ||
20 | // [ 4, 16, 24, 16, 4 ] [ 4 ] | ||
21 | // [ 1, 4, 6, 4, 1 ] [ 1 ] | ||
22 | class BlurAndDownsample { | ||
23 | public: | ||
24 | using SourceType = uint8_t; | ||
25 | using BufferType = uint16_t; | ||
26 | using DestinationType = uint8_t; | ||
27 | using SourceVecTraits = | ||
28 | typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits<SourceType>; | ||
29 | using SourceVectorType = typename SourceVecTraits::VectorType; | ||
30 | using SourceVector2Type = typename SourceVecTraits::Vector2Type; | ||
31 | using BufferVecTraits = | ||
32 | typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits<BufferType>; | ||
33 | using BufferVectorType = typename BufferVecTraits::VectorType; | ||
34 | using BorderInfoType = | ||
35 | typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5<SourceType>; | ||
36 | using BorderType = FixedBorderType; | ||
37 | using BorderOffsets = typename BorderInfoType::Offsets; | ||
38 | |||
39 | static constexpr size_t margin = 2UL; | ||
40 | |||
41 | 918 | void process_vertical(size_t width, Rows<const SourceType> src_rows, | |
42 | Rows<BufferType> dst_rows, | ||
43 | BorderOffsets border_offsets) const KLEIDICV_STREAMING { | ||
44 | 918 | LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; | |
45 | |||
46 | 1040 | loop.unroll_twice([&](ptrdiff_t index) KLEIDICV_STREAMING { | |
47 | 122 | svbool_t pg_all = SourceVecTraits::svptrue(); | |
48 | 244 | vertical_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, | |
49 | 122 | index); | |
50 | 122 | }); | |
51 | |||
52 | 934 | loop.unroll_once([&](ptrdiff_t index) KLEIDICV_STREAMING { | |
53 | 16 | svbool_t pg_all = SourceVecTraits::svptrue(); | |
54 | 32 | vertical_vector_path_1x(pg_all, src_rows, dst_rows, border_offsets, | |
55 | 16 | index); | |
56 | 16 | }); | |
57 | |||
58 | 1820 | loop.remaining([&](ptrdiff_t index, ptrdiff_t length) KLEIDICV_STREAMING { | |
59 | 902 | svbool_t pg = SourceVecTraits::svwhilelt(index, length); | |
60 | 902 | vertical_vector_path_1x(pg, src_rows, dst_rows, border_offsets, index); | |
61 | 902 | }); | |
62 | 918 | } | |
63 | |||
64 | 918 | void process_horizontal(size_t width, Rows<const BufferType> src_rows, | |
65 | Rows<DestinationType> dst_rows, | ||
66 | BorderOffsets border_offsets) const | ||
67 | KLEIDICV_STREAMING { | ||
68 | 918 | svbool_t pg_all = BufferVecTraits::svptrue(); | |
69 | 918 | LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; | |
70 | |||
71 | 1098 | loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING { | |
72 | 360 | horizontal_vector_path_2x(pg_all, pg_all, src_rows, pg_all, dst_rows, | |
73 | 180 | border_offsets, static_cast<ptrdiff_t>(index)); | |
74 | 180 | }); | |
75 | |||
76 | 1344 | loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { | |
77 | 426 | svbool_t pg_src_0 = BufferVecTraits::svwhilelt(index, length); | |
78 | 852 | svbool_t pg_src_1 = BufferVecTraits::svwhilelt( | |
79 | 426 | index + BufferVecTraits::num_lanes(), length); | |
80 | 852 | svbool_t pg_dst = | |
81 | 426 | BufferVecTraits::svwhilelt((index + 1) / 2, (length + 1) / 2); | |
82 | 852 | horizontal_vector_path_2x(pg_src_0, pg_src_1, src_rows, pg_dst, dst_rows, | |
83 | 426 | border_offsets, static_cast<ptrdiff_t>(index)); | |
84 | 426 | }); | |
85 | 918 | } | |
86 | |||
87 | 1836 | void process_horizontal_borders( | |
88 | Rows<const BufferType> src_rows, Rows<DestinationType> dst_rows, | ||
89 | BorderOffsets border_offsets) const KLEIDICV_STREAMING { | ||
90 |
2/2✓ Branch 0 taken 1836 times.
✓ Branch 1 taken 1836 times.
|
3672 | for (ptrdiff_t index = 0; |
91 | 3672 | index < static_cast<ptrdiff_t>(src_rows.channels()); ++index) { | |
92 | 1836 | disable_loop_vectorization(); | |
93 | 1836 | svbool_t pg = svptrue_pat_b8(SV_VL1); | |
94 | 1836 | horizontal_border_path(pg, src_rows, dst_rows, border_offsets, index); | |
95 | 1836 | } | |
96 | 1836 | } | |
97 | |||
98 | private: | ||
99 | 122 | void vertical_vector_path_2x(svbool_t pg, Rows<const SourceType> src_rows, | |
100 | Rows<BufferType> dst_rows, | ||
101 | BorderOffsets border_offsets, | ||
102 | ptrdiff_t index) const KLEIDICV_STREAMING { | ||
103 | 122 | const auto *src_row_0 = &src_rows.at(border_offsets.c0())[index]; | |
104 | 122 | const auto *src_row_1 = &src_rows.at(border_offsets.c1())[index]; | |
105 | 122 | const auto *src_row_2 = &src_rows.at(border_offsets.c2())[index]; | |
106 | 122 | const auto *src_row_3 = &src_rows.at(border_offsets.c3())[index]; | |
107 | 122 | const auto *src_row_4 = &src_rows.at(border_offsets.c4())[index]; | |
108 | |||
109 | 122 | SourceVector2Type src_0; | |
110 | 122 | SourceVector2Type src_1; | |
111 | 122 | SourceVector2Type src_2; | |
112 | 122 | SourceVector2Type src_3; | |
113 | 122 | SourceVector2Type src_4; | |
114 | |||
115 | 122 | src_0 = | |
116 | 122 | svcreate2(svld1(pg, &src_row_0[0]), svld1_vnum(pg, &src_row_0[0], 1)); | |
117 | 122 | src_1 = | |
118 | 122 | svcreate2(svld1(pg, &src_row_1[0]), svld1_vnum(pg, &src_row_1[0], 1)); | |
119 | 122 | src_2 = | |
120 | 122 | svcreate2(svld1(pg, &src_row_2[0]), svld1_vnum(pg, &src_row_2[0], 1)); | |
121 | 122 | src_3 = | |
122 | 122 | svcreate2(svld1(pg, &src_row_3[0]), svld1_vnum(pg, &src_row_3[0], 1)); | |
123 | 122 | src_4 = | |
124 | 122 | svcreate2(svld1(pg, &src_row_4[0]), svld1_vnum(pg, &src_row_4[0], 1)); | |
125 | |||
126 | 244 | vertical_vector_path(pg, svget2(src_0, 0), svget2(src_1, 0), | |
127 | 122 | svget2(src_2, 0), svget2(src_3, 0), svget2(src_4, 0), | |
128 | 122 | &dst_rows[index]); | |
129 | 244 | vertical_vector_path(pg, svget2(src_0, 1), svget2(src_1, 1), | |
130 | 122 | svget2(src_2, 1), svget2(src_3, 1), svget2(src_4, 1), | |
131 | 244 | &dst_rows[index + static_cast<ptrdiff_t>( | |
132 | 122 | SourceVecTraits::num_lanes())]); | |
133 | 122 | } | |
134 | |||
135 | 918 | void vertical_vector_path_1x(svbool_t pg, Rows<const SourceType> src_rows, | |
136 | Rows<BufferType> dst_rows, | ||
137 | BorderOffsets border_offsets, | ||
138 | ptrdiff_t index) const KLEIDICV_STREAMING { | ||
139 | 1836 | SourceVectorType src_0 = | |
140 | 918 | svld1(pg, &src_rows.at(border_offsets.c0())[index]); | |
141 | 1836 | SourceVectorType src_1 = | |
142 | 918 | svld1(pg, &src_rows.at(border_offsets.c1())[index]); | |
143 | 1836 | SourceVectorType src_2 = | |
144 | 918 | svld1(pg, &src_rows.at(border_offsets.c2())[index]); | |
145 | 1836 | SourceVectorType src_3 = | |
146 | 918 | svld1(pg, &src_rows.at(border_offsets.c3())[index]); | |
147 | 1836 | SourceVectorType src_4 = | |
148 | 918 | svld1(pg, &src_rows.at(border_offsets.c4())[index]); | |
149 | 1836 | vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, | |
150 | 918 | &dst_rows[index]); | |
151 | 918 | } | |
152 | |||
153 | // Applies vertical filtering vector using SIMD operations. | ||
154 | // | ||
155 | // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T | ||
156 | 1162 | void vertical_vector_path(svbool_t pg, svuint8_t src_0, svuint8_t src_1, | |
157 | svuint8_t src_2, svuint8_t src_3, svuint8_t src_4, | ||
158 | BufferType *dst) const KLEIDICV_STREAMING { | ||
159 | 1162 | svuint16_t acc_0_4_b = svaddlb_u16(src_0, src_4); | |
160 | 1162 | svuint16_t acc_0_4_t = svaddlt_u16(src_0, src_4); | |
161 | 1162 | svuint16_t acc_1_3_b = svaddlb_u16(src_1, src_3); | |
162 | 1162 | svuint16_t acc_1_3_t = svaddlt_u16(src_1, src_3); | |
163 | |||
164 | 1162 | svuint16_t acc_u16_b = svmlalb_n_u16(acc_0_4_b, src_2, 6); | |
165 | 1162 | svuint16_t acc_u16_t = svmlalt_n_u16(acc_0_4_t, src_2, 6); | |
166 | 1162 | acc_u16_b = svmla_n_u16_x(pg, acc_u16_b, acc_1_3_b, 4); | |
167 | 1162 | acc_u16_t = svmla_n_u16_x(pg, acc_u16_t, acc_1_3_t, 4); | |
168 | |||
169 | 1162 | svuint16x2_t interleaved = svcreate2(acc_u16_b, acc_u16_t); | |
170 | 1162 | svst2(pg, &dst[0], interleaved); | |
171 | 1162 | } | |
172 | |||
173 | 606 | void horizontal_vector_path_2x(svbool_t pg_src_0, svbool_t pg_src_1, | |
174 | Rows<const BufferType> src_rows, | ||
175 | svbool_t pg_dst, | ||
176 | Rows<DestinationType> dst_rows, | ||
177 | BorderOffsets border_offsets, | ||
178 | ptrdiff_t index) const KLEIDICV_STREAMING { | ||
179 | 606 | const auto *src_0 = &src_rows.at(0, border_offsets.c0())[index]; | |
180 | 606 | const auto *src_1 = &src_rows.at(0, border_offsets.c1())[index]; | |
181 | 606 | const auto *src_2 = &src_rows.at(0, border_offsets.c2())[index]; | |
182 | 606 | const auto *src_3 = &src_rows.at(0, border_offsets.c3())[index]; | |
183 | 606 | const auto *src_4 = &src_rows.at(0, border_offsets.c4())[index]; | |
184 | |||
185 | 606 | BufferVectorType src_0_0 = svld1(pg_src_0, &src_0[0]); | |
186 | 606 | BufferVectorType src_1_0 = svld1_vnum(pg_src_1, &src_0[0], 1); | |
187 | 606 | BufferVectorType src_0_1 = svld1(pg_src_0, &src_1[0]); | |
188 | 606 | BufferVectorType src_1_1 = svld1_vnum(pg_src_1, &src_1[0], 1); | |
189 | 606 | BufferVectorType src_0_2 = svld1(pg_src_0, &src_2[0]); | |
190 | 606 | BufferVectorType src_1_2 = svld1_vnum(pg_src_1, &src_2[0], 1); | |
191 | 606 | BufferVectorType src_0_3 = svld1(pg_src_0, &src_3[0]); | |
192 | 606 | BufferVectorType src_1_3 = svld1_vnum(pg_src_1, &src_3[0], 1); | |
193 | 606 | BufferVectorType src_0_4 = svld1(pg_src_0, &src_4[0]); | |
194 | 606 | BufferVectorType src_1_4 = svld1_vnum(pg_src_1, &src_4[0], 1); | |
195 | |||
196 | 1212 | svuint16_t res_0 = horizontal_vector_path(pg_src_0, src_0_0, src_0_1, | |
197 | 606 | src_0_2, src_0_3, src_0_4); | |
198 | 1212 | svuint16_t res_1 = horizontal_vector_path(pg_src_1, src_1_0, src_1_1, | |
199 | 606 | src_1_2, src_1_3, src_1_4); | |
200 | |||
201 | 606 | svuint16_t res_even_only = svuzp1(res_0, res_1); | |
202 | 606 | svst1b(pg_dst, &dst_rows[index / 2], res_even_only); | |
203 | 606 | } | |
204 | |||
205 | // Applies horizontal filtering vector using SIMD operations. | ||
206 | // | ||
207 | // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T | ||
208 | 1212 | svuint16_t horizontal_vector_path(svbool_t pg, svuint16_t src_0, | |
209 | svuint16_t src_1, svuint16_t src_2, | ||
210 | svuint16_t src_3, | ||
211 | svuint16_t src_4) const KLEIDICV_STREAMING { | ||
212 | 1212 | svuint16_t acc_0_4 = svadd_x(pg, src_0, src_4); | |
213 | 1212 | svuint16_t acc_1_3 = svadd_x(pg, src_1, src_3); | |
214 | 1212 | svuint16_t acc = svmla_n_u16_x(pg, acc_0_4, src_2, 6); | |
215 | 1212 | acc = svmla_n_u16_x(pg, acc, acc_1_3, 4); | |
216 | 1212 | acc = svrshr_x(pg, acc, 8); | |
217 | 2424 | return acc; | |
218 | 1212 | } | |
219 | |||
220 | // Applies horizontal filtering for the borders using SIMD operations. | ||
221 | // | ||
222 | // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T | ||
223 | 1836 | void horizontal_border_path(svbool_t pg, Rows<const BufferType> src_rows, | |
224 | Rows<DestinationType> dst_rows, | ||
225 | BorderOffsets border_offsets, | ||
226 | ptrdiff_t index) const KLEIDICV_STREAMING { | ||
227 | 3672 | BufferVectorType src_0 = | |
228 | 1836 | svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); | |
229 | 3672 | BufferVectorType src_1 = | |
230 | 1836 | svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); | |
231 | 3672 | BufferVectorType src_2 = | |
232 | 1836 | svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); | |
233 | 3672 | BufferVectorType src_3 = | |
234 | 1836 | svld1(pg, &src_rows.at(0, border_offsets.c3())[index]); | |
235 | 3672 | BufferVectorType src_4 = | |
236 | 1836 | svld1(pg, &src_rows.at(0, border_offsets.c4())[index]); | |
237 | |||
238 | 1836 | svuint16_t acc_0_4 = svadd_x(pg, src_0, src_4); | |
239 | 1836 | svuint16_t acc_1_3 = svadd_x(pg, src_1, src_3); | |
240 | 1836 | svuint16_t acc = svmla_n_u16_x(pg, acc_0_4, src_2, 6); | |
241 | 1836 | acc = svmla_n_u16_x(pg, acc, acc_1_3, 4); | |
242 | 1836 | acc = svrshr_x(pg, acc, 8); | |
243 | |||
244 | 1836 | svst1b(pg, &dst_rows[index / 2], acc); | |
245 | 1836 | } | |
246 | }; // end of class BlurAndDownsample | ||
247 | |||
248 | // Does not include checks for whether the operation is implemented. | ||
249 | // This must be done earlier, by blur_and_downsample_is_implemented. | ||
250 | 102 | static kleidicv_error_t blur_and_downsample_checks( | |
251 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
252 | uint8_t *dst, size_t dst_stride, size_t channels, | ||
253 | BlurAndDownsampleFilterWorkspace *workspace) KLEIDICV_STREAMING { | ||
254 |
2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 100 times.
|
102 | CHECK_POINTERS(workspace); |
255 |
4/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 98 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 98 times.
|
100 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
256 |
4/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 96 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 96 times.
|
98 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, (src_height + 1) / 2); |
257 |
6/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 94 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 92 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 92 times.
|
96 | CHECK_IMAGE_SIZE(src_width, src_height); |
258 | |||
259 | 92 | Rectangle rect{src_width, src_height}; | |
260 | 92 | const Rectangle &context_rect = workspace->image_size(); | |
261 |
4/4✓ Branch 0 taken 88 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 86 times.
|
92 | if (context_rect.width() < src_width || context_rect.height() < src_height) { |
262 | 6 | return KLEIDICV_ERROR_CONTEXT_MISMATCH; | |
263 | } | ||
264 | |||
265 | // Currently supports only one channel, so it cannot be tested. | ||
266 | // GCOVR_EXCL_START | ||
267 | − | if (workspace->channels() < channels) { | |
268 | − | return KLEIDICV_ERROR_CONTEXT_MISMATCH; | |
269 | } | ||
270 | // GCOVR_EXCL_STOP | ||
271 | |||
272 | 86 | return KLEIDICV_OK; | |
273 | 102 | } | |
274 | |||
275 | 102 | static kleidicv_error_t blur_and_downsample_stripe_u8_sc( | |
276 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
277 | uint8_t *dst, size_t dst_stride, size_t y_begin, size_t y_end, | ||
278 | size_t channels, FixedBorderType fixed_border_type, | ||
279 | kleidicv_filter_context_t *context) KLEIDICV_STREAMING { | ||
280 | // Does not include checks for whether the operation is implemented. | ||
281 | // This must be done earlier, by blur_and_downsample_is_implemented. | ||
282 | 204 | auto *workspace = | |
283 | 102 | reinterpret_cast<BlurAndDownsampleFilterWorkspace *>(context); | |
284 | |||
285 |
6/6✓ Branch 0 taken 16 times.
✓ Branch 1 taken 86 times.
✓ Branch 2 taken 16 times.
✓ Branch 3 taken 86 times.
✓ Branch 4 taken 16 times.
✓ Branch 5 taken 86 times.
|
220 | if (auto check_result = |
286 | 204 | blur_and_downsample_checks(src, src_stride, src_width, src_height, | |
287 | 102 | dst, dst_stride, channels, workspace)) { | |
288 | 16 | return check_result; | |
289 | } | ||
290 | |||
291 | 86 | Rectangle rect{src_width, src_height}; | |
292 | |||
293 | 86 | Rows<const uint8_t> src_rows{src, src_stride, channels}; | |
294 | 86 | Rows<uint8_t> dst_rows{dst, dst_stride, channels}; | |
295 | 172 | workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, | |
296 | 86 | fixed_border_type, BlurAndDownsample{}); | |
297 | |||
298 | 86 | return KLEIDICV_OK; | |
299 | 102 | } | |
300 | |||
301 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
302 |