KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/blur_and_downsample_sc.h
Date: 2025-11-25 17:23:32
Exec Total Coverage
Lines: 171 171 100.0%
Functions: 32 32 100.0%
Branches: 28 28 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include "kleidicv/ctypes.h"
6 #include "kleidicv/filters/blur_and_downsample.h"
7 #include "kleidicv/kleidicv.h"
8 #include "kleidicv/sve2.h"
9 #include "kleidicv/utils.h"
10 #include "kleidicv/workspace/blur_and_downsample_ws.h"
11 #include "kleidicv/workspace/border_5x5.h"
12
13 namespace KLEIDICV_TARGET_NAMESPACE {
14
15 // Applies Gaussian Blur binomial filter to even rows and columns
16 //
17 // [ 1, 4, 6, 4, 1 ] [ 1 ]
18 // [ 4, 16, 24, 16, 4 ] [ 4 ]
19 // F = 1/256 * [ 6, 24, 36, 24, 6 ] = 1/256 * [ 6 ] * [ 1, 4, 6, 4, 1 ]
20 // [ 4, 16, 24, 16, 4 ] [ 4 ]
21 // [ 1, 4, 6, 4, 1 ] [ 1 ]
22 class BlurAndDownsample {
23 public:
24 using SourceType = uint8_t;
25 using BufferType = uint16_t;
26 using DestinationType = uint8_t;
27 using SourceVecTraits =
28 typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits<SourceType>;
29 using SourceVectorType = typename SourceVecTraits::VectorType;
30 using SourceVector2Type = typename SourceVecTraits::Vector2Type;
31 using BufferVecTraits =
32 typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits<BufferType>;
33 using BufferVectorType = typename BufferVecTraits::VectorType;
34 using BorderInfoType =
35 typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5<SourceType>;
36 using BorderType = FixedBorderType;
37 using BorderOffsets = typename BorderInfoType::Offsets;
38
39 static constexpr size_t margin = 2UL;
40
41 1473 void process_vertical(size_t width, Rows<const SourceType> src_rows,
42 Rows<BufferType> dst_rows,
43 BorderOffsets border_offsets) const KLEIDICV_STREAMING {
44 1473 LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()};
45
46 1639 loop.unroll_twice([&](ptrdiff_t index) KLEIDICV_STREAMING {
47 166 svbool_t pg_all = SourceVecTraits::svptrue();
48 332 vertical_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets,
49 166 index);
50 166 });
51
52 1497 loop.unroll_once([&](ptrdiff_t index) KLEIDICV_STREAMING {
53 24 svbool_t pg_all = SourceVecTraits::svptrue();
54 48 vertical_vector_path_1x(pg_all, src_rows, dst_rows, border_offsets,
55 24 index);
56 24 });
57
58 2922 loop.remaining([&](ptrdiff_t index, ptrdiff_t length) KLEIDICV_STREAMING {
59 1449 svbool_t pg = SourceVecTraits::svwhilelt(index, length);
60 1449 vertical_vector_path_1x(pg, src_rows, dst_rows, border_offsets, index);
61 1449 });
62 1473 }
63
64 1473 void process_horizontal(size_t width, Rows<const BufferType> src_rows,
65 Rows<DestinationType> dst_rows,
66 BorderOffsets border_offsets) const
67 KLEIDICV_STREAMING {
68 1473 svbool_t pg_all = BufferVecTraits::svptrue();
69 1473 LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()};
70
71 1709 loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING {
72 472 horizontal_vector_path_2x(pg_all, pg_all, src_rows, pg_all, dst_rows,
73 236 border_offsets, static_cast<ptrdiff_t>(index));
74 236 });
75
76 2112 loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING {
77 639 svbool_t pg_src_0 = BufferVecTraits::svwhilelt(index, length);
78 1278 svbool_t pg_src_1 = BufferVecTraits::svwhilelt(
79 639 index + BufferVecTraits::num_lanes(), length);
80 1278 svbool_t pg_dst =
81 639 BufferVecTraits::svwhilelt((index + 1) / 2, (length + 1) / 2);
82 1278 horizontal_vector_path_2x(pg_src_0, pg_src_1, src_rows, pg_dst, dst_rows,
83 639 border_offsets, static_cast<ptrdiff_t>(index));
84 639 });
85 1473 }
86
87 2946 void process_horizontal_borders(
88 Rows<const BufferType> src_rows, Rows<DestinationType> dst_rows,
89 BorderOffsets border_offsets) const KLEIDICV_STREAMING {
90
2/2
✓ Branch 0 taken 2946 times.
✓ Branch 1 taken 2946 times.
5892 for (ptrdiff_t index = 0;
91 5892 index < static_cast<ptrdiff_t>(src_rows.channels()); ++index) {
92 2946 disable_loop_vectorization();
93 2946 svbool_t pg = svptrue_pat_b8(SV_VL1);
94 2946 horizontal_border_path(pg, src_rows, dst_rows, border_offsets, index);
95 2946 }
96 2946 }
97
98 private:
99 166 void vertical_vector_path_2x(svbool_t pg, Rows<const SourceType> src_rows,
100 Rows<BufferType> dst_rows,
101 BorderOffsets border_offsets,
102 ptrdiff_t index) const KLEIDICV_STREAMING {
103 166 const auto *src_row_0 = &src_rows.at(border_offsets.c0())[index];
104 166 const auto *src_row_1 = &src_rows.at(border_offsets.c1())[index];
105 166 const auto *src_row_2 = &src_rows.at(border_offsets.c2())[index];
106 166 const auto *src_row_3 = &src_rows.at(border_offsets.c3())[index];
107 166 const auto *src_row_4 = &src_rows.at(border_offsets.c4())[index];
108
109 166 SourceVector2Type src_0;
110 166 SourceVector2Type src_1;
111 166 SourceVector2Type src_2;
112 166 SourceVector2Type src_3;
113 166 SourceVector2Type src_4;
114
115 166 src_0 =
116 166 svcreate2(svld1(pg, &src_row_0[0]), svld1_vnum(pg, &src_row_0[0], 1));
117 166 src_1 =
118 166 svcreate2(svld1(pg, &src_row_1[0]), svld1_vnum(pg, &src_row_1[0], 1));
119 166 src_2 =
120 166 svcreate2(svld1(pg, &src_row_2[0]), svld1_vnum(pg, &src_row_2[0], 1));
121 166 src_3 =
122 166 svcreate2(svld1(pg, &src_row_3[0]), svld1_vnum(pg, &src_row_3[0], 1));
123 166 src_4 =
124 166 svcreate2(svld1(pg, &src_row_4[0]), svld1_vnum(pg, &src_row_4[0], 1));
125
126 332 vertical_vector_path(pg, svget2(src_0, 0), svget2(src_1, 0),
127 166 svget2(src_2, 0), svget2(src_3, 0), svget2(src_4, 0),
128 166 &dst_rows[index]);
129 332 vertical_vector_path(pg, svget2(src_0, 1), svget2(src_1, 1),
130 166 svget2(src_2, 1), svget2(src_3, 1), svget2(src_4, 1),
131 332 &dst_rows[index + static_cast<ptrdiff_t>(
132 166 SourceVecTraits::num_lanes())]);
133 166 }
134
135 1473 void vertical_vector_path_1x(svbool_t pg, Rows<const SourceType> src_rows,
136 Rows<BufferType> dst_rows,
137 BorderOffsets border_offsets,
138 ptrdiff_t index) const KLEIDICV_STREAMING {
139 2946 SourceVectorType src_0 =
140 1473 svld1(pg, &src_rows.at(border_offsets.c0())[index]);
141 2946 SourceVectorType src_1 =
142 1473 svld1(pg, &src_rows.at(border_offsets.c1())[index]);
143 2946 SourceVectorType src_2 =
144 1473 svld1(pg, &src_rows.at(border_offsets.c2())[index]);
145 2946 SourceVectorType src_3 =
146 1473 svld1(pg, &src_rows.at(border_offsets.c3())[index]);
147 2946 SourceVectorType src_4 =
148 1473 svld1(pg, &src_rows.at(border_offsets.c4())[index]);
149 2946 vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4,
150 1473 &dst_rows[index]);
151 1473 }
152
153 // Applies vertical filtering vector using SIMD operations.
154 //
155 // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
156 1805 void vertical_vector_path(svbool_t pg, svuint8_t src_0, svuint8_t src_1,
157 svuint8_t src_2, svuint8_t src_3, svuint8_t src_4,
158 BufferType *dst) const KLEIDICV_STREAMING {
159 1805 svuint16_t acc_0_4_b = svaddlb_u16(src_0, src_4);
160 1805 svuint16_t acc_0_4_t = svaddlt_u16(src_0, src_4);
161 1805 svuint16_t acc_1_3_b = svaddlb_u16(src_1, src_3);
162 1805 svuint16_t acc_1_3_t = svaddlt_u16(src_1, src_3);
163
164 1805 svuint16_t acc_u16_b = svmlalb_n_u16(acc_0_4_b, src_2, 6);
165 1805 svuint16_t acc_u16_t = svmlalt_n_u16(acc_0_4_t, src_2, 6);
166 1805 acc_u16_b = svmla_n_u16_x(pg, acc_u16_b, acc_1_3_b, 4);
167 1805 acc_u16_t = svmla_n_u16_x(pg, acc_u16_t, acc_1_3_t, 4);
168
169 1805 svuint16x2_t interleaved = svcreate2(acc_u16_b, acc_u16_t);
170 1805 svst2(pg, &dst[0], interleaved);
171 1805 }
172
173 875 void horizontal_vector_path_2x(svbool_t pg_src_0, svbool_t pg_src_1,
174 Rows<const BufferType> src_rows,
175 svbool_t pg_dst,
176 Rows<DestinationType> dst_rows,
177 BorderOffsets border_offsets,
178 ptrdiff_t index) const KLEIDICV_STREAMING {
179 875 const auto *src_0 = &src_rows.at(0, border_offsets.c0())[index];
180 875 const auto *src_1 = &src_rows.at(0, border_offsets.c1())[index];
181 875 const auto *src_2 = &src_rows.at(0, border_offsets.c2())[index];
182 875 const auto *src_3 = &src_rows.at(0, border_offsets.c3())[index];
183 875 const auto *src_4 = &src_rows.at(0, border_offsets.c4())[index];
184
185 875 BufferVectorType src_0_0 = svld1(pg_src_0, &src_0[0]);
186 875 BufferVectorType src_1_0 = svld1_vnum(pg_src_1, &src_0[0], 1);
187 875 BufferVectorType src_0_1 = svld1(pg_src_0, &src_1[0]);
188 875 BufferVectorType src_1_1 = svld1_vnum(pg_src_1, &src_1[0], 1);
189 875 BufferVectorType src_0_2 = svld1(pg_src_0, &src_2[0]);
190 875 BufferVectorType src_1_2 = svld1_vnum(pg_src_1, &src_2[0], 1);
191 875 BufferVectorType src_0_3 = svld1(pg_src_0, &src_3[0]);
192 875 BufferVectorType src_1_3 = svld1_vnum(pg_src_1, &src_3[0], 1);
193 875 BufferVectorType src_0_4 = svld1(pg_src_0, &src_4[0]);
194 875 BufferVectorType src_1_4 = svld1_vnum(pg_src_1, &src_4[0], 1);
195
196 1750 svuint16_t res_0 = horizontal_vector_path(pg_src_0, src_0_0, src_0_1,
197 875 src_0_2, src_0_3, src_0_4);
198 1750 svuint16_t res_1 = horizontal_vector_path(pg_src_1, src_1_0, src_1_1,
199 875 src_1_2, src_1_3, src_1_4);
200
201 875 svuint16_t res_even_only = svuzp1(res_0, res_1);
202 875 svst1b(pg_dst, &dst_rows[index / 2], res_even_only);
203 875 }
204
205 // Applies horizontal filtering vector using SIMD operations.
206 //
207 // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
208 1750 svuint16_t horizontal_vector_path(svbool_t pg, svuint16_t src_0,
209 svuint16_t src_1, svuint16_t src_2,
210 svuint16_t src_3,
211 svuint16_t src_4) const KLEIDICV_STREAMING {
212 1750 svuint16_t acc_0_4 = svadd_x(pg, src_0, src_4);
213 1750 svuint16_t acc_1_3 = svadd_x(pg, src_1, src_3);
214 1750 svuint16_t acc = svmla_n_u16_x(pg, acc_0_4, src_2, 6);
215 1750 acc = svmla_n_u16_x(pg, acc, acc_1_3, 4);
216 1750 acc = svrshr_x(pg, acc, 8);
217 3500 return acc;
218 1750 }
219
220 // Applies horizontal filtering for the borders using SIMD operations.
221 //
222 // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T
223 2946 void horizontal_border_path(svbool_t pg, Rows<const BufferType> src_rows,
224 Rows<DestinationType> dst_rows,
225 BorderOffsets border_offsets,
226 ptrdiff_t index) const KLEIDICV_STREAMING {
227 5892 BufferVectorType src_0 =
228 2946 svld1(pg, &src_rows.at(0, border_offsets.c0())[index]);
229 5892 BufferVectorType src_1 =
230 2946 svld1(pg, &src_rows.at(0, border_offsets.c1())[index]);
231 5892 BufferVectorType src_2 =
232 2946 svld1(pg, &src_rows.at(0, border_offsets.c2())[index]);
233 5892 BufferVectorType src_3 =
234 2946 svld1(pg, &src_rows.at(0, border_offsets.c3())[index]);
235 5892 BufferVectorType src_4 =
236 2946 svld1(pg, &src_rows.at(0, border_offsets.c4())[index]);
237
238 2946 svuint16_t acc_0_4 = svadd_x(pg, src_0, src_4);
239 2946 svuint16_t acc_1_3 = svadd_x(pg, src_1, src_3);
240 2946 svuint16_t acc = svmla_n_u16_x(pg, acc_0_4, src_2, 6);
241 2946 acc = svmla_n_u16_x(pg, acc, acc_1_3, 4);
242 2946 acc = svrshr_x(pg, acc, 8);
243
244 2946 svst1b(pg, &dst_rows[index / 2], acc);
245 2946 }
246 }; // end of class BlurAndDownsample
247
248 // Does not include checks for whether the operation is implemented.
249 // This must be done earlier, by blur_and_downsample_is_implemented.
250 153 static kleidicv_error_t blur_and_downsample_checks(
251 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
252 uint8_t *dst, size_t dst_stride, size_t channels,
253 BlurAndDownsampleFilterWorkspace *workspace) KLEIDICV_STREAMING {
254
2/2
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 150 times.
153 CHECK_POINTERS(workspace);
255
4/4
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 147 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 147 times.
150 CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
256
4/4
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 144 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 144 times.
147 CHECK_POINTER_AND_STRIDE(dst, dst_stride, (src_height + 1) / 2);
257
6/6
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 141 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 138 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 138 times.
144 CHECK_IMAGE_SIZE(src_width, src_height);
258
259 138 Rectangle rect{src_width, src_height};
260 138 const Rectangle &context_rect = workspace->image_size();
261
4/4
✓ Branch 0 taken 132 times.
✓ Branch 1 taken 6 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 129 times.
138 if (context_rect.width() < src_width || context_rect.height() < src_height) {
262 9 return KLEIDICV_ERROR_CONTEXT_MISMATCH;
263 }
264
265 // Currently supports only one channel, so it cannot be tested.
266 // GCOVR_EXCL_START
267 if (workspace->channels() < channels) {
268 return KLEIDICV_ERROR_CONTEXT_MISMATCH;
269 }
270 // GCOVR_EXCL_STOP
271
272 129 return KLEIDICV_OK;
273 153 }
274
275 153 static kleidicv_error_t blur_and_downsample_stripe_u8_sc(
276 const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
277 uint8_t *dst, size_t dst_stride, size_t y_begin, size_t y_end,
278 size_t channels, FixedBorderType fixed_border_type,
279 kleidicv_filter_context_t *context) KLEIDICV_STREAMING {
280 // Does not include checks for whether the operation is implemented.
281 // This must be done earlier, by blur_and_downsample_is_implemented.
282 306 auto *workspace =
283 153 reinterpret_cast<BlurAndDownsampleFilterWorkspace *>(context);
284
285
6/6
✓ Branch 0 taken 24 times.
✓ Branch 1 taken 129 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 129 times.
✓ Branch 4 taken 24 times.
✓ Branch 5 taken 129 times.
330 if (auto check_result =
286 306 blur_and_downsample_checks(src, src_stride, src_width, src_height,
287 153 dst, dst_stride, channels, workspace)) {
288 24 return check_result;
289 }
290
291 129 Rectangle rect{src_width, src_height};
292
293 129 Rows<const uint8_t> src_rows{src, src_stride, channels};
294 129 Rows<uint8_t> dst_rows{dst, dst_stride, channels};
295 258 workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels,
296 129 fixed_border_type, BlurAndDownsample{});
297
298 129 return KLEIDICV_OK;
299 153 }
300
301 } // namespace KLEIDICV_TARGET_NAMESPACE
302