KleidiCV Coverage Report


Directory: ./
File: kleidicv/include/kleidicv/filters/separable_filter_21x21_sc.h
Date: 2025-11-25 17:23:32
Exec Total Coverage
Lines: 233 233 100.0%
Functions: 26 26 100.0%
Branches: 2 2 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_SEPARABLE_FILTER_21X21_SC_H
6 #define KLEIDICV_SEPARABLE_FILTER_21X21_SC_H
7
8 #include "kleidicv/sve2.h"
9 #include "kleidicv/workspace/border_21x21.h"
10
11 // It is used by SVE2 and SME, the actual namespace will reflect it.
12 namespace KLEIDICV_TARGET_NAMESPACE {
13
14 // Template for drivers of separable NxM filters.
15 template <typename FilterType, const size_t S>
16 class SeparableFilter;
17
18 // Driver for a separable 21x21 filter.
19 template <typename FilterType>
20 class SeparableFilter<FilterType, 21UL> {
21 public:
22 using SourceType = typename FilterType::SourceType;
23 using BufferType = typename FilterType::BufferType;
24 using DestinationType = typename FilterType::DestinationType;
25 using SourceVecTraits =
26 typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits<SourceType>;
27 using SourceVectorType = typename SourceVecTraits::VectorType;
28 using BufferVecTraits =
29 typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits<BufferType>;
30 using BufferVectorType = typename BufferVecTraits::VectorType;
31 using BorderInfoType =
32 typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo21x21<SourceType>;
33 using BorderType = FixedBorderType;
34 using BorderOffsets = typename BorderInfoType::Offsets;
35
36 96 explicit SeparableFilter(FilterType filter) KLEIDICV_STREAMING
37 96 : filter_{filter} {}
38
39 static constexpr size_t margin = 10UL;
40
41 1944 void process_vertical(size_t width, Rows<const SourceType> src_rows,
42 Rows<BufferType> dst_rows,
43 BorderOffsets border_offsets) const KLEIDICV_STREAMING {
44 1944 LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()};
45
46 4512 loop.unroll_once([&](size_t index) KLEIDICV_STREAMING {
47 2568 svbool_t pg_all = SourceVecTraits::svptrue();
48 2568 vertical_vector_path(pg_all, src_rows, dst_rows, border_offsets, index);
49 2568 });
50
51 3888 loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING {
52 1944 svbool_t pg = SourceVecTraits::svwhilelt(index, length);
53 1944 vertical_vector_path(pg, src_rows, dst_rows, border_offsets, index);
54 1944 });
55 1944 }
56
57 1944 void process_horizontal(size_t width, Rows<const BufferType> src_rows,
58 Rows<DestinationType> dst_rows,
59 BorderOffsets border_offsets) const
60 KLEIDICV_STREAMING {
61 1944 svbool_t pg_all = BufferVecTraits::svptrue();
62 1944 LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()};
63
64 2424 loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING {
65 960 horizontal_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets,
66 480 index);
67 480 });
68
69 2424 loop.unroll_once([&](size_t index) KLEIDICV_STREAMING {
70 480 horizontal_vector_path(pg_all, src_rows, dst_rows, border_offsets, index);
71 480 });
72
73 3408 loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING {
74 1464 svbool_t pg = BufferVecTraits::svwhilelt(index, length);
75 1464 horizontal_vector_path(pg, src_rows, dst_rows, border_offsets, index);
76 1464 });
77 1944 }
78
79 // Processing of horizontal borders is always scalar because border offsets
80 // change for each and every element in the border.
81 38880 void process_horizontal_borders(
82 Rows<const BufferType> src_rows, Rows<DestinationType> dst_rows,
83 BorderOffsets border_offsets) const KLEIDICV_STREAMING {
84
2/2
✓ Branch 0 taken 38880 times.
✓ Branch 1 taken 67680 times.
106560 for (size_t index = 0; index < src_rows.channels(); ++index) {
85 67680 disable_loop_vectorization();
86 67680 process_horizontal_border(src_rows, dst_rows, border_offsets, index);
87 67680 }
88 38880 }
89
90 private:
91 4512 void vertical_vector_path(svbool_t pg, Rows<const SourceType> src_rows,
92 Rows<BufferType> dst_rows,
93 BorderOffsets border_offsets,
94 size_t index) const KLEIDICV_STREAMING {
95 9024 SourceVectorType src_0 =
96 4512 svld1(pg, &src_rows.at(border_offsets.c0())[index]);
97 9024 SourceVectorType src_1 =
98 4512 svld1(pg, &src_rows.at(border_offsets.c1())[index]);
99 9024 SourceVectorType src_2 =
100 4512 svld1(pg, &src_rows.at(border_offsets.c2())[index]);
101 9024 SourceVectorType src_3 =
102 4512 svld1(pg, &src_rows.at(border_offsets.c3())[index]);
103 9024 SourceVectorType src_4 =
104 4512 svld1(pg, &src_rows.at(border_offsets.c4())[index]);
105 9024 SourceVectorType src_5 =
106 4512 svld1(pg, &src_rows.at(border_offsets.c5())[index]);
107 9024 SourceVectorType src_6 =
108 4512 svld1(pg, &src_rows.at(border_offsets.c6())[index]);
109 9024 SourceVectorType src_7 =
110 4512 svld1(pg, &src_rows.at(border_offsets.c7())[index]);
111 9024 SourceVectorType src_8 =
112 4512 svld1(pg, &src_rows.at(border_offsets.c8())[index]);
113 9024 SourceVectorType src_9 =
114 4512 svld1(pg, &src_rows.at(border_offsets.c9())[index]);
115 9024 SourceVectorType src_10 =
116 4512 svld1(pg, &src_rows.at(border_offsets.c10())[index]);
117 9024 SourceVectorType src_11 =
118 4512 svld1(pg, &src_rows.at(border_offsets.c11())[index]);
119 9024 SourceVectorType src_12 =
120 4512 svld1(pg, &src_rows.at(border_offsets.c12())[index]);
121 9024 SourceVectorType src_13 =
122 4512 svld1(pg, &src_rows.at(border_offsets.c13())[index]);
123 9024 SourceVectorType src_14 =
124 4512 svld1(pg, &src_rows.at(border_offsets.c14())[index]);
125 9024 SourceVectorType src_15 =
126 4512 svld1(pg, &src_rows.at(border_offsets.c15())[index]);
127 9024 SourceVectorType src_16 =
128 4512 svld1(pg, &src_rows.at(border_offsets.c16())[index]);
129 9024 SourceVectorType src_17 =
130 4512 svld1(pg, &src_rows.at(border_offsets.c17())[index]);
131 9024 SourceVectorType src_18 =
132 4512 svld1(pg, &src_rows.at(border_offsets.c18())[index]);
133 9024 SourceVectorType src_19 =
134 4512 svld1(pg, &src_rows.at(border_offsets.c19())[index]);
135 9024 SourceVectorType src_20 =
136 4512 svld1(pg, &src_rows.at(border_offsets.c20())[index]);
137 94752 std::reference_wrapper<SourceVectorType> sources[21] = {
138 31584 src_0, src_1, src_2, src_3, src_4, src_5, src_6,
139 31584 src_7, src_8, src_9, src_10, src_11, src_12, src_13,
140 31584 src_14, src_15, src_16, src_17, src_18, src_19, src_20};
141 4512 filter_.vertical_vector_path(pg, sources, &dst_rows[index]);
142 4512 }
143
144 480 void horizontal_vector_path_2x(svbool_t pg, Rows<const BufferType> src_rows,
145 Rows<DestinationType> dst_rows,
146 BorderOffsets border_offsets,
147 size_t index) const KLEIDICV_STREAMING {
148 480 auto src_0 = &src_rows.at(0, border_offsets.c0())[index];
149 480 auto src_1 = &src_rows.at(0, border_offsets.c1())[index];
150 480 auto src_2 = &src_rows.at(0, border_offsets.c2())[index];
151 480 auto src_3 = &src_rows.at(0, border_offsets.c3())[index];
152 480 auto src_4 = &src_rows.at(0, border_offsets.c4())[index];
153 480 auto src_5 = &src_rows.at(0, border_offsets.c5())[index];
154 480 auto src_6 = &src_rows.at(0, border_offsets.c6())[index];
155 480 auto src_7 = &src_rows.at(0, border_offsets.c7())[index];
156 480 auto src_8 = &src_rows.at(0, border_offsets.c8())[index];
157 480 auto src_9 = &src_rows.at(0, border_offsets.c9())[index];
158 480 auto src_10 = &src_rows.at(0, border_offsets.c10())[index];
159 480 auto src_11 = &src_rows.at(0, border_offsets.c11())[index];
160 480 auto src_12 = &src_rows.at(0, border_offsets.c12())[index];
161 480 auto src_13 = &src_rows.at(0, border_offsets.c13())[index];
162 480 auto src_14 = &src_rows.at(0, border_offsets.c14())[index];
163 480 auto src_15 = &src_rows.at(0, border_offsets.c15())[index];
164 480 auto src_16 = &src_rows.at(0, border_offsets.c16())[index];
165 480 auto src_17 = &src_rows.at(0, border_offsets.c17())[index];
166 480 auto src_18 = &src_rows.at(0, border_offsets.c18())[index];
167 480 auto src_19 = &src_rows.at(0, border_offsets.c19())[index];
168 480 auto src_20 = &src_rows.at(0, border_offsets.c20())[index];
169
170 480 BufferVectorType src_0_0 = svld1(pg, &src_0[0]);
171 480 BufferVectorType src_1_0 = svld1_vnum(pg, &src_0[0], 1);
172 480 BufferVectorType src_0_1 = svld1(pg, &src_1[0]);
173 480 BufferVectorType src_1_1 = svld1_vnum(pg, &src_1[0], 1);
174 480 BufferVectorType src_0_2 = svld1(pg, &src_2[0]);
175 480 BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1);
176 480 BufferVectorType src_0_3 = svld1(pg, &src_3[0]);
177 480 BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1);
178 480 BufferVectorType src_0_4 = svld1(pg, &src_4[0]);
179 480 BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1);
180 480 BufferVectorType src_0_5 = svld1(pg, &src_5[0]);
181 480 BufferVectorType src_1_5 = svld1_vnum(pg, &src_5[0], 1);
182 480 BufferVectorType src_0_6 = svld1(pg, &src_6[0]);
183 480 BufferVectorType src_1_6 = svld1_vnum(pg, &src_6[0], 1);
184 480 BufferVectorType src_0_7 = svld1(pg, &src_7[0]);
185 480 BufferVectorType src_1_7 = svld1_vnum(pg, &src_7[0], 1);
186 480 BufferVectorType src_0_8 = svld1(pg, &src_8[0]);
187 480 BufferVectorType src_1_8 = svld1_vnum(pg, &src_8[0], 1);
188 480 BufferVectorType src_0_9 = svld1(pg, &src_9[0]);
189 480 BufferVectorType src_1_9 = svld1_vnum(pg, &src_9[0], 1);
190 480 BufferVectorType src_0_10 = svld1(pg, &src_10[0]);
191 480 BufferVectorType src_1_10 = svld1_vnum(pg, &src_10[0], 1);
192 480 BufferVectorType src_0_11 = svld1(pg, &src_11[0]);
193 480 BufferVectorType src_1_11 = svld1_vnum(pg, &src_11[0], 1);
194 480 BufferVectorType src_0_12 = svld1(pg, &src_12[0]);
195 480 BufferVectorType src_1_12 = svld1_vnum(pg, &src_12[0], 1);
196 480 BufferVectorType src_0_13 = svld1(pg, &src_13[0]);
197 480 BufferVectorType src_1_13 = svld1_vnum(pg, &src_13[0], 1);
198 480 BufferVectorType src_0_14 = svld1(pg, &src_14[0]);
199 480 BufferVectorType src_1_14 = svld1_vnum(pg, &src_14[0], 1);
200 480 BufferVectorType src_0_15 = svld1(pg, &src_15[0]);
201 480 BufferVectorType src_1_15 = svld1_vnum(pg, &src_15[0], 1);
202 480 BufferVectorType src_0_16 = svld1(pg, &src_16[0]);
203 480 BufferVectorType src_1_16 = svld1_vnum(pg, &src_16[0], 1);
204 480 BufferVectorType src_0_17 = svld1(pg, &src_17[0]);
205 480 BufferVectorType src_1_17 = svld1_vnum(pg, &src_17[0], 1);
206 480 BufferVectorType src_0_18 = svld1(pg, &src_18[0]);
207 480 BufferVectorType src_1_18 = svld1_vnum(pg, &src_18[0], 1);
208 480 BufferVectorType src_0_19 = svld1(pg, &src_19[0]);
209 480 BufferVectorType src_1_19 = svld1_vnum(pg, &src_19[0], 1);
210 480 BufferVectorType src_0_20 = svld1(pg, &src_20[0]);
211 480 BufferVectorType src_1_20 = svld1_vnum(pg, &src_20[0], 1);
212
213 10080 std::reference_wrapper<BufferVectorType> sources_0[21] = {
214 3360 src_0_0, src_0_1, src_0_2, src_0_3, src_0_4, src_0_5, src_0_6,
215 3360 src_0_7, src_0_8, src_0_9, src_0_10, src_0_11, src_0_12, src_0_13,
216 3360 src_0_14, src_0_15, src_0_16, src_0_17, src_0_18, src_0_19, src_0_20};
217 480 filter_.horizontal_vector_path(pg, sources_0, &dst_rows[index]);
218 10080 std::reference_wrapper<BufferVectorType> sources_1[21] = {
219 3360 src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_1_5, src_1_6,
220 3360 src_1_7, src_1_8, src_1_9, src_1_10, src_1_11, src_1_12, src_1_13,
221 3360 src_1_14, src_1_15, src_1_16, src_1_17, src_1_18, src_1_19, src_1_20};
222 960 filter_.horizontal_vector_path(
223 480 pg, sources_1, &dst_rows[index + BufferVecTraits::num_lanes()]);
224 480 }
225
226 1944 void horizontal_vector_path(svbool_t pg, Rows<const BufferType> src_rows,
227 Rows<DestinationType> dst_rows,
228 BorderOffsets border_offsets,
229 size_t index) const KLEIDICV_STREAMING {
230 3888 BufferVectorType src_0 =
231 1944 svld1(pg, &src_rows.at(0, border_offsets.c0())[index]);
232 3888 BufferVectorType src_1 =
233 1944 svld1(pg, &src_rows.at(0, border_offsets.c1())[index]);
234 3888 BufferVectorType src_2 =
235 1944 svld1(pg, &src_rows.at(0, border_offsets.c2())[index]);
236 3888 BufferVectorType src_3 =
237 1944 svld1(pg, &src_rows.at(0, border_offsets.c3())[index]);
238 3888 BufferVectorType src_4 =
239 1944 svld1(pg, &src_rows.at(0, border_offsets.c4())[index]);
240 3888 BufferVectorType src_5 =
241 1944 svld1(pg, &src_rows.at(0, border_offsets.c5())[index]);
242 3888 BufferVectorType src_6 =
243 1944 svld1(pg, &src_rows.at(0, border_offsets.c6())[index]);
244 3888 BufferVectorType src_7 =
245 1944 svld1(pg, &src_rows.at(0, border_offsets.c7())[index]);
246 3888 BufferVectorType src_8 =
247 1944 svld1(pg, &src_rows.at(0, border_offsets.c8())[index]);
248 3888 BufferVectorType src_9 =
249 1944 svld1(pg, &src_rows.at(0, border_offsets.c9())[index]);
250 3888 BufferVectorType src_10 =
251 1944 svld1(pg, &src_rows.at(0, border_offsets.c10())[index]);
252 3888 BufferVectorType src_11 =
253 1944 svld1(pg, &src_rows.at(0, border_offsets.c11())[index]);
254 3888 BufferVectorType src_12 =
255 1944 svld1(pg, &src_rows.at(0, border_offsets.c12())[index]);
256 3888 BufferVectorType src_13 =
257 1944 svld1(pg, &src_rows.at(0, border_offsets.c13())[index]);
258 3888 BufferVectorType src_14 =
259 1944 svld1(pg, &src_rows.at(0, border_offsets.c14())[index]);
260 3888 BufferVectorType src_15 =
261 1944 svld1(pg, &src_rows.at(0, border_offsets.c15())[index]);
262 3888 BufferVectorType src_16 =
263 1944 svld1(pg, &src_rows.at(0, border_offsets.c16())[index]);
264 3888 BufferVectorType src_17 =
265 1944 svld1(pg, &src_rows.at(0, border_offsets.c17())[index]);
266 3888 BufferVectorType src_18 =
267 1944 svld1(pg, &src_rows.at(0, border_offsets.c18())[index]);
268 3888 BufferVectorType src_19 =
269 1944 svld1(pg, &src_rows.at(0, border_offsets.c19())[index]);
270 3888 BufferVectorType src_20 =
271 1944 svld1(pg, &src_rows.at(0, border_offsets.c20())[index]);
272
273 40824 std::reference_wrapper<BufferVectorType> sources[21] = {
274 13608 src_0, src_1, src_2, src_3, src_4, src_5, src_6,
275 13608 src_7, src_8, src_9, src_10, src_11, src_12, src_13,
276 13608 src_14, src_15, src_16, src_17, src_18, src_19, src_20};
277 1944 filter_.horizontal_vector_path(pg, sources, &dst_rows[index]);
278 1944 }
279
280 67680 void process_horizontal_border(Rows<const BufferType> src_rows,
281 Rows<DestinationType> dst_rows,
282 BorderOffsets border_offsets,
283 size_t index) const KLEIDICV_STREAMING {
284 67680 BufferType src[21];
285 67680 src[0] = src_rows.at(0, border_offsets.c0())[index];
286 67680 src[1] = src_rows.at(0, border_offsets.c1())[index];
287 67680 src[2] = src_rows.at(0, border_offsets.c2())[index];
288 67680 src[3] = src_rows.at(0, border_offsets.c3())[index];
289 67680 src[4] = src_rows.at(0, border_offsets.c4())[index];
290 67680 src[5] = src_rows.at(0, border_offsets.c5())[index];
291 67680 src[6] = src_rows.at(0, border_offsets.c6())[index];
292 67680 src[7] = src_rows.at(0, border_offsets.c7())[index];
293 67680 src[8] = src_rows.at(0, border_offsets.c8())[index];
294 67680 src[9] = src_rows.at(0, border_offsets.c9())[index];
295 67680 src[10] = src_rows.at(0, border_offsets.c10())[index];
296 67680 src[11] = src_rows.at(0, border_offsets.c11())[index];
297 67680 src[12] = src_rows.at(0, border_offsets.c12())[index];
298 67680 src[13] = src_rows.at(0, border_offsets.c13())[index];
299 67680 src[14] = src_rows.at(0, border_offsets.c14())[index];
300 67680 src[15] = src_rows.at(0, border_offsets.c15())[index];
301 67680 src[16] = src_rows.at(0, border_offsets.c16())[index];
302 67680 src[17] = src_rows.at(0, border_offsets.c17())[index];
303 67680 src[18] = src_rows.at(0, border_offsets.c18())[index];
304 67680 src[19] = src_rows.at(0, border_offsets.c19())[index];
305 67680 src[20] = src_rows.at(0, border_offsets.c20())[index];
306 67680 filter_.horizontal_scalar_path(src, &dst_rows[index]);
307 67680 }
308
309 FilterType filter_;
310 }; // end of class SeparableFilter<FilterType, 21UL>
311
312 // Shorthand for 21x21 separable filters driver type.
313 template <class FilterType>
314 using SeparableFilter21x21 = SeparableFilter<FilterType, 21UL>;
315
316 } // namespace KLEIDICV_TARGET_NAMESPACE
317
318 #endif // KLEIDICV_SEPARABLE_FILTER_21X21_SC_H
319