Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #ifndef KLEIDICV_FILTER_2D_SC_H | ||
6 | #define KLEIDICV_FILTER_2D_SC_H | ||
7 | |||
8 | #include "filter_2d_window_loader_3x3.h" | ||
9 | #include "filter_2d_window_loader_5x5.h" | ||
10 | #include "filter_2d_window_loader_7x7.h" | ||
11 | #include "kleidicv/sve2.h" | ||
12 | #include "process_filter_2d.h" | ||
13 | |||
14 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
15 | |||
16 | template <typename SourceType, typename DestinationType, | ||
17 | typename WindowLoaderType> | ||
18 | class Filter2D3x3VectorOperations { | ||
19 | public: | ||
20 | using BorderInfoType = | ||
21 | typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo<SourceType, 3>; | ||
22 | using BorderOffsets = typename BorderInfoType::Offsets; | ||
23 | |||
24 | template <typename InnerFilterType, typename SourceVectorType, | ||
25 | typename DestinationVectorType> | ||
26 | 3708 | static void process_one_element_with_vector_operation( | |
27 | svbool_t pg, Rows<const SourceType> src_rows, | ||
28 | Rows<DestinationType> dst_rows, BorderOffsets window_row_offsets, | ||
29 | BorderOffsets window_col_offsets, size_t index, | ||
30 | const InnerFilterType& filter_) KLEIDICV_STREAMING { | ||
31 | 3708 | SourceVectorType src_0_0, src_0_1, src_0_2, src_1_0, src_1_1, src_1_2, | |
32 | src_2_0, src_2_1, src_2_2; | ||
33 | 3708 | DestinationVectorType dst_vec; | |
34 | 11124 | ScalableVectorArray2D<SourceVectorType, 3, 3> KernelWindow = {{ | |
35 | 3708 | {std::ref(src_0_0), std::ref(src_0_1), std::ref(src_0_2)}, | |
36 | 3708 | {std::ref(src_1_0), std::ref(src_1_1), std::ref(src_1_2)}, | |
37 | 3708 | {std::ref(src_2_0), std::ref(src_2_1), std::ref(src_2_2)}, | |
38 | }}; | ||
39 | |||
40 | 37080 | auto load_array_element = [&](const SourceType& x) | |
41 | 33372 | KLEIDICV_STREAMING { return svld1(pg, &x); }; | |
42 | |||
43 | 7416 | WindowLoaderType::load_window(KernelWindow, load_array_element, src_rows, | |
44 | 3708 | window_row_offsets, window_col_offsets, | |
45 | 3708 | index); | |
46 | 3708 | filter_.vector_path(pg, KernelWindow, dst_vec); | |
47 | |||
48 | 3708 | svst1(pg, &dst_rows[index], dst_vec); | |
49 | 3708 | } | |
50 | |||
51 | template <typename InnerFilterType, typename SourceVectorType, | ||
52 | typename DestinationVectorType> | ||
53 | 17638 | static void process_two_elements_with_vector_operation( | |
54 | svbool_t pg, Rows<const SourceType> src_rows, | ||
55 | Rows<DestinationType> dst_rows, BorderOffsets window_row_offsets_0, | ||
56 | BorderOffsets window_row_offsets_1, BorderOffsets window_col_offsets, | ||
57 | size_t index, const InnerFilterType& filter_) KLEIDICV_STREAMING { | ||
58 | 17638 | SourceVectorType src_0_0, src_0_1, src_0_2, src_1_0, src_1_1, src_1_2, | |
59 | src_2_0, src_2_1, src_2_2, src_3_0, src_3_1, src_3_2; | ||
60 | 17638 | DestinationVectorType dst_vec_0, dst_vec_1; | |
61 | 70552 | ScalableVectorArray2D<SourceVectorType, 4, 3> KernelWindow = {{ | |
62 | 17638 | {std::ref(src_0_0), std::ref(src_0_1), std::ref(src_0_2)}, | |
63 | 17638 | {std::ref(src_1_0), std::ref(src_1_1), std::ref(src_1_2)}, | |
64 | 17638 | {std::ref(src_2_0), std::ref(src_2_1), std::ref(src_2_2)}, | |
65 | 17638 | {std::ref(src_3_0), std::ref(src_3_1), std::ref(src_3_2)}, | |
66 | }}; | ||
67 | |||
68 | 229294 | auto load_array_element = [&](const SourceType& x) | |
69 | 211656 | KLEIDICV_STREAMING { return svld1(pg, &x); }; | |
70 | |||
71 | 17638 | WindowLoaderType::load_window_to_handle_dual_rows( | |
72 | 17638 | KernelWindow, load_array_element, src_rows, window_row_offsets_0, | |
73 | 17638 | window_row_offsets_1, window_col_offsets, index); | |
74 | |||
75 | 17638 | filter_.vector_path_for_dual_row_handling(pg, KernelWindow, dst_vec_0, | |
76 | dst_vec_1); | ||
77 | 17638 | svst1(pg, &dst_rows.at(0, 0)[index], dst_vec_0); | |
78 | 17638 | svst1(pg, &dst_rows.at(1, 0)[index], dst_vec_1); | |
79 | 17638 | } | |
80 | }; | ||
81 | |||
82 | template <typename SourceType, typename DestinationType, | ||
83 | typename WindowLoaderType> | ||
84 | class Filter2D5x5VectorOperations { | ||
85 | public: | ||
86 | using BorderInfoType = | ||
87 | typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo<SourceType, 5>; | ||
88 | using BorderOffsets = typename BorderInfoType::Offsets; | ||
89 | |||
90 | template <typename InnerFilterType, typename SourceVectorType, | ||
91 | typename DestinationVectorType> | ||
92 | 75826 | static void process_one_element_with_vector_operation( | |
93 | svbool_t pg, Rows<const SourceType> src_rows, | ||
94 | Rows<DestinationType> dst_rows, BorderOffsets window_row_offsets, | ||
95 | BorderOffsets window_col_offsets, size_t index, | ||
96 | const InnerFilterType& filter_) KLEIDICV_STREAMING { | ||
97 | 75826 | SourceVectorType src_0_0, src_0_1, src_0_2, src_0_3, src_0_4, src_1_0, | |
98 | src_1_1, src_1_2, src_1_3, src_1_4, src_2_0, src_2_1, src_2_2, src_2_3, | ||
99 | src_2_4, src_3_0, src_3_1, src_3_2, src_3_3, src_3_4, src_4_0, src_4_1, | ||
100 | src_4_2, src_4_3, src_4_4; | ||
101 | 75826 | DestinationVectorType output_vector; | |
102 | // Initialization | ||
103 | 379130 | ScalableVectorArray2D<SourceVectorType, 5, 5> KernelWindow = {{ | |
104 | 151652 | {std::ref(src_0_0), std::ref(src_0_1), std::ref(src_0_2), | |
105 | 151652 | std::ref(src_0_3), std::ref(src_0_4)}, | |
106 | 151652 | {std::ref(src_1_0), std::ref(src_1_1), std::ref(src_1_2), | |
107 | 151652 | std::ref(src_1_3), std::ref(src_1_4)}, | |
108 | 151652 | {std::ref(src_2_0), std::ref(src_2_1), std::ref(src_2_2), | |
109 | 151652 | std::ref(src_2_3), std::ref(src_2_4)}, | |
110 | 151652 | {std::ref(src_3_0), std::ref(src_3_1), std::ref(src_3_2), | |
111 | 151652 | std::ref(src_3_3), std::ref(src_3_4)}, | |
112 | 151652 | {std::ref(src_4_0), std::ref(src_4_1), std::ref(src_4_2), | |
113 | 151652 | std::ref(src_4_3), std::ref(src_4_4)}, | |
114 | }}; | ||
115 | |||
116 | 1971476 | auto load_array_element = [&](const SourceType& x) | |
117 | 1895650 | KLEIDICV_STREAMING { return svld1(pg, &x); }; | |
118 | |||
119 | 151652 | WindowLoaderType::load_window(KernelWindow, load_array_element, src_rows, | |
120 | 75826 | window_row_offsets, window_col_offsets, | |
121 | 75826 | index); | |
122 | 75826 | filter_.vector_path(pg, KernelWindow, output_vector); | |
123 | 75826 | svst1(pg, &dst_rows[index], output_vector); | |
124 | 75826 | } | |
125 | }; | ||
126 | |||
127 | template <typename SourceType, typename DestinationType, | ||
128 | typename WindowLoaderType> | ||
129 | class Filter2D7x7VectorOperations { | ||
130 | public: | ||
131 | using BorderInfoType = | ||
132 | typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo<SourceType, 7>; | ||
133 | using BorderOffsets = typename BorderInfoType::Offsets; | ||
134 | |||
135 | template <typename InnerFilterType, typename SourceVectorType, | ||
136 | typename DestinationVectorType> | ||
137 | 121900 | static void process_one_element_with_vector_operation( | |
138 | svbool_t pg, Rows<const SourceType> src_rows, | ||
139 | Rows<DestinationType> dst_rows, BorderOffsets window_row_offsets, | ||
140 | BorderOffsets window_col_offsets, size_t index, | ||
141 | const InnerFilterType& filter_) KLEIDICV_STREAMING { | ||
142 | 121900 | SourceVectorType src_0_0, src_0_1, src_0_2, src_0_3, src_0_4, src_0_5, | |
143 | src_0_6, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_1_5, src_1_6, | ||
144 | src_2_0, src_2_1, src_2_2, src_2_3, src_2_4, src_2_5, src_2_6, src_3_0, | ||
145 | src_3_1, src_3_2, src_3_3, src_3_4, src_3_5, src_3_6, src_4_0, src_4_1, | ||
146 | src_4_2, src_4_3, src_4_4, src_4_5, src_4_6, src_5_0, src_5_1, src_5_2, | ||
147 | src_5_3, src_5_4, src_5_5, src_5_6, src_6_0, src_6_1, src_6_2, src_6_3, | ||
148 | src_6_4, src_6_5, src_6_6; | ||
149 | 121900 | DestinationVectorType output_vector; | |
150 | |||
151 | // Initialization | ||
152 | 853300 | ScalableVectorArray2D<SourceVectorType, 7, 7> KernelWindow = {{ | |
153 | 487600 | {std::ref(src_0_0), std::ref(src_0_1), std::ref(src_0_2), | |
154 | 365700 | std::ref(src_0_3), std::ref(src_0_4), std::ref(src_0_5), | |
155 | 121900 | std::ref(src_0_6)}, | |
156 | 487600 | {std::ref(src_1_0), std::ref(src_1_1), std::ref(src_1_2), | |
157 | 365700 | std::ref(src_1_3), std::ref(src_1_4), std::ref(src_1_5), | |
158 | 121900 | std::ref(src_1_6)}, | |
159 | 487600 | {std::ref(src_2_0), std::ref(src_2_1), std::ref(src_2_2), | |
160 | 365700 | std::ref(src_2_3), std::ref(src_2_4), std::ref(src_2_5), | |
161 | 121900 | std::ref(src_2_6)}, | |
162 | 487600 | {std::ref(src_3_0), std::ref(src_3_1), std::ref(src_3_2), | |
163 | 365700 | std::ref(src_3_3), std::ref(src_3_4), std::ref(src_3_5), | |
164 | 121900 | std::ref(src_3_6)}, | |
165 | 487600 | {std::ref(src_4_0), std::ref(src_4_1), std::ref(src_4_2), | |
166 | 365700 | std::ref(src_4_3), std::ref(src_4_4), std::ref(src_4_5), | |
167 | 121900 | std::ref(src_4_6)}, | |
168 | 487600 | {std::ref(src_5_0), std::ref(src_5_1), std::ref(src_5_2), | |
169 | 365700 | std::ref(src_5_3), std::ref(src_5_4), std::ref(src_5_5), | |
170 | 121900 | std::ref(src_5_6)}, | |
171 | 487600 | {std::ref(src_6_0), std::ref(src_6_1), std::ref(src_6_2), | |
172 | 365700 | std::ref(src_6_3), std::ref(src_6_4), std::ref(src_6_5), | |
173 | 121900 | std::ref(src_6_6)}, | |
174 | }}; | ||
175 | |||
176 | 6095000 | auto load_array_element = [&](const SourceType& x) | |
177 | 5973100 | KLEIDICV_STREAMING { return svld1(pg, &x); }; | |
178 | |||
179 | 243800 | WindowLoaderType::load_window(KernelWindow, load_array_element, src_rows, | |
180 | 121900 | window_row_offsets, window_col_offsets, | |
181 | 121900 | index); | |
182 | 121900 | filter_.vector_path(pg, KernelWindow, output_vector); | |
183 | 121900 | svst1(pg, &dst_rows[index], output_vector); | |
184 | 121900 | } | |
185 | }; | ||
186 | |||
187 | template <typename InnerFilterType, size_t KSize, | ||
188 | typename VectorOperationProviderType> | ||
189 | class Filter2d { | ||
190 | public: | ||
191 | using SourceType = typename InnerFilterType::SourceType; | ||
192 | using DestinationType = typename InnerFilterType::DestinationType; | ||
193 | using SourceVecTraits = | ||
194 | typename KLEIDICV_TARGET_NAMESPACE::VecTraits<SourceType>; | ||
195 | using DestinationVecTraits = | ||
196 | typename KLEIDICV_TARGET_NAMESPACE::VecTraits<DestinationType>; | ||
197 | using SourceVectorType = typename SourceVecTraits::VectorType; | ||
198 | using DestinationVectorType = typename DestinationVecTraits::VectorType; | ||
199 | using BorderInfoType = | ||
200 | typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo<SourceType, KSize>; | ||
201 | using BorderType = FixedBorderType; | ||
202 | using BorderOffsets = typename BorderInfoType::Offsets; | ||
203 | static constexpr size_t kMargin = KSize / 2UL; | ||
204 | 3220 | explicit Filter2d(InnerFilterType filter) KLEIDICV_STREAMING | |
205 | 3220 | : filter_{filter} {} | |
206 | |||
207 | 17352 | void process_pixels_without_horizontal_borders( | |
208 | size_t width, Rows<const SourceType> src_rows, | ||
209 | Rows<DestinationType> dst_rows, BorderOffsets window_row_offsets, | ||
210 | BorderOffsets window_col_offsets) const KLEIDICV_STREAMING { | ||
211 | 17352 | LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; | |
212 | |||
213 | 44748 | loop.unroll_once([&](size_t index) KLEIDICV_STREAMING { | |
214 | 27396 | svbool_t pg = SourceVecTraits::svptrue(); | |
215 | 27396 | VectorOperationProviderType:: | |
216 | template process_one_element_with_vector_operation< | ||
217 | InnerFilterType, SourceVectorType, DestinationVectorType>( | ||
218 | 27396 | pg, src_rows, dst_rows, window_row_offsets, window_col_offsets, | |
219 | 27396 | index, filter_); | |
220 | 27396 | }); | |
221 | |||
222 | 26350 | loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { | |
223 | 8998 | svbool_t pg = SourceVecTraits::svwhilelt(index, length); | |
224 | 8998 | VectorOperationProviderType:: | |
225 | template process_one_element_with_vector_operation< | ||
226 | InnerFilterType, SourceVectorType, DestinationVectorType>( | ||
227 | 8998 | pg, src_rows, dst_rows, window_row_offsets, window_col_offsets, | |
228 | 8998 | index, filter_); | |
229 | 8998 | }); | |
230 | 17352 | } | |
231 | |||
232 | 85408 | void process_one_pixel_with_horizontal_borders( | |
233 | Rows<const SourceType> src_rows, Rows<DestinationType> dst_rows, | ||
234 | BorderOffsets window_row_offsets, | ||
235 | BorderOffsets window_col_offsets) const KLEIDICV_STREAMING { | ||
236 |
42/42✓ Branch 0 taken 128 times.
✓ Branch 1 taken 320 times.
✓ Branch 2 taken 2464 times.
✓ Branch 3 taken 6160 times.
✓ Branch 4 taken 4992 times.
✓ Branch 5 taken 12480 times.
✓ Branch 6 taken 144 times.
✓ Branch 7 taken 336 times.
✓ Branch 8 taken 6544 times.
✓ Branch 9 taken 10240 times.
✓ Branch 10 taken 8976 times.
✓ Branch 11 taken 16464 times.
✓ Branch 12 taken 144 times.
✓ Branch 13 taken 336 times.
✓ Branch 14 taken 6544 times.
✓ Branch 15 taken 10240 times.
✓ Branch 16 taken 8976 times.
✓ Branch 17 taken 16464 times.
✓ Branch 18 taken 144 times.
✓ Branch 19 taken 336 times.
✓ Branch 20 taken 6544 times.
✓ Branch 21 taken 10240 times.
✓ Branch 22 taken 8976 times.
✓ Branch 23 taken 16464 times.
✓ Branch 24 taken 128 times.
✓ Branch 25 taken 320 times.
✓ Branch 26 taken 2464 times.
✓ Branch 27 taken 6160 times.
✓ Branch 28 taken 4992 times.
✓ Branch 29 taken 12480 times.
✓ Branch 30 taken 128 times.
✓ Branch 31 taken 320 times.
✓ Branch 32 taken 2464 times.
✓ Branch 33 taken 6160 times.
✓ Branch 34 taken 4992 times.
✓ Branch 35 taken 12480 times.
✓ Branch 36 taken 144 times.
✓ Branch 37 taken 336 times.
✓ Branch 38 taken 6544 times.
✓ Branch 39 taken 10240 times.
✓ Branch 40 taken 8976 times.
✓ Branch 41 taken 16464 times.
|
250448 | for (size_t index = 0; index < src_rows.channels(); ++index) { |
237 | 165040 | VectorOperationProviderType:: | |
238 | template process_one_element_with_vector_operation< | ||
239 | InnerFilterType, SourceVectorType, DestinationVectorType>( | ||
240 | 165040 | SourceVecTraits::template svptrue_pat<SV_VL1>(), src_rows, | |
241 | 165040 | dst_rows, window_row_offsets, window_col_offsets, index, filter_); | |
242 | 165040 | } | |
243 | 85408 | } | |
244 | |||
245 | 4544 | void process_pixels_of_dual_rows_without_horizontal_borders( | |
246 | size_t width, Rows<const SourceType> src_rows, | ||
247 | Rows<DestinationType> dst_rows, BorderOffsets window_row_offsets_0, | ||
248 | BorderOffsets window_row_offsets_1, | ||
249 | BorderOffsets window_col_offsets) const KLEIDICV_STREAMING { | ||
250 | 4544 | LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; | |
251 | 8018 | loop.unroll_once([&](size_t index) KLEIDICV_STREAMING { | |
252 | 3474 | svbool_t pg = SourceVecTraits::svptrue(); | |
253 | 3474 | VectorOperationProviderType:: | |
254 | template process_two_elements_with_vector_operation< | ||
255 | InnerFilterType, SourceVectorType, DestinationVectorType>( | ||
256 | 3474 | pg, src_rows, dst_rows, window_row_offsets_0, | |
257 | 3474 | window_row_offsets_1, window_col_offsets, index, filter_); | |
258 | 3474 | }); | |
259 | |||
260 | 6932 | loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { | |
261 | 2388 | svbool_t pg = SourceVecTraits::svwhilelt(index, length); | |
262 | 2388 | VectorOperationProviderType:: | |
263 | template process_two_elements_with_vector_operation< | ||
264 | InnerFilterType, SourceVectorType, DestinationVectorType>( | ||
265 | 2388 | pg, src_rows, dst_rows, window_row_offsets_0, | |
266 | 2388 | window_row_offsets_1, window_col_offsets, index, filter_); | |
267 | 2388 | }); | |
268 | 4544 | } | |
269 | |||
270 | // Processes two vertically adjacent pixels in a single column | ||
271 | 9088 | void process_two_pixels_with_horizontal_borders( | |
272 | Rows<const SourceType> src_rows, Rows<DestinationType> dst_rows, | ||
273 | BorderOffsets window_row_offsets_0, BorderOffsets window_row_offsets_1, | ||
274 | BorderOffsets window_col_offsets) const KLEIDICV_STREAMING { | ||
275 |
14/14✓ Branch 0 taken 256 times.
✓ Branch 1 taken 640 times.
✓ Branch 2 taken 2080 times.
✓ Branch 3 taken 2464 times.
✓ Branch 4 taken 2080 times.
✓ Branch 5 taken 2464 times.
✓ Branch 6 taken 2080 times.
✓ Branch 7 taken 2464 times.
✓ Branch 8 taken 256 times.
✓ Branch 9 taken 640 times.
✓ Branch 10 taken 256 times.
✓ Branch 11 taken 640 times.
✓ Branch 12 taken 2080 times.
✓ Branch 13 taken 2464 times.
|
20864 | for (size_t index = 0; index < src_rows.channels(); ++index) { |
276 | 11776 | VectorOperationProviderType:: | |
277 | template process_two_elements_with_vector_operation< | ||
278 | InnerFilterType, SourceVectorType, DestinationVectorType>( | ||
279 | 11776 | SourceVecTraits::template svptrue_pat<SV_VL1>(), src_rows, | |
280 | 11776 | dst_rows, window_row_offsets_0, window_row_offsets_1, | |
281 | 11776 | window_col_offsets, index, filter_); | |
282 | 11776 | } | |
283 | 9088 | } | |
284 | |||
285 | private: | ||
286 | InnerFilterType filter_; | ||
287 | }; | ||
288 | |||
289 | // Shorthand for 3x3 2D filters driver type. | ||
290 | template <class InnerFilterType> | ||
291 | using Filter2D3x3 = Filter2d< | ||
292 | InnerFilterType, 3UL, | ||
293 | Filter2D3x3VectorOperations< | ||
294 | typename InnerFilterType::SourceType, | ||
295 | typename InnerFilterType::DestinationType, | ||
296 | Filter2dWindowLoader3x3<typename InnerFilterType::SourceType>>>; | ||
297 | |||
298 | template <typename InnerFilterType> | ||
299 | using Filter2D5x5 = Filter2d< | ||
300 | InnerFilterType, 5UL, | ||
301 | Filter2D5x5VectorOperations< | ||
302 | typename InnerFilterType::SourceType, | ||
303 | typename InnerFilterType::DestinationType, | ||
304 | Filter2dWindowLoader5x5<typename InnerFilterType::SourceType>>>; | ||
305 | |||
306 | template <typename InnerFilterType> | ||
307 | using Filter2D7x7 = Filter2d< | ||
308 | InnerFilterType, 7UL, | ||
309 | Filter2D7x7VectorOperations< | ||
310 | typename InnerFilterType::SourceType, | ||
311 | typename InnerFilterType::DestinationType, | ||
312 | Filter2dWindowLoader7x7<typename InnerFilterType::SourceType>>>; | ||
313 | |||
314 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
315 | |||
316 | #endif // KLEIDICV_FILTER_2D_SC_H | ||
317 |