Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #ifndef KLEIDICV_SEPARABLE_FILTER_2D_SC_H | ||
6 | #define KLEIDICV_SEPARABLE_FILTER_2D_SC_H | ||
7 | |||
8 | #include <limits> | ||
9 | |||
10 | #include "kleidicv/filters/separable_filter_5x5_sc.h" | ||
11 | #include "kleidicv/kleidicv.h" | ||
12 | #include "kleidicv/sve2.h" | ||
13 | #include "kleidicv/workspace/separable.h" | ||
14 | |||
15 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
16 | |||
17 | template <typename ScalarType, size_t KernelSize> | ||
18 | class SeparableFilter2D; | ||
19 | |||
20 | template <> | ||
21 | class SeparableFilter2D<uint8_t, 5> { | ||
22 | public: | ||
23 | using SourceType = uint8_t; | ||
24 | using SourceVectorType = typename VecTraits<SourceType>::VectorType; | ||
25 | using BufferType = uint16_t; | ||
26 | using BufferVectorType = typename VecTraits<BufferType>::VectorType; | ||
27 | using BufferDoubleVectorType = typename VecTraits<BufferType>::Vector2Type; | ||
28 | using DestinationType = uint8_t; | ||
29 | |||
30 | 110 | SeparableFilter2D( | |
31 | const SourceType *kernel_x, BufferVectorType &kernel_x_0_u16, | ||
32 | BufferVectorType &kernel_x_1_u16, BufferVectorType &kernel_x_2_u16, | ||
33 | BufferVectorType &kernel_x_3_u16, BufferVectorType &kernel_x_4_u16, | ||
34 | SourceVectorType &kernel_y_0_u8, SourceVectorType &kernel_y_1_u8, | ||
35 | SourceVectorType &kernel_y_2_u8, SourceVectorType &kernel_y_3_u8, | ||
36 | SourceVectorType &kernel_y_4_u8) | ||
37 | 110 | : kernel_x_(kernel_x), | |
38 | 110 | kernel_x_0_u16_(kernel_x_0_u16), | |
39 | 110 | kernel_x_1_u16_(kernel_x_1_u16), | |
40 | 110 | kernel_x_2_u16_(kernel_x_2_u16), | |
41 | 110 | kernel_x_3_u16_(kernel_x_3_u16), | |
42 | 110 | kernel_x_4_u16_(kernel_x_4_u16), | |
43 | |||
44 | 110 | kernel_y_0_u8_(kernel_y_0_u8), | |
45 | 110 | kernel_y_1_u8_(kernel_y_1_u8), | |
46 | 110 | kernel_y_2_u8_(kernel_y_2_u8), | |
47 | 110 | kernel_y_3_u8_(kernel_y_3_u8), | |
48 | 110 | kernel_y_4_u8_(kernel_y_4_u8) {} | |
49 | |||
50 | 1568 | void vertical_vector_path(svbool_t pg, | |
51 | std::reference_wrapper<SourceVectorType> src[5], | ||
52 | BufferType *dst) const KLEIDICV_STREAMING { | ||
53 | // 0 | ||
54 | 1568 | BufferVectorType acc_b = svmullb_u16(src[0], kernel_y_0_u8_); | |
55 | 1568 | BufferVectorType acc_t = svmullt_u16(src[0], kernel_y_0_u8_); | |
56 | |||
57 | // 1 | ||
58 | 1568 | BufferVectorType vec_b = svmullb_u16(src[1], kernel_y_1_u8_); | |
59 | 1568 | BufferVectorType vec_t = svmullt_u16(src[1], kernel_y_1_u8_); | |
60 | 1568 | acc_b = svqadd_u16_x(pg, acc_b, vec_b); | |
61 | 1568 | acc_t = svqadd_u16_x(pg, acc_t, vec_t); | |
62 | |||
63 | // 2 | ||
64 | 1568 | vec_b = svmullb_u16(src[2], kernel_y_2_u8_); | |
65 | 1568 | vec_t = svmullt_u16(src[2], kernel_y_2_u8_); | |
66 | 1568 | acc_b = svqadd_u16_x(pg, acc_b, vec_b); | |
67 | 1568 | acc_t = svqadd_u16_x(pg, acc_t, vec_t); | |
68 | |||
69 | // 3 | ||
70 | 1568 | vec_b = svmullb_u16(src[3], kernel_y_3_u8_); | |
71 | 1568 | vec_t = svmullt_u16(src[3], kernel_y_3_u8_); | |
72 | 1568 | acc_b = svqadd_u16_x(pg, acc_b, vec_b); | |
73 | 1568 | acc_t = svqadd_u16_x(pg, acc_t, vec_t); | |
74 | |||
75 | // 4 | ||
76 | 1568 | vec_b = svmullb_u16(src[4], kernel_y_4_u8_); | |
77 | 1568 | vec_t = svmullt_u16(src[4], kernel_y_4_u8_); | |
78 | 1568 | acc_b = svqadd_u16_x(pg, acc_b, vec_b); | |
79 | 1568 | acc_t = svqadd_u16_x(pg, acc_t, vec_t); | |
80 | |||
81 | 1568 | BufferDoubleVectorType interleaved = svcreate2_u16(acc_b, acc_t); | |
82 | 1568 | svst2(pg, &dst[0], interleaved); | |
83 | 1568 | } | |
84 | |||
85 | 1372 | void horizontal_vector_path(svbool_t pg, | |
86 | std::reference_wrapper<BufferVectorType> src[5], | ||
87 | DestinationType *dst) const KLEIDICV_STREAMING { | ||
88 | // 0 | ||
89 | 1372 | svuint32_t acc_b = svmullb_u32(src[0], kernel_x_0_u16_); | |
90 | 1372 | svuint32_t acc_t = svmullt_u32(src[0], kernel_x_0_u16_); | |
91 | |||
92 | // 1 | ||
93 | 1372 | acc_b = svmlalb_u32(acc_b, src[1], kernel_x_1_u16_); | |
94 | 1372 | acc_t = svmlalt_u32(acc_t, src[1], kernel_x_1_u16_); | |
95 | |||
96 | // 2 | ||
97 | 1372 | acc_b = svmlalb_u32(acc_b, src[2], kernel_x_2_u16_); | |
98 | 1372 | acc_t = svmlalt_u32(acc_t, src[2], kernel_x_2_u16_); | |
99 | |||
100 | // 3 | ||
101 | 1372 | acc_b = svmlalb_u32(acc_b, src[3], kernel_x_3_u16_); | |
102 | 1372 | acc_t = svmlalt_u32(acc_t, src[3], kernel_x_3_u16_); | |
103 | |||
104 | // 4 | ||
105 | 1372 | acc_b = svmlalb_u32(acc_b, src[4], kernel_x_4_u16_); | |
106 | 1372 | acc_t = svmlalt_u32(acc_t, src[4], kernel_x_4_u16_); | |
107 | |||
108 | 1372 | svuint16_t acc_u16_b = svqxtnb_u32(acc_b); | |
109 | 1372 | svuint16_t acc_u16 = svqxtnt_u32(acc_u16_b, acc_t); | |
110 | |||
111 | 2744 | svbool_t greater = | |
112 | 1372 | svcmpgt_n_u16(pg, acc_u16, std::numeric_limits<SourceType>::max()); | |
113 | 1372 | acc_u16 = | |
114 | 1372 | svdup_n_u16_m(acc_u16, greater, std::numeric_limits<SourceType>::max()); | |
115 | |||
116 | 1372 | svst1b_u16(pg, &dst[0], acc_u16); | |
117 | 1372 | } | |
118 | |||
119 | 5728 | void horizontal_scalar_path(const BufferType src[5], | |
120 | DestinationType *dst) const KLEIDICV_STREAMING { | ||
121 | 5728 | SourceType acc; // NOLINT | |
122 |
2/2✓ Branch 0 taken 4216 times.
✓ Branch 1 taken 1512 times.
|
5728 | if (__builtin_mul_overflow(src[0], kernel_x_[0], &acc)) { |
123 | 4216 | dst[0] = std::numeric_limits<SourceType>::max(); | |
124 | 4216 | return; | |
125 | } | ||
126 | |||
127 |
4/4✓ Branch 0 taken 5761 times.
✓ Branch 1 taken 989 times.
✓ Branch 2 taken 523 times.
✓ Branch 3 taken 989 times.
|
7273 | for (size_t i = 1; i < 5; i++) { |
128 | 5761 | SourceType temp; // NOLINT | |
129 |
2/2✓ Branch 0 taken 32 times.
✓ Branch 1 taken 5729 times.
|
5761 | if (__builtin_mul_overflow(src[i], kernel_x_[i], &temp)) { |
130 | 32 | dst[0] = std::numeric_limits<SourceType>::max(); | |
131 | 32 | return; | |
132 | } | ||
133 |
2/2✓ Branch 0 taken 491 times.
✓ Branch 1 taken 5238 times.
|
5729 | if (__builtin_add_overflow(acc, temp, &acc)) { |
134 | 491 | dst[0] = std::numeric_limits<SourceType>::max(); | |
135 | 491 | return; | |
136 | } | ||
137 | 5761 | } | |
138 | |||
139 | 989 | dst[0] = acc; | |
140 | 5728 | } | |
141 | |||
142 | private: | ||
143 | const SourceType *kernel_x_; | ||
144 | |||
145 | BufferVectorType &kernel_x_0_u16_; | ||
146 | BufferVectorType &kernel_x_1_u16_; | ||
147 | BufferVectorType &kernel_x_2_u16_; | ||
148 | BufferVectorType &kernel_x_3_u16_; | ||
149 | BufferVectorType &kernel_x_4_u16_; | ||
150 | |||
151 | SourceVectorType &kernel_y_0_u8_; | ||
152 | SourceVectorType &kernel_y_1_u8_; | ||
153 | SourceVectorType &kernel_y_2_u8_; | ||
154 | SourceVectorType &kernel_y_3_u8_; | ||
155 | SourceVectorType &kernel_y_4_u8_; | ||
156 | }; // end of class SeparableFilter2D<uint8_t, 5> | ||
157 | |||
158 | template <> | ||
159 | class SeparableFilter2D<uint16_t, 5> { | ||
160 | public: | ||
161 | using SourceType = uint16_t; | ||
162 | using SourceVectorType = typename VecTraits<SourceType>::VectorType; | ||
163 | using BufferType = uint32_t; | ||
164 | using BufferVectorType = typename VecTraits<BufferType>::VectorType; | ||
165 | using BufferDoubleVectorType = typename VecTraits<BufferType>::Vector2Type; | ||
166 | using DestinationType = uint16_t; | ||
167 | |||
168 | 110 | SeparableFilter2D( | |
169 | const SourceType *kernel_x, BufferVectorType &kernel_x_0_u32, | ||
170 | BufferVectorType &kernel_x_1_u32, BufferVectorType &kernel_x_2_u32, | ||
171 | BufferVectorType &kernel_x_3_u32, BufferVectorType &kernel_x_4_u32, | ||
172 | SourceVectorType &kernel_y_0_u16, SourceVectorType &kernel_y_1_u16, | ||
173 | SourceVectorType &kernel_y_2_u16, SourceVectorType &kernel_y_3_u16, | ||
174 | SourceVectorType &kernel_y_4_u16) | ||
175 | 110 | : kernel_x_(kernel_x), | |
176 | 110 | kernel_x_0_u32_(kernel_x_0_u32), | |
177 | 110 | kernel_x_1_u32_(kernel_x_1_u32), | |
178 | 110 | kernel_x_2_u32_(kernel_x_2_u32), | |
179 | 110 | kernel_x_3_u32_(kernel_x_3_u32), | |
180 | 110 | kernel_x_4_u32_(kernel_x_4_u32), | |
181 | |||
182 | 110 | kernel_y_0_u16_(kernel_y_0_u16), | |
183 | 110 | kernel_y_1_u16_(kernel_y_1_u16), | |
184 | 110 | kernel_y_2_u16_(kernel_y_2_u16), | |
185 | 110 | kernel_y_3_u16_(kernel_y_3_u16), | |
186 | 110 | kernel_y_4_u16_(kernel_y_4_u16) {} | |
187 | |||
188 | 2068 | void vertical_vector_path(svbool_t pg, | |
189 | std::reference_wrapper<SourceVectorType> src[5], | ||
190 | BufferType *dst) const KLEIDICV_STREAMING { | ||
191 | // 0 | ||
192 | 2068 | BufferVectorType acc_b = svmullb_u32(src[0], kernel_y_0_u16_); | |
193 | 2068 | BufferVectorType acc_t = svmullt_u32(src[0], kernel_y_0_u16_); | |
194 | |||
195 | // 1 | ||
196 | 2068 | BufferVectorType vec_b = svmullb_u32(src[1], kernel_y_1_u16_); | |
197 | 2068 | BufferVectorType vec_t = svmullt_u32(src[1], kernel_y_1_u16_); | |
198 | 2068 | acc_b = svqadd_u32_x(pg, acc_b, vec_b); | |
199 | 2068 | acc_t = svqadd_u32_x(pg, acc_t, vec_t); | |
200 | |||
201 | // 2 | ||
202 | 2068 | vec_b = svmullb_u32(src[2], kernel_y_2_u16_); | |
203 | 2068 | vec_t = svmullt_u32(src[2], kernel_y_2_u16_); | |
204 | 2068 | acc_b = svqadd_u32_x(pg, acc_b, vec_b); | |
205 | 2068 | acc_t = svqadd_u32_x(pg, acc_t, vec_t); | |
206 | |||
207 | // 3 | ||
208 | 2068 | vec_b = svmullb_u32(src[3], kernel_y_3_u16_); | |
209 | 2068 | vec_t = svmullt_u32(src[3], kernel_y_3_u16_); | |
210 | 2068 | acc_b = svqadd_u32_x(pg, acc_b, vec_b); | |
211 | 2068 | acc_t = svqadd_u32_x(pg, acc_t, vec_t); | |
212 | |||
213 | // 4 | ||
214 | 2068 | vec_b = svmullb_u32(src[4], kernel_y_4_u16_); | |
215 | 2068 | vec_t = svmullt_u32(src[4], kernel_y_4_u16_); | |
216 | 2068 | acc_b = svqadd_u32_x(pg, acc_b, vec_b); | |
217 | 2068 | acc_t = svqadd_u32_x(pg, acc_t, vec_t); | |
218 | |||
219 | 2068 | BufferDoubleVectorType interleaved = svcreate2_u32(acc_b, acc_t); | |
220 | 2068 | svst2(pg, &dst[0], interleaved); | |
221 | 2068 | } | |
222 | |||
223 | 2083 | void horizontal_vector_path(svbool_t pg, | |
224 | std::reference_wrapper<BufferVectorType> src[5], | ||
225 | DestinationType *dst) const KLEIDICV_STREAMING { | ||
226 | // 0 | ||
227 | 2083 | svuint64_t acc_b = svmullb_u64(src[0], kernel_x_0_u32_); | |
228 | 2083 | svuint64_t acc_t = svmullt_u64(src[0], kernel_x_0_u32_); | |
229 | |||
230 | // 1 | ||
231 | 2083 | acc_b = svmlalb_u64(acc_b, src[1], kernel_x_1_u32_); | |
232 | 2083 | acc_t = svmlalt_u64(acc_t, src[1], kernel_x_1_u32_); | |
233 | |||
234 | // 2 | ||
235 | 2083 | acc_b = svmlalb_u64(acc_b, src[2], kernel_x_2_u32_); | |
236 | 2083 | acc_t = svmlalt_u64(acc_t, src[2], kernel_x_2_u32_); | |
237 | |||
238 | // 3 | ||
239 | 2083 | acc_b = svmlalb_u64(acc_b, src[3], kernel_x_3_u32_); | |
240 | 2083 | acc_t = svmlalt_u64(acc_t, src[3], kernel_x_3_u32_); | |
241 | |||
242 | // 4 | ||
243 | 2083 | acc_b = svmlalb_u64(acc_b, src[4], kernel_x_4_u32_); | |
244 | 2083 | acc_t = svmlalt_u64(acc_t, src[4], kernel_x_4_u32_); | |
245 | |||
246 | 2083 | svuint32_t acc_u32_b = svqxtnb_u64(acc_b); | |
247 | 2083 | svuint32_t acc_u32 = svqxtnt_u64(acc_u32_b, acc_t); | |
248 | |||
249 | 4166 | svbool_t greater = | |
250 | 2083 | svcmpgt_n_u32(pg, acc_u32, std::numeric_limits<SourceType>::max()); | |
251 | 2083 | acc_u32 = | |
252 | 2083 | svdup_n_u32_m(acc_u32, greater, std::numeric_limits<SourceType>::max()); | |
253 | |||
254 | 2083 | svst1h_u32(pg, &dst[0], acc_u32); | |
255 | 2083 | } | |
256 | |||
257 | 5800 | void horizontal_scalar_path(const BufferType src[5], | |
258 | DestinationType *dst) const KLEIDICV_STREAMING { | ||
259 | 5800 | SourceType acc; // Avoid cppcoreguidelines-init-variables. NOLINT | |
260 |
2/2✓ Branch 0 taken 4260 times.
✓ Branch 1 taken 1540 times.
|
5800 | if (__builtin_mul_overflow(src[0], kernel_x_[0], &acc)) { |
261 | 4260 | dst[0] = std::numeric_limits<SourceType>::max(); | |
262 | 4260 | return; | |
263 | } | ||
264 | |||
265 |
4/4✓ Branch 0 taken 6144 times.
✓ Branch 1 taken 1502 times.
✓ Branch 2 taken 38 times.
✓ Branch 3 taken 1502 times.
|
7684 | for (size_t i = 1; i < 5; i++) { |
266 | 6144 | SourceType temp; // Avoid cppcoreguidelines-init-variables. NOLINT | |
267 |
2/2✓ Branch 0 taken 4 times.
✓ Branch 1 taken 6140 times.
|
6144 | if (__builtin_mul_overflow(src[i], kernel_x_[i], &temp)) { |
268 | 4 | dst[0] = std::numeric_limits<SourceType>::max(); | |
269 | 4 | return; | |
270 | } | ||
271 |
2/2✓ Branch 0 taken 34 times.
✓ Branch 1 taken 6106 times.
|
6140 | if (__builtin_add_overflow(acc, temp, &acc)) { |
272 | 34 | dst[0] = std::numeric_limits<SourceType>::max(); | |
273 | 34 | return; | |
274 | } | ||
275 | 6144 | } | |
276 | |||
277 | 1502 | dst[0] = acc; | |
278 | 5800 | } | |
279 | |||
280 | private: | ||
281 | const SourceType *kernel_x_; | ||
282 | |||
283 | BufferVectorType &kernel_x_0_u32_; | ||
284 | BufferVectorType &kernel_x_1_u32_; | ||
285 | BufferVectorType &kernel_x_2_u32_; | ||
286 | BufferVectorType &kernel_x_3_u32_; | ||
287 | BufferVectorType &kernel_x_4_u32_; | ||
288 | |||
289 | SourceVectorType &kernel_y_0_u16_; | ||
290 | SourceVectorType &kernel_y_1_u16_; | ||
291 | SourceVectorType &kernel_y_2_u16_; | ||
292 | SourceVectorType &kernel_y_3_u16_; | ||
293 | SourceVectorType &kernel_y_4_u16_; | ||
294 | }; // end of class SeparableFilter2D<uint16_t, 5> | ||
295 | |||
296 | template <> | ||
297 | class SeparableFilter2D<int16_t, 5> { | ||
298 | public: | ||
299 | using SourceType = int16_t; | ||
300 | using SourceVectorType = typename VecTraits<SourceType>::VectorType; | ||
301 | using BufferType = int32_t; | ||
302 | using BufferVectorType = typename VecTraits<BufferType>::VectorType; | ||
303 | using BufferDoubleVectorType = typename VecTraits<BufferType>::Vector2Type; | ||
304 | using DestinationType = int16_t; | ||
305 | |||
306 | 108 | SeparableFilter2D( | |
307 | const SourceType *kernel_x, BufferVectorType &kernel_x_0_s32, | ||
308 | BufferVectorType &kernel_x_1_s32, BufferVectorType &kernel_x_2_s32, | ||
309 | BufferVectorType &kernel_x_3_s32, BufferVectorType &kernel_x_4_s32, | ||
310 | SourceVectorType &kernel_y_0_s16, SourceVectorType &kernel_y_1_s16, | ||
311 | SourceVectorType &kernel_y_2_s16, SourceVectorType &kernel_y_3_s16, | ||
312 | SourceVectorType &kernel_y_4_s16) | ||
313 | 108 | : kernel_x_(kernel_x), | |
314 | 108 | kernel_x_0_s32_(kernel_x_0_s32), | |
315 | 108 | kernel_x_1_s32_(kernel_x_1_s32), | |
316 | 108 | kernel_x_2_s32_(kernel_x_2_s32), | |
317 | 108 | kernel_x_3_s32_(kernel_x_3_s32), | |
318 | 108 | kernel_x_4_s32_(kernel_x_4_s32), | |
319 | |||
320 | 108 | kernel_y_0_s16_(kernel_y_0_s16), | |
321 | 108 | kernel_y_1_s16_(kernel_y_1_s16), | |
322 | 108 | kernel_y_2_s16_(kernel_y_2_s16), | |
323 | 108 | kernel_y_3_s16_(kernel_y_3_s16), | |
324 | 108 | kernel_y_4_s16_(kernel_y_4_s16) {} | |
325 | |||
326 | 2060 | void vertical_vector_path(svbool_t pg, | |
327 | std::reference_wrapper<SourceVectorType> src[5], | ||
328 | BufferType *dst) const KLEIDICV_STREAMING { | ||
329 | // 0 | ||
330 | 2060 | BufferVectorType acc_b = svmullb_s32(src[0], kernel_y_0_s16_); | |
331 | 2060 | BufferVectorType acc_t = svmullt_s32(src[0], kernel_y_0_s16_); | |
332 | |||
333 | // 1 | ||
334 | 2060 | BufferVectorType vec_b = svmullb_s32(src[1], kernel_y_1_s16_); | |
335 | 2060 | BufferVectorType vec_t = svmullt_s32(src[1], kernel_y_1_s16_); | |
336 | 2060 | acc_b = svqadd_s32_x(pg, acc_b, vec_b); | |
337 | 2060 | acc_t = svqadd_s32_x(pg, acc_t, vec_t); | |
338 | |||
339 | // 2 | ||
340 | 2060 | vec_b = svmullb_s32(src[2], kernel_y_2_s16_); | |
341 | 2060 | vec_t = svmullt_s32(src[2], kernel_y_2_s16_); | |
342 | 2060 | acc_b = svqadd_s32_x(pg, acc_b, vec_b); | |
343 | 2060 | acc_t = svqadd_s32_x(pg, acc_t, vec_t); | |
344 | |||
345 | // 3 | ||
346 | 2060 | vec_b = svmullb_s32(src[3], kernel_y_3_s16_); | |
347 | 2060 | vec_t = svmullt_s32(src[3], kernel_y_3_s16_); | |
348 | 2060 | acc_b = svqadd_s32_x(pg, acc_b, vec_b); | |
349 | 2060 | acc_t = svqadd_s32_x(pg, acc_t, vec_t); | |
350 | |||
351 | // 4 | ||
352 | 2060 | vec_b = svmullb_s32(src[4], kernel_y_4_s16_); | |
353 | 2060 | vec_t = svmullt_s32(src[4], kernel_y_4_s16_); | |
354 | 2060 | acc_b = svqadd_s32_x(pg, acc_b, vec_b); | |
355 | 2060 | acc_t = svqadd_s32_x(pg, acc_t, vec_t); | |
356 | |||
357 | 2060 | BufferDoubleVectorType interleaved = svcreate2_s32(acc_b, acc_t); | |
358 | 2060 | svst2(pg, &dst[0], interleaved); | |
359 | 2060 | } | |
360 | |||
361 | 2084 | void horizontal_vector_path(svbool_t pg, | |
362 | std::reference_wrapper<BufferVectorType> src[5], | ||
363 | DestinationType *dst) const KLEIDICV_STREAMING { | ||
364 | // 0 | ||
365 | 2084 | svint64_t acc_b = svmullb_s64(src[0], kernel_x_0_s32_); | |
366 | 2084 | svint64_t acc_t = svmullt_s64(src[0], kernel_x_0_s32_); | |
367 | |||
368 | // 1 | ||
369 | 2084 | acc_b = svmlalb_s64(acc_b, src[1], kernel_x_1_s32_); | |
370 | 2084 | acc_t = svmlalt_s64(acc_t, src[1], kernel_x_1_s32_); | |
371 | |||
372 | // 2 | ||
373 | 2084 | acc_b = svmlalb_s64(acc_b, src[2], kernel_x_2_s32_); | |
374 | 2084 | acc_t = svmlalt_s64(acc_t, src[2], kernel_x_2_s32_); | |
375 | |||
376 | // 3 | ||
377 | 2084 | acc_b = svmlalb_s64(acc_b, src[3], kernel_x_3_s32_); | |
378 | 2084 | acc_t = svmlalt_s64(acc_t, src[3], kernel_x_3_s32_); | |
379 | |||
380 | // 4 | ||
381 | 2084 | acc_b = svmlalb_s64(acc_b, src[4], kernel_x_4_s32_); | |
382 | 2084 | acc_t = svmlalt_s64(acc_t, src[4], kernel_x_4_s32_); | |
383 | |||
384 | 2084 | svint32_t acc_s32_b = svqxtnb_s64(acc_b); | |
385 | 2084 | svint32_t acc_s32 = svqxtnt_s64(acc_s32_b, acc_t); | |
386 | |||
387 | 4168 | svbool_t less = | |
388 | 2084 | svcmplt_n_s32(pg, acc_s32, std::numeric_limits<SourceType>::min()); | |
389 | 2084 | acc_s32 = | |
390 | 2084 | svdup_n_s32_m(acc_s32, less, std::numeric_limits<SourceType>::min()); | |
391 | |||
392 | 4168 | svbool_t greater = | |
393 | 2084 | svcmpgt_n_s32(pg, acc_s32, std::numeric_limits<SourceType>::max()); | |
394 | 2084 | acc_s32 = | |
395 | 2084 | svdup_n_s32_m(acc_s32, greater, std::numeric_limits<SourceType>::max()); | |
396 | |||
397 | 2084 | svst1h_s32(pg, &dst[0], acc_s32); | |
398 | 2084 | } | |
399 | |||
400 | 5760 | void horizontal_scalar_path(const BufferType src[5], | |
401 | DestinationType *dst) const KLEIDICV_STREAMING { | ||
402 | 5760 | int64_t acc = static_cast<int64_t>(src[0]) * kernel_x_[0]; | |
403 |
2/2✓ Branch 0 taken 23040 times.
✓ Branch 1 taken 5760 times.
|
28800 | for (size_t i = 1; i < 5; i++) { |
404 | 23040 | acc += static_cast<int64_t>(src[i]) * kernel_x_[i]; | |
405 | 23040 | } | |
406 | |||
407 |
2/2✓ Branch 0 taken 2198 times.
✓ Branch 1 taken 3562 times.
|
5760 | if (acc < std::numeric_limits<DestinationType>::min()) { |
408 | 2198 | acc = std::numeric_limits<DestinationType>::min(); | |
409 |
2/2✓ Branch 0 taken 1588 times.
✓ Branch 1 taken 1974 times.
|
5760 | } else if (acc > std::numeric_limits<DestinationType>::max()) { |
410 | 1974 | acc = std::numeric_limits<DestinationType>::max(); | |
411 | 1974 | } | |
412 | |||
413 | 5760 | dst[0] = static_cast<DestinationType>(acc); | |
414 | 5760 | } | |
415 | |||
416 | private: | ||
417 | const SourceType *kernel_x_; | ||
418 | |||
419 | BufferVectorType &kernel_x_0_s32_; | ||
420 | BufferVectorType &kernel_x_1_s32_; | ||
421 | BufferVectorType &kernel_x_2_s32_; | ||
422 | BufferVectorType &kernel_x_3_s32_; | ||
423 | BufferVectorType &kernel_x_4_s32_; | ||
424 | |||
425 | SourceVectorType &kernel_y_0_s16_; | ||
426 | SourceVectorType &kernel_y_1_s16_; | ||
427 | SourceVectorType &kernel_y_2_s16_; | ||
428 | SourceVectorType &kernel_y_3_s16_; | ||
429 | SourceVectorType &kernel_y_4_s16_; | ||
430 | }; // end of class SeparableFilter2D<int16_t, 5> | ||
431 | |||
432 | template <typename T> | ||
433 | 400 | static kleidicv_error_t separable_filter_2d_checks( | |
434 | const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width, | ||
435 | size_t height, size_t channels, const T *kernel_x, const T *kernel_y, | ||
436 | SeparableFilterWorkspace *workspace) KLEIDICV_STREAMING { | ||
437 |
6/6✓ Branch 0 taken 6 times.
✓ Branch 1 taken 128 times.
✓ Branch 2 taken 6 times.
✓ Branch 3 taken 128 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 126 times.
|
400 | CHECK_POINTERS(workspace, kernel_x, kernel_y); |
438 | |||
439 |
12/12✓ Branch 0 taken 2 times.
✓ Branch 1 taken 126 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 126 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 126 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 126 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 124 times.
✓ Branch 10 taken 2 times.
✓ Branch 11 taken 124 times.
|
382 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
440 |
12/12✓ Branch 0 taken 2 times.
✓ Branch 1 taken 124 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 124 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 124 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 124 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 122 times.
✓ Branch 10 taken 2 times.
✓ Branch 11 taken 122 times.
|
376 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
441 |
18/18✓ Branch 0 taken 2 times.
✓ Branch 1 taken 122 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 120 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 120 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 122 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 120 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 120 times.
✓ Branch 12 taken 2 times.
✓ Branch 13 taken 120 times.
✓ Branch 14 taken 2 times.
✓ Branch 15 taken 118 times.
✓ Branch 16 taken 4 times.
✓ Branch 17 taken 118 times.
|
370 | CHECK_IMAGE_SIZE(width, height); |
442 | |||
443 |
6/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 118 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 118 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 116 times.
|
358 | if (channels > KLEIDICV_MAXIMUM_CHANNEL_COUNT) { |
444 | 6 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
445 | } | ||
446 | |||
447 |
6/6✓ Branch 0 taken 2 times.
✓ Branch 1 taken 116 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 116 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 114 times.
|
352 | if (workspace->channels() < channels) { |
448 | 6 | return KLEIDICV_ERROR_CONTEXT_MISMATCH; | |
449 | } | ||
450 | |||
451 | 346 | const Rectangle &context_rect = workspace->image_size(); | |
452 |
12/12✓ Branch 0 taken 112 times.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 110 times.
✓ Branch 4 taken 112 times.
✓ Branch 5 taken 4 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 110 times.
✓ Branch 8 taken 110 times.
✓ Branch 9 taken 4 times.
✓ Branch 10 taken 2 times.
✓ Branch 11 taken 108 times.
|
346 | if (context_rect.width() < width || context_rect.height() < height) { |
453 | 18 | return KLEIDICV_ERROR_CONTEXT_MISMATCH; | |
454 | } | ||
455 | |||
456 | 328 | return KLEIDICV_OK; | |
457 | 400 | } | |
458 | |||
459 | template <typename T> | ||
460 | 400 | kleidicv_error_t separable_filter_2d_stripe_sc( | |
461 | const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width, | ||
462 | size_t height, size_t y_begin, size_t y_end, size_t channels, | ||
463 | const T *kernel_x, size_t /*kernel_width*/, const T *kernel_y, | ||
464 | size_t /*kernel_height*/, FixedBorderType fixed_border_type, | ||
465 | kleidicv_filter_context_t *context) KLEIDICV_STREAMING { | ||
466 | 400 | auto *workspace = reinterpret_cast<SeparableFilterWorkspace *>(context); | |
467 | 800 | kleidicv_error_t checks_result = separable_filter_2d_checks( | |
468 | 400 | src, src_stride, dst, dst_stride, width, height, channels, kernel_x, | |
469 | 400 | kernel_y, workspace); | |
470 | |||
471 |
6/6✓ Branch 0 taken 24 times.
✓ Branch 1 taken 110 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 110 times.
✓ Branch 4 taken 24 times.
✓ Branch 5 taken 108 times.
|
400 | if (checks_result != KLEIDICV_OK) { |
472 | 72 | return checks_result; | |
473 | } | ||
474 | |||
475 | 328 | Rectangle rect{width, height}; | |
476 | |||
477 | using SeparableFilterClass = SeparableFilter2D<T, 5>; | ||
478 | |||
479 | using WiderT = typename double_element_width<T>::type; | ||
480 | using KernelXVectorTraits = VecTraits<WiderT>; | ||
481 | using KernelXVectorT = typename KernelXVectorTraits::VectorType; | ||
482 | using KernelYVectorTraits = VecTraits<T>; | ||
483 | using KernelYVectorT = typename KernelYVectorTraits::VectorType; | ||
484 | |||
485 | 328 | KernelXVectorT kernel_x_0 = KernelXVectorTraits::svdup(kernel_x[0]); | |
486 | 328 | KernelXVectorT kernel_x_1 = KernelXVectorTraits::svdup(kernel_x[1]); | |
487 | 328 | KernelXVectorT kernel_x_2 = KernelXVectorTraits::svdup(kernel_x[2]); | |
488 | 328 | KernelXVectorT kernel_x_3 = KernelXVectorTraits::svdup(kernel_x[3]); | |
489 | 328 | KernelXVectorT kernel_x_4 = KernelXVectorTraits::svdup(kernel_x[4]); | |
490 | |||
491 | 328 | KernelYVectorT kernel_y_0 = KernelYVectorTraits::svdup(kernel_y[0]); | |
492 | 328 | KernelYVectorT kernel_y_1 = KernelYVectorTraits::svdup(kernel_y[1]); | |
493 | 328 | KernelYVectorT kernel_y_2 = KernelYVectorTraits::svdup(kernel_y[2]); | |
494 | 328 | KernelYVectorT kernel_y_3 = KernelYVectorTraits::svdup(kernel_y[3]); | |
495 | 328 | KernelYVectorT kernel_y_4 = KernelYVectorTraits::svdup(kernel_y[4]); | |
496 | |||
497 | 656 | SeparableFilterClass filterClass{ | |
498 | 328 | kernel_x, kernel_x_0, kernel_x_1, kernel_x_2, kernel_x_3, kernel_x_4, | |
499 | kernel_y_0, kernel_y_1, kernel_y_2, kernel_y_3, kernel_y_4}; | ||
500 | 328 | SeparableFilter<SeparableFilterClass, 5> filter{filterClass}; | |
501 | |||
502 | 328 | Rows<const T> src_rows{src, src_stride, channels}; | |
503 | 328 | Rows<T> dst_rows{dst, dst_stride, channels}; | |
504 | 656 | workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, | |
505 | 328 | fixed_border_type, filter); | |
506 | |||
507 | 328 | return KLEIDICV_OK; | |
508 | 400 | } | |
509 | |||
510 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
511 | |||
512 | #endif // KLEIDICV_SEPARABLE_FILTER_2D_SC_H | ||
513 |