| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #ifndef KLEIDICV_SEPARABLE_FILTER_2D_SC_H | ||
| 6 | #define KLEIDICV_SEPARABLE_FILTER_2D_SC_H | ||
| 7 | |||
| 8 | #include <limits> | ||
| 9 | |||
| 10 | #include "kleidicv/filters/separable_filter_5x5_sc.h" | ||
| 11 | #include "kleidicv/kleidicv.h" | ||
| 12 | #include "kleidicv/sve2.h" | ||
| 13 | #include "kleidicv/workspace/separable.h" | ||
| 14 | |||
| 15 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
| 16 | |||
| 17 | template <typename ScalarType, size_t KernelSize> | ||
| 18 | class SeparableFilter2D; | ||
| 19 | |||
| 20 | template <> | ||
| 21 | class SeparableFilter2D<uint8_t, 5> { | ||
| 22 | public: | ||
| 23 | using SourceType = uint8_t; | ||
| 24 | using SourceVectorType = typename VecTraits<SourceType>::VectorType; | ||
| 25 | using BufferType = uint16_t; | ||
| 26 | using BufferVectorType = typename VecTraits<BufferType>::VectorType; | ||
| 27 | using BufferDoubleVectorType = typename VecTraits<BufferType>::Vector2Type; | ||
| 28 | using DestinationType = uint8_t; | ||
| 29 | |||
| 30 | 165 | SeparableFilter2D( | |
| 31 | const SourceType *kernel_x, BufferVectorType &kernel_x_0_u16, | ||
| 32 | BufferVectorType &kernel_x_1_u16, BufferVectorType &kernel_x_2_u16, | ||
| 33 | BufferVectorType &kernel_x_3_u16, BufferVectorType &kernel_x_4_u16, | ||
| 34 | SourceVectorType &kernel_y_0_u8, SourceVectorType &kernel_y_1_u8, | ||
| 35 | SourceVectorType &kernel_y_2_u8, SourceVectorType &kernel_y_3_u8, | ||
| 36 | SourceVectorType &kernel_y_4_u8) | ||
| 37 | 165 | : kernel_x_(kernel_x), | |
| 38 | 165 | kernel_x_0_u16_(kernel_x_0_u16), | |
| 39 | 165 | kernel_x_1_u16_(kernel_x_1_u16), | |
| 40 | 165 | kernel_x_2_u16_(kernel_x_2_u16), | |
| 41 | 165 | kernel_x_3_u16_(kernel_x_3_u16), | |
| 42 | 165 | kernel_x_4_u16_(kernel_x_4_u16), | |
| 43 | |||
| 44 | 165 | kernel_y_0_u8_(kernel_y_0_u8), | |
| 45 | 165 | kernel_y_1_u8_(kernel_y_1_u8), | |
| 46 | 165 | kernel_y_2_u8_(kernel_y_2_u8), | |
| 47 | 165 | kernel_y_3_u8_(kernel_y_3_u8), | |
| 48 | 165 | kernel_y_4_u8_(kernel_y_4_u8) {} | |
| 49 | |||
| 50 | 2284 | void vertical_vector_path(svbool_t pg, | |
| 51 | std::reference_wrapper<SourceVectorType> src[5], | ||
| 52 | BufferType *dst) const KLEIDICV_STREAMING { | ||
| 53 | // 0 | ||
| 54 | 2284 | BufferVectorType acc_b = svmullb_u16(src[0], kernel_y_0_u8_); | |
| 55 | 2284 | BufferVectorType acc_t = svmullt_u16(src[0], kernel_y_0_u8_); | |
| 56 | |||
| 57 | // 1 | ||
| 58 | 2284 | BufferVectorType vec_b = svmullb_u16(src[1], kernel_y_1_u8_); | |
| 59 | 2284 | BufferVectorType vec_t = svmullt_u16(src[1], kernel_y_1_u8_); | |
| 60 | 2284 | acc_b = svqadd_u16_x(pg, acc_b, vec_b); | |
| 61 | 2284 | acc_t = svqadd_u16_x(pg, acc_t, vec_t); | |
| 62 | |||
| 63 | // 2 | ||
| 64 | 2284 | vec_b = svmullb_u16(src[2], kernel_y_2_u8_); | |
| 65 | 2284 | vec_t = svmullt_u16(src[2], kernel_y_2_u8_); | |
| 66 | 2284 | acc_b = svqadd_u16_x(pg, acc_b, vec_b); | |
| 67 | 2284 | acc_t = svqadd_u16_x(pg, acc_t, vec_t); | |
| 68 | |||
| 69 | // 3 | ||
| 70 | 2284 | vec_b = svmullb_u16(src[3], kernel_y_3_u8_); | |
| 71 | 2284 | vec_t = svmullt_u16(src[3], kernel_y_3_u8_); | |
| 72 | 2284 | acc_b = svqadd_u16_x(pg, acc_b, vec_b); | |
| 73 | 2284 | acc_t = svqadd_u16_x(pg, acc_t, vec_t); | |
| 74 | |||
| 75 | // 4 | ||
| 76 | 2284 | vec_b = svmullb_u16(src[4], kernel_y_4_u8_); | |
| 77 | 2284 | vec_t = svmullt_u16(src[4], kernel_y_4_u8_); | |
| 78 | 2284 | acc_b = svqadd_u16_x(pg, acc_b, vec_b); | |
| 79 | 2284 | acc_t = svqadd_u16_x(pg, acc_t, vec_t); | |
| 80 | |||
| 81 | 2284 | BufferDoubleVectorType interleaved = svcreate2_u16(acc_b, acc_t); | |
| 82 | 2284 | svst2(pg, &dst[0], interleaved); | |
| 83 | 2284 | } | |
| 84 | |||
| 85 | 1950 | void horizontal_vector_path(svbool_t pg, | |
| 86 | std::reference_wrapper<BufferVectorType> src[5], | ||
| 87 | DestinationType *dst) const KLEIDICV_STREAMING { | ||
| 88 | // 0 | ||
| 89 | 1950 | svuint32_t acc_b = svmullb_u32(src[0], kernel_x_0_u16_); | |
| 90 | 1950 | svuint32_t acc_t = svmullt_u32(src[0], kernel_x_0_u16_); | |
| 91 | |||
| 92 | // 1 | ||
| 93 | 1950 | acc_b = svmlalb_u32(acc_b, src[1], kernel_x_1_u16_); | |
| 94 | 1950 | acc_t = svmlalt_u32(acc_t, src[1], kernel_x_1_u16_); | |
| 95 | |||
| 96 | // 2 | ||
| 97 | 1950 | acc_b = svmlalb_u32(acc_b, src[2], kernel_x_2_u16_); | |
| 98 | 1950 | acc_t = svmlalt_u32(acc_t, src[2], kernel_x_2_u16_); | |
| 99 | |||
| 100 | // 3 | ||
| 101 | 1950 | acc_b = svmlalb_u32(acc_b, src[3], kernel_x_3_u16_); | |
| 102 | 1950 | acc_t = svmlalt_u32(acc_t, src[3], kernel_x_3_u16_); | |
| 103 | |||
| 104 | // 4 | ||
| 105 | 1950 | acc_b = svmlalb_u32(acc_b, src[4], kernel_x_4_u16_); | |
| 106 | 1950 | acc_t = svmlalt_u32(acc_t, src[4], kernel_x_4_u16_); | |
| 107 | |||
| 108 | 1950 | svuint16_t acc_u16_b = svqxtnb_u32(acc_b); | |
| 109 | 1950 | svuint16_t acc_u16 = svqxtnt_u32(acc_u16_b, acc_t); | |
| 110 | |||
| 111 | 3900 | svbool_t greater = | |
| 112 | 1950 | svcmpgt_n_u16(pg, acc_u16, std::numeric_limits<SourceType>::max()); | |
| 113 | 1950 | acc_u16 = | |
| 114 | 1950 | svdup_n_u16_m(acc_u16, greater, std::numeric_limits<SourceType>::max()); | |
| 115 | |||
| 116 | 1950 | svst1b_u16(pg, &dst[0], acc_u16); | |
| 117 | 1950 | } | |
| 118 | |||
| 119 | 8592 | void horizontal_scalar_path(const BufferType src[5], | |
| 120 | DestinationType *dst) const KLEIDICV_STREAMING { | ||
| 121 | 8592 | SourceType acc; // NOLINT | |
| 122 |
2/2✓ Branch 0 taken 6324 times.
✓ Branch 1 taken 2268 times.
|
8592 | if (__builtin_mul_overflow(src[0], kernel_x_[0], &acc)) { |
| 123 | 6324 | dst[0] = std::numeric_limits<SourceType>::max(); | |
| 124 | 6324 | return; | |
| 125 | } | ||
| 126 | |||
| 127 |
4/4✓ Branch 0 taken 8672 times.
✓ Branch 1 taken 1573 times.
✓ Branch 2 taken 695 times.
✓ Branch 3 taken 1573 times.
|
10940 | for (size_t i = 1; i < 5; i++) { |
| 128 | 8672 | SourceType temp; // NOLINT | |
| 129 |
2/2✓ Branch 0 taken 48 times.
✓ Branch 1 taken 8624 times.
|
8672 | if (__builtin_mul_overflow(src[i], kernel_x_[i], &temp)) { |
| 130 | 48 | dst[0] = std::numeric_limits<SourceType>::max(); | |
| 131 | 48 | return; | |
| 132 | } | ||
| 133 |
2/2✓ Branch 0 taken 647 times.
✓ Branch 1 taken 7977 times.
|
8624 | if (__builtin_add_overflow(acc, temp, &acc)) { |
| 134 | 647 | dst[0] = std::numeric_limits<SourceType>::max(); | |
| 135 | 647 | return; | |
| 136 | } | ||
| 137 | 8672 | } | |
| 138 | |||
| 139 | 1573 | dst[0] = acc; | |
| 140 | 8592 | } | |
| 141 | |||
| 142 | private: | ||
| 143 | const SourceType *kernel_x_; | ||
| 144 | |||
| 145 | BufferVectorType &kernel_x_0_u16_; | ||
| 146 | BufferVectorType &kernel_x_1_u16_; | ||
| 147 | BufferVectorType &kernel_x_2_u16_; | ||
| 148 | BufferVectorType &kernel_x_3_u16_; | ||
| 149 | BufferVectorType &kernel_x_4_u16_; | ||
| 150 | |||
| 151 | SourceVectorType &kernel_y_0_u8_; | ||
| 152 | SourceVectorType &kernel_y_1_u8_; | ||
| 153 | SourceVectorType &kernel_y_2_u8_; | ||
| 154 | SourceVectorType &kernel_y_3_u8_; | ||
| 155 | SourceVectorType &kernel_y_4_u8_; | ||
| 156 | }; // end of class SeparableFilter2D<uint8_t, 5> | ||
| 157 | |||
| 158 | template <> | ||
| 159 | class SeparableFilter2D<uint16_t, 5> { | ||
| 160 | public: | ||
| 161 | using SourceType = uint16_t; | ||
| 162 | using SourceVectorType = typename VecTraits<SourceType>::VectorType; | ||
| 163 | using BufferType = uint32_t; | ||
| 164 | using BufferVectorType = typename VecTraits<BufferType>::VectorType; | ||
| 165 | using BufferDoubleVectorType = typename VecTraits<BufferType>::Vector2Type; | ||
| 166 | using DestinationType = uint16_t; | ||
| 167 | |||
| 168 | 165 | SeparableFilter2D( | |
| 169 | const SourceType *kernel_x, BufferVectorType &kernel_x_0_u32, | ||
| 170 | BufferVectorType &kernel_x_1_u32, BufferVectorType &kernel_x_2_u32, | ||
| 171 | BufferVectorType &kernel_x_3_u32, BufferVectorType &kernel_x_4_u32, | ||
| 172 | SourceVectorType &kernel_y_0_u16, SourceVectorType &kernel_y_1_u16, | ||
| 173 | SourceVectorType &kernel_y_2_u16, SourceVectorType &kernel_y_3_u16, | ||
| 174 | SourceVectorType &kernel_y_4_u16) | ||
| 175 | 165 | : kernel_x_(kernel_x), | |
| 176 | 165 | kernel_x_0_u32_(kernel_x_0_u32), | |
| 177 | 165 | kernel_x_1_u32_(kernel_x_1_u32), | |
| 178 | 165 | kernel_x_2_u32_(kernel_x_2_u32), | |
| 179 | 165 | kernel_x_3_u32_(kernel_x_3_u32), | |
| 180 | 165 | kernel_x_4_u32_(kernel_x_4_u32), | |
| 181 | |||
| 182 | 165 | kernel_y_0_u16_(kernel_y_0_u16), | |
| 183 | 165 | kernel_y_1_u16_(kernel_y_1_u16), | |
| 184 | 165 | kernel_y_2_u16_(kernel_y_2_u16), | |
| 185 | 165 | kernel_y_3_u16_(kernel_y_3_u16), | |
| 186 | 165 | kernel_y_4_u16_(kernel_y_4_u16) {} | |
| 187 | |||
| 188 | 2909 | void vertical_vector_path(svbool_t pg, | |
| 189 | std::reference_wrapper<SourceVectorType> src[5], | ||
| 190 | BufferType *dst) const KLEIDICV_STREAMING { | ||
| 191 | // 0 | ||
| 192 | 2909 | BufferVectorType acc_b = svmullb_u32(src[0], kernel_y_0_u16_); | |
| 193 | 2909 | BufferVectorType acc_t = svmullt_u32(src[0], kernel_y_0_u16_); | |
| 194 | |||
| 195 | // 1 | ||
| 196 | 2909 | BufferVectorType vec_b = svmullb_u32(src[1], kernel_y_1_u16_); | |
| 197 | 2909 | BufferVectorType vec_t = svmullt_u32(src[1], kernel_y_1_u16_); | |
| 198 | 2909 | acc_b = svqadd_u32_x(pg, acc_b, vec_b); | |
| 199 | 2909 | acc_t = svqadd_u32_x(pg, acc_t, vec_t); | |
| 200 | |||
| 201 | // 2 | ||
| 202 | 2909 | vec_b = svmullb_u32(src[2], kernel_y_2_u16_); | |
| 203 | 2909 | vec_t = svmullt_u32(src[2], kernel_y_2_u16_); | |
| 204 | 2909 | acc_b = svqadd_u32_x(pg, acc_b, vec_b); | |
| 205 | 2909 | acc_t = svqadd_u32_x(pg, acc_t, vec_t); | |
| 206 | |||
| 207 | // 3 | ||
| 208 | 2909 | vec_b = svmullb_u32(src[3], kernel_y_3_u16_); | |
| 209 | 2909 | vec_t = svmullt_u32(src[3], kernel_y_3_u16_); | |
| 210 | 2909 | acc_b = svqadd_u32_x(pg, acc_b, vec_b); | |
| 211 | 2909 | acc_t = svqadd_u32_x(pg, acc_t, vec_t); | |
| 212 | |||
| 213 | // 4 | ||
| 214 | 2909 | vec_b = svmullb_u32(src[4], kernel_y_4_u16_); | |
| 215 | 2909 | vec_t = svmullt_u32(src[4], kernel_y_4_u16_); | |
| 216 | 2909 | acc_b = svqadd_u32_x(pg, acc_b, vec_b); | |
| 217 | 2909 | acc_t = svqadd_u32_x(pg, acc_t, vec_t); | |
| 218 | |||
| 219 | 2909 | BufferDoubleVectorType interleaved = svcreate2_u32(acc_b, acc_t); | |
| 220 | 2909 | svst2(pg, &dst[0], interleaved); | |
| 221 | 2909 | } | |
| 222 | |||
| 223 | 2834 | void horizontal_vector_path(svbool_t pg, | |
| 224 | std::reference_wrapper<BufferVectorType> src[5], | ||
| 225 | DestinationType *dst) const KLEIDICV_STREAMING { | ||
| 226 | // 0 | ||
| 227 | 2834 | svuint64_t acc_b = svmullb_u64(src[0], kernel_x_0_u32_); | |
| 228 | 2834 | svuint64_t acc_t = svmullt_u64(src[0], kernel_x_0_u32_); | |
| 229 | |||
| 230 | // 1 | ||
| 231 | 2834 | acc_b = svmlalb_u64(acc_b, src[1], kernel_x_1_u32_); | |
| 232 | 2834 | acc_t = svmlalt_u64(acc_t, src[1], kernel_x_1_u32_); | |
| 233 | |||
| 234 | // 2 | ||
| 235 | 2834 | acc_b = svmlalb_u64(acc_b, src[2], kernel_x_2_u32_); | |
| 236 | 2834 | acc_t = svmlalt_u64(acc_t, src[2], kernel_x_2_u32_); | |
| 237 | |||
| 238 | // 3 | ||
| 239 | 2834 | acc_b = svmlalb_u64(acc_b, src[3], kernel_x_3_u32_); | |
| 240 | 2834 | acc_t = svmlalt_u64(acc_t, src[3], kernel_x_3_u32_); | |
| 241 | |||
| 242 | // 4 | ||
| 243 | 2834 | acc_b = svmlalb_u64(acc_b, src[4], kernel_x_4_u32_); | |
| 244 | 2834 | acc_t = svmlalt_u64(acc_t, src[4], kernel_x_4_u32_); | |
| 245 | |||
| 246 | 2834 | svuint32_t acc_u32_b = svqxtnb_u64(acc_b); | |
| 247 | 2834 | svuint32_t acc_u32 = svqxtnt_u64(acc_u32_b, acc_t); | |
| 248 | |||
| 249 | 5668 | svbool_t greater = | |
| 250 | 2834 | svcmpgt_n_u32(pg, acc_u32, std::numeric_limits<SourceType>::max()); | |
| 251 | 2834 | acc_u32 = | |
| 252 | 2834 | svdup_n_u32_m(acc_u32, greater, std::numeric_limits<SourceType>::max()); | |
| 253 | |||
| 254 | 2834 | svst1h_u32(pg, &dst[0], acc_u32); | |
| 255 | 2834 | } | |
| 256 | |||
| 257 | 8700 | void horizontal_scalar_path(const BufferType src[5], | |
| 258 | DestinationType *dst) const KLEIDICV_STREAMING { | ||
| 259 | 8700 | SourceType acc; // Avoid cppcoreguidelines-init-variables. NOLINT | |
| 260 |
2/2✓ Branch 0 taken 6390 times.
✓ Branch 1 taken 2310 times.
|
8700 | if (__builtin_mul_overflow(src[0], kernel_x_[0], &acc)) { |
| 261 | 6390 | dst[0] = std::numeric_limits<SourceType>::max(); | |
| 262 | 6390 | return; | |
| 263 | } | ||
| 264 | |||
| 265 |
4/4✓ Branch 0 taken 9216 times.
✓ Branch 1 taken 2253 times.
✓ Branch 2 taken 57 times.
✓ Branch 3 taken 2253 times.
|
11526 | for (size_t i = 1; i < 5; i++) { |
| 266 | 9216 | SourceType temp; // Avoid cppcoreguidelines-init-variables. NOLINT | |
| 267 |
2/2✓ Branch 0 taken 6 times.
✓ Branch 1 taken 9210 times.
|
9216 | if (__builtin_mul_overflow(src[i], kernel_x_[i], &temp)) { |
| 268 | 6 | dst[0] = std::numeric_limits<SourceType>::max(); | |
| 269 | 6 | return; | |
| 270 | } | ||
| 271 |
2/2✓ Branch 0 taken 51 times.
✓ Branch 1 taken 9159 times.
|
9210 | if (__builtin_add_overflow(acc, temp, &acc)) { |
| 272 | 51 | dst[0] = std::numeric_limits<SourceType>::max(); | |
| 273 | 51 | return; | |
| 274 | } | ||
| 275 | 9216 | } | |
| 276 | |||
| 277 | 2253 | dst[0] = acc; | |
| 278 | 8700 | } | |
| 279 | |||
| 280 | private: | ||
| 281 | const SourceType *kernel_x_; | ||
| 282 | |||
| 283 | BufferVectorType &kernel_x_0_u32_; | ||
| 284 | BufferVectorType &kernel_x_1_u32_; | ||
| 285 | BufferVectorType &kernel_x_2_u32_; | ||
| 286 | BufferVectorType &kernel_x_3_u32_; | ||
| 287 | BufferVectorType &kernel_x_4_u32_; | ||
| 288 | |||
| 289 | SourceVectorType &kernel_y_0_u16_; | ||
| 290 | SourceVectorType &kernel_y_1_u16_; | ||
| 291 | SourceVectorType &kernel_y_2_u16_; | ||
| 292 | SourceVectorType &kernel_y_3_u16_; | ||
| 293 | SourceVectorType &kernel_y_4_u16_; | ||
| 294 | }; // end of class SeparableFilter2D<uint16_t, 5> | ||
| 295 | |||
| 296 | template <> | ||
| 297 | class SeparableFilter2D<int16_t, 5> { | ||
| 298 | public: | ||
| 299 | using SourceType = int16_t; | ||
| 300 | using SourceVectorType = typename VecTraits<SourceType>::VectorType; | ||
| 301 | using BufferType = int32_t; | ||
| 302 | using BufferVectorType = typename VecTraits<BufferType>::VectorType; | ||
| 303 | using BufferDoubleVectorType = typename VecTraits<BufferType>::Vector2Type; | ||
| 304 | using DestinationType = int16_t; | ||
| 305 | |||
| 306 | 162 | SeparableFilter2D( | |
| 307 | const SourceType *kernel_x, BufferVectorType &kernel_x_0_s32, | ||
| 308 | BufferVectorType &kernel_x_1_s32, BufferVectorType &kernel_x_2_s32, | ||
| 309 | BufferVectorType &kernel_x_3_s32, BufferVectorType &kernel_x_4_s32, | ||
| 310 | SourceVectorType &kernel_y_0_s16, SourceVectorType &kernel_y_1_s16, | ||
| 311 | SourceVectorType &kernel_y_2_s16, SourceVectorType &kernel_y_3_s16, | ||
| 312 | SourceVectorType &kernel_y_4_s16) | ||
| 313 | 162 | : kernel_x_(kernel_x), | |
| 314 | 162 | kernel_x_0_s32_(kernel_x_0_s32), | |
| 315 | 162 | kernel_x_1_s32_(kernel_x_1_s32), | |
| 316 | 162 | kernel_x_2_s32_(kernel_x_2_s32), | |
| 317 | 162 | kernel_x_3_s32_(kernel_x_3_s32), | |
| 318 | 162 | kernel_x_4_s32_(kernel_x_4_s32), | |
| 319 | |||
| 320 | 162 | kernel_y_0_s16_(kernel_y_0_s16), | |
| 321 | 162 | kernel_y_1_s16_(kernel_y_1_s16), | |
| 322 | 162 | kernel_y_2_s16_(kernel_y_2_s16), | |
| 323 | 162 | kernel_y_3_s16_(kernel_y_3_s16), | |
| 324 | 162 | kernel_y_4_s16_(kernel_y_4_s16) {} | |
| 325 | |||
| 326 | 2896 | void vertical_vector_path(svbool_t pg, | |
| 327 | std::reference_wrapper<SourceVectorType> src[5], | ||
| 328 | BufferType *dst) const KLEIDICV_STREAMING { | ||
| 329 | // 0 | ||
| 330 | 2896 | BufferVectorType acc_b = svmullb_s32(src[0], kernel_y_0_s16_); | |
| 331 | 2896 | BufferVectorType acc_t = svmullt_s32(src[0], kernel_y_0_s16_); | |
| 332 | |||
| 333 | // 1 | ||
| 334 | 2896 | BufferVectorType vec_b = svmullb_s32(src[1], kernel_y_1_s16_); | |
| 335 | 2896 | BufferVectorType vec_t = svmullt_s32(src[1], kernel_y_1_s16_); | |
| 336 | 2896 | acc_b = svqadd_s32_x(pg, acc_b, vec_b); | |
| 337 | 2896 | acc_t = svqadd_s32_x(pg, acc_t, vec_t); | |
| 338 | |||
| 339 | // 2 | ||
| 340 | 2896 | vec_b = svmullb_s32(src[2], kernel_y_2_s16_); | |
| 341 | 2896 | vec_t = svmullt_s32(src[2], kernel_y_2_s16_); | |
| 342 | 2896 | acc_b = svqadd_s32_x(pg, acc_b, vec_b); | |
| 343 | 2896 | acc_t = svqadd_s32_x(pg, acc_t, vec_t); | |
| 344 | |||
| 345 | // 3 | ||
| 346 | 2896 | vec_b = svmullb_s32(src[3], kernel_y_3_s16_); | |
| 347 | 2896 | vec_t = svmullt_s32(src[3], kernel_y_3_s16_); | |
| 348 | 2896 | acc_b = svqadd_s32_x(pg, acc_b, vec_b); | |
| 349 | 2896 | acc_t = svqadd_s32_x(pg, acc_t, vec_t); | |
| 350 | |||
| 351 | // 4 | ||
| 352 | 2896 | vec_b = svmullb_s32(src[4], kernel_y_4_s16_); | |
| 353 | 2896 | vec_t = svmullt_s32(src[4], kernel_y_4_s16_); | |
| 354 | 2896 | acc_b = svqadd_s32_x(pg, acc_b, vec_b); | |
| 355 | 2896 | acc_t = svqadd_s32_x(pg, acc_t, vec_t); | |
| 356 | |||
| 357 | 2896 | BufferDoubleVectorType interleaved = svcreate2_s32(acc_b, acc_t); | |
| 358 | 2896 | svst2(pg, &dst[0], interleaved); | |
| 359 | 2896 | } | |
| 360 | |||
| 361 | 2830 | void horizontal_vector_path(svbool_t pg, | |
| 362 | std::reference_wrapper<BufferVectorType> src[5], | ||
| 363 | DestinationType *dst) const KLEIDICV_STREAMING { | ||
| 364 | // 0 | ||
| 365 | 2830 | svint64_t acc_b = svmullb_s64(src[0], kernel_x_0_s32_); | |
| 366 | 2830 | svint64_t acc_t = svmullt_s64(src[0], kernel_x_0_s32_); | |
| 367 | |||
| 368 | // 1 | ||
| 369 | 2830 | acc_b = svmlalb_s64(acc_b, src[1], kernel_x_1_s32_); | |
| 370 | 2830 | acc_t = svmlalt_s64(acc_t, src[1], kernel_x_1_s32_); | |
| 371 | |||
| 372 | // 2 | ||
| 373 | 2830 | acc_b = svmlalb_s64(acc_b, src[2], kernel_x_2_s32_); | |
| 374 | 2830 | acc_t = svmlalt_s64(acc_t, src[2], kernel_x_2_s32_); | |
| 375 | |||
| 376 | // 3 | ||
| 377 | 2830 | acc_b = svmlalb_s64(acc_b, src[3], kernel_x_3_s32_); | |
| 378 | 2830 | acc_t = svmlalt_s64(acc_t, src[3], kernel_x_3_s32_); | |
| 379 | |||
| 380 | // 4 | ||
| 381 | 2830 | acc_b = svmlalb_s64(acc_b, src[4], kernel_x_4_s32_); | |
| 382 | 2830 | acc_t = svmlalt_s64(acc_t, src[4], kernel_x_4_s32_); | |
| 383 | |||
| 384 | 2830 | svint32_t acc_s32_b = svqxtnb_s64(acc_b); | |
| 385 | 2830 | svint32_t acc_s32 = svqxtnt_s64(acc_s32_b, acc_t); | |
| 386 | |||
| 387 | 5660 | svbool_t less = | |
| 388 | 2830 | svcmplt_n_s32(pg, acc_s32, std::numeric_limits<SourceType>::min()); | |
| 389 | 2830 | acc_s32 = | |
| 390 | 2830 | svdup_n_s32_m(acc_s32, less, std::numeric_limits<SourceType>::min()); | |
| 391 | |||
| 392 | 5660 | svbool_t greater = | |
| 393 | 2830 | svcmpgt_n_s32(pg, acc_s32, std::numeric_limits<SourceType>::max()); | |
| 394 | 2830 | acc_s32 = | |
| 395 | 2830 | svdup_n_s32_m(acc_s32, greater, std::numeric_limits<SourceType>::max()); | |
| 396 | |||
| 397 | 2830 | svst1h_s32(pg, &dst[0], acc_s32); | |
| 398 | 2830 | } | |
| 399 | |||
| 400 | 8640 | void horizontal_scalar_path(const BufferType src[5], | |
| 401 | DestinationType *dst) const KLEIDICV_STREAMING { | ||
| 402 | 8640 | int64_t acc = static_cast<int64_t>(src[0]) * kernel_x_[0]; | |
| 403 |
2/2✓ Branch 0 taken 34560 times.
✓ Branch 1 taken 8640 times.
|
43200 | for (size_t i = 1; i < 5; i++) { |
| 404 | 34560 | acc += static_cast<int64_t>(src[i]) * kernel_x_[i]; | |
| 405 | 34560 | } | |
| 406 | |||
| 407 |
2/2✓ Branch 0 taken 3114 times.
✓ Branch 1 taken 5526 times.
|
8640 | if (acc < std::numeric_limits<DestinationType>::min()) { |
| 408 | 3114 | acc = std::numeric_limits<DestinationType>::min(); | |
| 409 |
2/2✓ Branch 0 taken 2371 times.
✓ Branch 1 taken 3155 times.
|
8640 | } else if (acc > std::numeric_limits<DestinationType>::max()) { |
| 410 | 3155 | acc = std::numeric_limits<DestinationType>::max(); | |
| 411 | 3155 | } | |
| 412 | |||
| 413 | 8640 | dst[0] = static_cast<DestinationType>(acc); | |
| 414 | 8640 | } | |
| 415 | |||
| 416 | private: | ||
| 417 | const SourceType *kernel_x_; | ||
| 418 | |||
| 419 | BufferVectorType &kernel_x_0_s32_; | ||
| 420 | BufferVectorType &kernel_x_1_s32_; | ||
| 421 | BufferVectorType &kernel_x_2_s32_; | ||
| 422 | BufferVectorType &kernel_x_3_s32_; | ||
| 423 | BufferVectorType &kernel_x_4_s32_; | ||
| 424 | |||
| 425 | SourceVectorType &kernel_y_0_s16_; | ||
| 426 | SourceVectorType &kernel_y_1_s16_; | ||
| 427 | SourceVectorType &kernel_y_2_s16_; | ||
| 428 | SourceVectorType &kernel_y_3_s16_; | ||
| 429 | SourceVectorType &kernel_y_4_s16_; | ||
| 430 | }; // end of class SeparableFilter2D<int16_t, 5> | ||
| 431 | |||
| 432 | template <typename T> | ||
| 433 | 600 | static kleidicv_error_t separable_filter_2d_checks( | |
| 434 | const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width, | ||
| 435 | size_t height, size_t channels, const T *kernel_x, const T *kernel_y, | ||
| 436 | SeparableFilterWorkspace *workspace) KLEIDICV_STREAMING { | ||
| 437 |
6/6✓ Branch 0 taken 9 times.
✓ Branch 1 taken 192 times.
✓ Branch 2 taken 9 times.
✓ Branch 3 taken 192 times.
✓ Branch 4 taken 9 times.
✓ Branch 5 taken 189 times.
|
600 | CHECK_POINTERS(workspace, kernel_x, kernel_y); |
| 438 | |||
| 439 |
12/12✓ Branch 0 taken 3 times.
✓ Branch 1 taken 189 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 189 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 189 times.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 189 times.
✓ Branch 8 taken 3 times.
✓ Branch 9 taken 186 times.
✓ Branch 10 taken 3 times.
✓ Branch 11 taken 186 times.
|
573 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
| 440 |
12/12✓ Branch 0 taken 3 times.
✓ Branch 1 taken 186 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 186 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 186 times.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 186 times.
✓ Branch 8 taken 3 times.
✓ Branch 9 taken 183 times.
✓ Branch 10 taken 3 times.
✓ Branch 11 taken 183 times.
|
564 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
| 441 |
18/18✓ Branch 0 taken 3 times.
✓ Branch 1 taken 183 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 180 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 180 times.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 183 times.
✓ Branch 8 taken 3 times.
✓ Branch 9 taken 180 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 180 times.
✓ Branch 12 taken 3 times.
✓ Branch 13 taken 180 times.
✓ Branch 14 taken 3 times.
✓ Branch 15 taken 177 times.
✓ Branch 16 taken 6 times.
✓ Branch 17 taken 177 times.
|
555 | CHECK_IMAGE_SIZE(width, height); |
| 442 | |||
| 443 |
6/6✓ Branch 0 taken 3 times.
✓ Branch 1 taken 177 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 177 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 174 times.
|
537 | if (channels > KLEIDICV_MAXIMUM_CHANNEL_COUNT) { |
| 444 | 9 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 445 | } | ||
| 446 | |||
| 447 |
6/6✓ Branch 0 taken 3 times.
✓ Branch 1 taken 174 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 174 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 171 times.
|
528 | if (workspace->channels() < channels) { |
| 448 | 9 | return KLEIDICV_ERROR_CONTEXT_MISMATCH; | |
| 449 | } | ||
| 450 | |||
| 451 | 519 | const Rectangle &context_rect = workspace->image_size(); | |
| 452 |
12/12✓ Branch 0 taken 168 times.
✓ Branch 1 taken 6 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 165 times.
✓ Branch 4 taken 168 times.
✓ Branch 5 taken 6 times.
✓ Branch 6 taken 3 times.
✓ Branch 7 taken 165 times.
✓ Branch 8 taken 165 times.
✓ Branch 9 taken 6 times.
✓ Branch 10 taken 3 times.
✓ Branch 11 taken 162 times.
|
519 | if (context_rect.width() < width || context_rect.height() < height) { |
| 453 | 27 | return KLEIDICV_ERROR_CONTEXT_MISMATCH; | |
| 454 | } | ||
| 455 | |||
| 456 | 492 | return KLEIDICV_OK; | |
| 457 | 600 | } | |
| 458 | |||
| 459 | template <typename T> | ||
| 460 | 600 | kleidicv_error_t separable_filter_2d_stripe_sc( | |
| 461 | const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width, | ||
| 462 | size_t height, size_t y_begin, size_t y_end, size_t channels, | ||
| 463 | const T *kernel_x, size_t /*kernel_width*/, const T *kernel_y, | ||
| 464 | size_t /*kernel_height*/, FixedBorderType fixed_border_type, | ||
| 465 | kleidicv_filter_context_t *context) KLEIDICV_STREAMING { | ||
| 466 | 600 | auto *workspace = reinterpret_cast<SeparableFilterWorkspace *>(context); | |
| 467 | 1200 | kleidicv_error_t checks_result = separable_filter_2d_checks( | |
| 468 | 600 | src, src_stride, dst, dst_stride, width, height, channels, kernel_x, | |
| 469 | 600 | kernel_y, workspace); | |
| 470 | |||
| 471 |
6/6✓ Branch 0 taken 36 times.
✓ Branch 1 taken 165 times.
✓ Branch 2 taken 36 times.
✓ Branch 3 taken 165 times.
✓ Branch 4 taken 36 times.
✓ Branch 5 taken 162 times.
|
600 | if (checks_result != KLEIDICV_OK) { |
| 472 | 108 | return checks_result; | |
| 473 | } | ||
| 474 | |||
| 475 | 492 | Rectangle rect{width, height}; | |
| 476 | |||
| 477 | using SeparableFilterClass = SeparableFilter2D<T, 5>; | ||
| 478 | |||
| 479 | using WiderT = typename double_element_width<T>::type; | ||
| 480 | using KernelXVectorTraits = VecTraits<WiderT>; | ||
| 481 | using KernelXVectorT = typename KernelXVectorTraits::VectorType; | ||
| 482 | using KernelYVectorTraits = VecTraits<T>; | ||
| 483 | using KernelYVectorT = typename KernelYVectorTraits::VectorType; | ||
| 484 | |||
| 485 | 492 | KernelXVectorT kernel_x_0 = KernelXVectorTraits::svdup(kernel_x[0]); | |
| 486 | 492 | KernelXVectorT kernel_x_1 = KernelXVectorTraits::svdup(kernel_x[1]); | |
| 487 | 492 | KernelXVectorT kernel_x_2 = KernelXVectorTraits::svdup(kernel_x[2]); | |
| 488 | 492 | KernelXVectorT kernel_x_3 = KernelXVectorTraits::svdup(kernel_x[3]); | |
| 489 | 492 | KernelXVectorT kernel_x_4 = KernelXVectorTraits::svdup(kernel_x[4]); | |
| 490 | |||
| 491 | 492 | KernelYVectorT kernel_y_0 = KernelYVectorTraits::svdup(kernel_y[0]); | |
| 492 | 492 | KernelYVectorT kernel_y_1 = KernelYVectorTraits::svdup(kernel_y[1]); | |
| 493 | 492 | KernelYVectorT kernel_y_2 = KernelYVectorTraits::svdup(kernel_y[2]); | |
| 494 | 492 | KernelYVectorT kernel_y_3 = KernelYVectorTraits::svdup(kernel_y[3]); | |
| 495 | 492 | KernelYVectorT kernel_y_4 = KernelYVectorTraits::svdup(kernel_y[4]); | |
| 496 | |||
| 497 | 984 | SeparableFilterClass filterClass{ | |
| 498 | 492 | kernel_x, kernel_x_0, kernel_x_1, kernel_x_2, kernel_x_3, kernel_x_4, | |
| 499 | kernel_y_0, kernel_y_1, kernel_y_2, kernel_y_3, kernel_y_4}; | ||
| 500 | 492 | SeparableFilter<SeparableFilterClass, 5> filter{filterClass}; | |
| 501 | |||
| 502 | 492 | Rows<const T> src_rows{src, src_stride, channels}; | |
| 503 | 492 | Rows<T> dst_rows{dst, dst_stride, channels}; | |
| 504 | 984 | workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, | |
| 505 | 492 | fixed_border_type, filter); | |
| 506 | |||
| 507 | 492 | return KLEIDICV_OK; | |
| 508 | 600 | } | |
| 509 | |||
| 510 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
| 511 | |||
| 512 | #endif // KLEIDICV_SEPARABLE_FILTER_2D_SC_H | ||
| 513 |