| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include "kleidicv/ctypes.h" | ||
| 6 | #include "kleidicv/filters/blur_and_downsample.h" | ||
| 7 | #include "kleidicv/kleidicv.h" | ||
| 8 | #include "kleidicv/sve2.h" | ||
| 9 | #include "kleidicv/utils.h" | ||
| 10 | #include "kleidicv/workspace/blur_and_downsample_ws.h" | ||
| 11 | #include "kleidicv/workspace/border_5x5.h" | ||
| 12 | |||
| 13 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
| 14 | |||
| 15 | // Applies Gaussian Blur binomial filter to even rows and columns | ||
| 16 | // | ||
| 17 | // [ 1, 4, 6, 4, 1 ] [ 1 ] | ||
| 18 | // [ 4, 16, 24, 16, 4 ] [ 4 ] | ||
| 19 | // F = 1/256 * [ 6, 24, 36, 24, 6 ] = 1/256 * [ 6 ] * [ 1, 4, 6, 4, 1 ] | ||
| 20 | // [ 4, 16, 24, 16, 4 ] [ 4 ] | ||
| 21 | // [ 1, 4, 6, 4, 1 ] [ 1 ] | ||
| 22 | class BlurAndDownsample { | ||
| 23 | public: | ||
| 24 | using SourceType = uint8_t; | ||
| 25 | using BufferType = uint16_t; | ||
| 26 | using DestinationType = uint8_t; | ||
| 27 | using SourceVecTraits = | ||
| 28 | typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits<SourceType>; | ||
| 29 | using SourceVectorType = typename SourceVecTraits::VectorType; | ||
| 30 | using SourceVector2Type = typename SourceVecTraits::Vector2Type; | ||
| 31 | using BufferVecTraits = | ||
| 32 | typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits<BufferType>; | ||
| 33 | using BufferVectorType = typename BufferVecTraits::VectorType; | ||
| 34 | using BorderInfoType = | ||
| 35 | typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5<SourceType>; | ||
| 36 | using BorderType = FixedBorderType; | ||
| 37 | using BorderOffsets = typename BorderInfoType::Offsets; | ||
| 38 | |||
| 39 | static constexpr size_t margin = 2UL; | ||
| 40 | |||
| 41 | 1473 | void process_vertical(size_t width, Rows<const SourceType> src_rows, | |
| 42 | Rows<BufferType> dst_rows, | ||
| 43 | BorderOffsets border_offsets) const KLEIDICV_STREAMING { | ||
| 44 | 1473 | LoopUnroll2 loop{width * src_rows.channels(), SourceVecTraits::num_lanes()}; | |
| 45 | |||
| 46 | 1639 | loop.unroll_twice([&](ptrdiff_t index) KLEIDICV_STREAMING { | |
| 47 | 166 | svbool_t pg_all = SourceVecTraits::svptrue(); | |
| 48 | 332 | vertical_vector_path_2x(pg_all, src_rows, dst_rows, border_offsets, | |
| 49 | 166 | index); | |
| 50 | 166 | }); | |
| 51 | |||
| 52 | 1497 | loop.unroll_once([&](ptrdiff_t index) KLEIDICV_STREAMING { | |
| 53 | 24 | svbool_t pg_all = SourceVecTraits::svptrue(); | |
| 54 | 48 | vertical_vector_path_1x(pg_all, src_rows, dst_rows, border_offsets, | |
| 55 | 24 | index); | |
| 56 | 24 | }); | |
| 57 | |||
| 58 | 2922 | loop.remaining([&](ptrdiff_t index, ptrdiff_t length) KLEIDICV_STREAMING { | |
| 59 | 1449 | svbool_t pg = SourceVecTraits::svwhilelt(index, length); | |
| 60 | 1449 | vertical_vector_path_1x(pg, src_rows, dst_rows, border_offsets, index); | |
| 61 | 1449 | }); | |
| 62 | 1473 | } | |
| 63 | |||
| 64 | 1473 | void process_horizontal(size_t width, Rows<const BufferType> src_rows, | |
| 65 | Rows<DestinationType> dst_rows, | ||
| 66 | BorderOffsets border_offsets) const | ||
| 67 | KLEIDICV_STREAMING { | ||
| 68 | 1473 | svbool_t pg_all = BufferVecTraits::svptrue(); | |
| 69 | 1473 | LoopUnroll2 loop{width * src_rows.channels(), BufferVecTraits::num_lanes()}; | |
| 70 | |||
| 71 | 1709 | loop.unroll_twice([&](size_t index) KLEIDICV_STREAMING { | |
| 72 | 472 | horizontal_vector_path_2x(pg_all, pg_all, src_rows, pg_all, dst_rows, | |
| 73 | 236 | border_offsets, static_cast<ptrdiff_t>(index)); | |
| 74 | 236 | }); | |
| 75 | |||
| 76 | 2112 | loop.remaining([&](size_t index, size_t length) KLEIDICV_STREAMING { | |
| 77 | 639 | svbool_t pg_src_0 = BufferVecTraits::svwhilelt(index, length); | |
| 78 | 1278 | svbool_t pg_src_1 = BufferVecTraits::svwhilelt( | |
| 79 | 639 | index + BufferVecTraits::num_lanes(), length); | |
| 80 | 1278 | svbool_t pg_dst = | |
| 81 | 639 | BufferVecTraits::svwhilelt((index + 1) / 2, (length + 1) / 2); | |
| 82 | 1278 | horizontal_vector_path_2x(pg_src_0, pg_src_1, src_rows, pg_dst, dst_rows, | |
| 83 | 639 | border_offsets, static_cast<ptrdiff_t>(index)); | |
| 84 | 639 | }); | |
| 85 | 1473 | } | |
| 86 | |||
| 87 | 2946 | void process_horizontal_borders( | |
| 88 | Rows<const BufferType> src_rows, Rows<DestinationType> dst_rows, | ||
| 89 | BorderOffsets border_offsets) const KLEIDICV_STREAMING { | ||
| 90 |
2/2✓ Branch 0 taken 2946 times.
✓ Branch 1 taken 2946 times.
|
5892 | for (ptrdiff_t index = 0; |
| 91 | 5892 | index < static_cast<ptrdiff_t>(src_rows.channels()); ++index) { | |
| 92 | 2946 | disable_loop_vectorization(); | |
| 93 | 2946 | svbool_t pg = svptrue_pat_b8(SV_VL1); | |
| 94 | 2946 | horizontal_border_path(pg, src_rows, dst_rows, border_offsets, index); | |
| 95 | 2946 | } | |
| 96 | 2946 | } | |
| 97 | |||
| 98 | private: | ||
| 99 | 166 | void vertical_vector_path_2x(svbool_t pg, Rows<const SourceType> src_rows, | |
| 100 | Rows<BufferType> dst_rows, | ||
| 101 | BorderOffsets border_offsets, | ||
| 102 | ptrdiff_t index) const KLEIDICV_STREAMING { | ||
| 103 | 166 | const auto *src_row_0 = &src_rows.at(border_offsets.c0())[index]; | |
| 104 | 166 | const auto *src_row_1 = &src_rows.at(border_offsets.c1())[index]; | |
| 105 | 166 | const auto *src_row_2 = &src_rows.at(border_offsets.c2())[index]; | |
| 106 | 166 | const auto *src_row_3 = &src_rows.at(border_offsets.c3())[index]; | |
| 107 | 166 | const auto *src_row_4 = &src_rows.at(border_offsets.c4())[index]; | |
| 108 | |||
| 109 | 166 | SourceVector2Type src_0; | |
| 110 | 166 | SourceVector2Type src_1; | |
| 111 | 166 | SourceVector2Type src_2; | |
| 112 | 166 | SourceVector2Type src_3; | |
| 113 | 166 | SourceVector2Type src_4; | |
| 114 | |||
| 115 | 166 | src_0 = | |
| 116 | 166 | svcreate2(svld1(pg, &src_row_0[0]), svld1_vnum(pg, &src_row_0[0], 1)); | |
| 117 | 166 | src_1 = | |
| 118 | 166 | svcreate2(svld1(pg, &src_row_1[0]), svld1_vnum(pg, &src_row_1[0], 1)); | |
| 119 | 166 | src_2 = | |
| 120 | 166 | svcreate2(svld1(pg, &src_row_2[0]), svld1_vnum(pg, &src_row_2[0], 1)); | |
| 121 | 166 | src_3 = | |
| 122 | 166 | svcreate2(svld1(pg, &src_row_3[0]), svld1_vnum(pg, &src_row_3[0], 1)); | |
| 123 | 166 | src_4 = | |
| 124 | 166 | svcreate2(svld1(pg, &src_row_4[0]), svld1_vnum(pg, &src_row_4[0], 1)); | |
| 125 | |||
| 126 | 332 | vertical_vector_path(pg, svget2(src_0, 0), svget2(src_1, 0), | |
| 127 | 166 | svget2(src_2, 0), svget2(src_3, 0), svget2(src_4, 0), | |
| 128 | 166 | &dst_rows[index]); | |
| 129 | 332 | vertical_vector_path(pg, svget2(src_0, 1), svget2(src_1, 1), | |
| 130 | 166 | svget2(src_2, 1), svget2(src_3, 1), svget2(src_4, 1), | |
| 131 | 332 | &dst_rows[index + static_cast<ptrdiff_t>( | |
| 132 | 166 | SourceVecTraits::num_lanes())]); | |
| 133 | 166 | } | |
| 134 | |||
| 135 | 1473 | void vertical_vector_path_1x(svbool_t pg, Rows<const SourceType> src_rows, | |
| 136 | Rows<BufferType> dst_rows, | ||
| 137 | BorderOffsets border_offsets, | ||
| 138 | ptrdiff_t index) const KLEIDICV_STREAMING { | ||
| 139 | 2946 | SourceVectorType src_0 = | |
| 140 | 1473 | svld1(pg, &src_rows.at(border_offsets.c0())[index]); | |
| 141 | 2946 | SourceVectorType src_1 = | |
| 142 | 1473 | svld1(pg, &src_rows.at(border_offsets.c1())[index]); | |
| 143 | 2946 | SourceVectorType src_2 = | |
| 144 | 1473 | svld1(pg, &src_rows.at(border_offsets.c2())[index]); | |
| 145 | 2946 | SourceVectorType src_3 = | |
| 146 | 1473 | svld1(pg, &src_rows.at(border_offsets.c3())[index]); | |
| 147 | 2946 | SourceVectorType src_4 = | |
| 148 | 1473 | svld1(pg, &src_rows.at(border_offsets.c4())[index]); | |
| 149 | 2946 | vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, | |
| 150 | 1473 | &dst_rows[index]); | |
| 151 | 1473 | } | |
| 152 | |||
| 153 | // Applies vertical filtering vector using SIMD operations. | ||
| 154 | // | ||
| 155 | // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T | ||
| 156 | 1805 | void vertical_vector_path(svbool_t pg, svuint8_t src_0, svuint8_t src_1, | |
| 157 | svuint8_t src_2, svuint8_t src_3, svuint8_t src_4, | ||
| 158 | BufferType *dst) const KLEIDICV_STREAMING { | ||
| 159 | 1805 | svuint16_t acc_0_4_b = svaddlb_u16(src_0, src_4); | |
| 160 | 1805 | svuint16_t acc_0_4_t = svaddlt_u16(src_0, src_4); | |
| 161 | 1805 | svuint16_t acc_1_3_b = svaddlb_u16(src_1, src_3); | |
| 162 | 1805 | svuint16_t acc_1_3_t = svaddlt_u16(src_1, src_3); | |
| 163 | |||
| 164 | 1805 | svuint16_t acc_u16_b = svmlalb_n_u16(acc_0_4_b, src_2, 6); | |
| 165 | 1805 | svuint16_t acc_u16_t = svmlalt_n_u16(acc_0_4_t, src_2, 6); | |
| 166 | 1805 | acc_u16_b = svmla_n_u16_x(pg, acc_u16_b, acc_1_3_b, 4); | |
| 167 | 1805 | acc_u16_t = svmla_n_u16_x(pg, acc_u16_t, acc_1_3_t, 4); | |
| 168 | |||
| 169 | 1805 | svuint16x2_t interleaved = svcreate2(acc_u16_b, acc_u16_t); | |
| 170 | 1805 | svst2(pg, &dst[0], interleaved); | |
| 171 | 1805 | } | |
| 172 | |||
| 173 | 875 | void horizontal_vector_path_2x(svbool_t pg_src_0, svbool_t pg_src_1, | |
| 174 | Rows<const BufferType> src_rows, | ||
| 175 | svbool_t pg_dst, | ||
| 176 | Rows<DestinationType> dst_rows, | ||
| 177 | BorderOffsets border_offsets, | ||
| 178 | ptrdiff_t index) const KLEIDICV_STREAMING { | ||
| 179 | 875 | const auto *src_0 = &src_rows.at(0, border_offsets.c0())[index]; | |
| 180 | 875 | const auto *src_1 = &src_rows.at(0, border_offsets.c1())[index]; | |
| 181 | 875 | const auto *src_2 = &src_rows.at(0, border_offsets.c2())[index]; | |
| 182 | 875 | const auto *src_3 = &src_rows.at(0, border_offsets.c3())[index]; | |
| 183 | 875 | const auto *src_4 = &src_rows.at(0, border_offsets.c4())[index]; | |
| 184 | |||
| 185 | 875 | BufferVectorType src_0_0 = svld1(pg_src_0, &src_0[0]); | |
| 186 | 875 | BufferVectorType src_1_0 = svld1_vnum(pg_src_1, &src_0[0], 1); | |
| 187 | 875 | BufferVectorType src_0_1 = svld1(pg_src_0, &src_1[0]); | |
| 188 | 875 | BufferVectorType src_1_1 = svld1_vnum(pg_src_1, &src_1[0], 1); | |
| 189 | 875 | BufferVectorType src_0_2 = svld1(pg_src_0, &src_2[0]); | |
| 190 | 875 | BufferVectorType src_1_2 = svld1_vnum(pg_src_1, &src_2[0], 1); | |
| 191 | 875 | BufferVectorType src_0_3 = svld1(pg_src_0, &src_3[0]); | |
| 192 | 875 | BufferVectorType src_1_3 = svld1_vnum(pg_src_1, &src_3[0], 1); | |
| 193 | 875 | BufferVectorType src_0_4 = svld1(pg_src_0, &src_4[0]); | |
| 194 | 875 | BufferVectorType src_1_4 = svld1_vnum(pg_src_1, &src_4[0], 1); | |
| 195 | |||
| 196 | 1750 | svuint16_t res_0 = horizontal_vector_path(pg_src_0, src_0_0, src_0_1, | |
| 197 | 875 | src_0_2, src_0_3, src_0_4); | |
| 198 | 1750 | svuint16_t res_1 = horizontal_vector_path(pg_src_1, src_1_0, src_1_1, | |
| 199 | 875 | src_1_2, src_1_3, src_1_4); | |
| 200 | |||
| 201 | 875 | svuint16_t res_even_only = svuzp1(res_0, res_1); | |
| 202 | 875 | svst1b(pg_dst, &dst_rows[index / 2], res_even_only); | |
| 203 | 875 | } | |
| 204 | |||
| 205 | // Applies horizontal filtering vector using SIMD operations. | ||
| 206 | // | ||
| 207 | // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T | ||
| 208 | 1750 | svuint16_t horizontal_vector_path(svbool_t pg, svuint16_t src_0, | |
| 209 | svuint16_t src_1, svuint16_t src_2, | ||
| 210 | svuint16_t src_3, | ||
| 211 | svuint16_t src_4) const KLEIDICV_STREAMING { | ||
| 212 | 1750 | svuint16_t acc_0_4 = svadd_x(pg, src_0, src_4); | |
| 213 | 1750 | svuint16_t acc_1_3 = svadd_x(pg, src_1, src_3); | |
| 214 | 1750 | svuint16_t acc = svmla_n_u16_x(pg, acc_0_4, src_2, 6); | |
| 215 | 1750 | acc = svmla_n_u16_x(pg, acc, acc_1_3, 4); | |
| 216 | 1750 | acc = svrshr_x(pg, acc, 8); | |
| 217 | 3500 | return acc; | |
| 218 | 1750 | } | |
| 219 | |||
| 220 | // Applies horizontal filtering for the borders using SIMD operations. | ||
| 221 | // | ||
| 222 | // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T | ||
| 223 | 2946 | void horizontal_border_path(svbool_t pg, Rows<const BufferType> src_rows, | |
| 224 | Rows<DestinationType> dst_rows, | ||
| 225 | BorderOffsets border_offsets, | ||
| 226 | ptrdiff_t index) const KLEIDICV_STREAMING { | ||
| 227 | 5892 | BufferVectorType src_0 = | |
| 228 | 2946 | svld1(pg, &src_rows.at(0, border_offsets.c0())[index]); | |
| 229 | 5892 | BufferVectorType src_1 = | |
| 230 | 2946 | svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); | |
| 231 | 5892 | BufferVectorType src_2 = | |
| 232 | 2946 | svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); | |
| 233 | 5892 | BufferVectorType src_3 = | |
| 234 | 2946 | svld1(pg, &src_rows.at(0, border_offsets.c3())[index]); | |
| 235 | 5892 | BufferVectorType src_4 = | |
| 236 | 2946 | svld1(pg, &src_rows.at(0, border_offsets.c4())[index]); | |
| 237 | |||
| 238 | 2946 | svuint16_t acc_0_4 = svadd_x(pg, src_0, src_4); | |
| 239 | 2946 | svuint16_t acc_1_3 = svadd_x(pg, src_1, src_3); | |
| 240 | 2946 | svuint16_t acc = svmla_n_u16_x(pg, acc_0_4, src_2, 6); | |
| 241 | 2946 | acc = svmla_n_u16_x(pg, acc, acc_1_3, 4); | |
| 242 | 2946 | acc = svrshr_x(pg, acc, 8); | |
| 243 | |||
| 244 | 2946 | svst1b(pg, &dst_rows[index / 2], acc); | |
| 245 | 2946 | } | |
| 246 | }; // end of class BlurAndDownsample | ||
| 247 | |||
| 248 | // Does not include checks for whether the operation is implemented. | ||
| 249 | // This must be done earlier, by blur_and_downsample_is_implemented. | ||
| 250 | 153 | static kleidicv_error_t blur_and_downsample_checks( | |
| 251 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 252 | uint8_t *dst, size_t dst_stride, size_t channels, | ||
| 253 | BlurAndDownsampleFilterWorkspace *workspace) KLEIDICV_STREAMING { | ||
| 254 |
2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 150 times.
|
153 | CHECK_POINTERS(workspace); |
| 255 |
4/4✓ Branch 0 taken 3 times.
✓ Branch 1 taken 147 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 147 times.
|
150 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
| 256 |
4/4✓ Branch 0 taken 3 times.
✓ Branch 1 taken 144 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 144 times.
|
147 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, (src_height + 1) / 2); |
| 257 |
6/6✓ Branch 0 taken 3 times.
✓ Branch 1 taken 141 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 138 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 138 times.
|
144 | CHECK_IMAGE_SIZE(src_width, src_height); |
| 258 | |||
| 259 | 138 | Rectangle rect{src_width, src_height}; | |
| 260 | 138 | const Rectangle &context_rect = workspace->image_size(); | |
| 261 |
4/4✓ Branch 0 taken 132 times.
✓ Branch 1 taken 6 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 129 times.
|
138 | if (context_rect.width() < src_width || context_rect.height() < src_height) { |
| 262 | 9 | return KLEIDICV_ERROR_CONTEXT_MISMATCH; | |
| 263 | } | ||
| 264 | |||
| 265 | // Currently supports only one channel, so it cannot be tested. | ||
| 266 | // GCOVR_EXCL_START | ||
| 267 | − | if (workspace->channels() < channels) { | |
| 268 | − | return KLEIDICV_ERROR_CONTEXT_MISMATCH; | |
| 269 | } | ||
| 270 | // GCOVR_EXCL_STOP | ||
| 271 | |||
| 272 | 129 | return KLEIDICV_OK; | |
| 273 | 153 | } | |
| 274 | |||
| 275 | 153 | static kleidicv_error_t blur_and_downsample_stripe_u8_sc( | |
| 276 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 277 | uint8_t *dst, size_t dst_stride, size_t y_begin, size_t y_end, | ||
| 278 | size_t channels, FixedBorderType fixed_border_type, | ||
| 279 | kleidicv_filter_context_t *context) KLEIDICV_STREAMING { | ||
| 280 | // Does not include checks for whether the operation is implemented. | ||
| 281 | // This must be done earlier, by blur_and_downsample_is_implemented. | ||
| 282 | 306 | auto *workspace = | |
| 283 | 153 | reinterpret_cast<BlurAndDownsampleFilterWorkspace *>(context); | |
| 284 | |||
| 285 |
6/6✓ Branch 0 taken 24 times.
✓ Branch 1 taken 129 times.
✓ Branch 2 taken 24 times.
✓ Branch 3 taken 129 times.
✓ Branch 4 taken 24 times.
✓ Branch 5 taken 129 times.
|
330 | if (auto check_result = |
| 286 | 306 | blur_and_downsample_checks(src, src_stride, src_width, src_height, | |
| 287 | 153 | dst, dst_stride, channels, workspace)) { | |
| 288 | 24 | return check_result; | |
| 289 | } | ||
| 290 | |||
| 291 | 129 | Rectangle rect{src_width, src_height}; | |
| 292 | |||
| 293 | 129 | Rows<const uint8_t> src_rows{src, src_stride, channels}; | |
| 294 | 129 | Rows<uint8_t> dst_rows{dst, dst_stride, channels}; | |
| 295 | 258 | workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, | |
| 296 | 129 | fixed_border_type, BlurAndDownsample{}); | |
| 297 | |||
| 298 | 129 | return KLEIDICV_OK; | |
| 299 | 153 | } | |
| 300 | |||
| 301 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
| 302 |