| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2024 - 2026 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include "kleidicv/ctypes.h" | ||
| 6 | #include "kleidicv/filters/blur_and_downsample.h" | ||
| 7 | #include "kleidicv/kleidicv.h" | ||
| 8 | #include "kleidicv/neon.h" | ||
| 9 | #include "kleidicv/utils.h" | ||
| 10 | #include "kleidicv/workspace/blur_and_downsample_ws.h" | ||
| 11 | #include "kleidicv/workspace/border_5x5.h" | ||
| 12 | |||
| 13 | namespace kleidicv::neon { | ||
| 14 | |||
| 15 | // Applies Gaussian Blur binomial filter to even rows and columns | ||
| 16 | // | ||
| 17 | // [ 1, 4, 6, 4, 1 ] [ 1 ] | ||
| 18 | // [ 4, 16, 24, 16, 4 ] [ 4 ] | ||
| 19 | // F = 1/256 * [ 6, 24, 36, 24, 6 ] = 1/256 * [ 6 ] * [ 1, 4, 6, 4, 1 ] | ||
| 20 | // [ 4, 16, 24, 16, 4 ] [ 4 ] | ||
| 21 | // [ 1, 4, 6, 4, 1 ] [ 1 ] | ||
| 22 | class BlurAndDownsample { | ||
| 23 | public: | ||
| 24 | using SourceType = uint8_t; | ||
| 25 | using BufferType = uint16_t; | ||
| 26 | using DestinationType = uint8_t; | ||
| 27 | using SourceVecTraits = typename neon::VecTraits<SourceType>; | ||
| 28 | using SourceVectorType = typename SourceVecTraits::VectorType; | ||
| 29 | using BufferVecTraits = typename neon::VecTraits<BufferType>; | ||
| 30 | using BufferVectorType = typename BufferVecTraits::VectorType; | ||
| 31 | using BorderInfoType = | ||
| 32 | typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo5x5<SourceType>; | ||
| 33 | using BorderType = FixedBorderType; | ||
| 34 | using BorderOffsets = typename BorderInfoType::Offsets; | ||
| 35 | |||
| 36 | 137 | BlurAndDownsample() | |
| 37 | 137 | : const_6_u8_half_{vdup_n_u8(6)}, | |
| 38 | 137 | const_6_u16_{vdupq_n_u16(6)}, | |
| 39 | 137 | const_4_u16_{vdupq_n_u16(4)} {} | |
| 40 | |||
| 41 | static constexpr size_t margin = 2UL; | ||
| 42 | |||
| 43 | 2511 | void process_vertical(size_t width, Rows<const SourceType> src_rows, | |
| 44 | Rows<BufferType> dst_rows, | ||
| 45 | BorderOffsets border_offsets) const { | ||
| 46 | 5022 | LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(), | |
| 47 | 2511 | SourceVecTraits::num_lanes()}; | |
| 48 | |||
| 49 | 22981 | loop.unroll_twice([&](ptrdiff_t index) { | |
| 50 | 20470 | const auto *src_0 = &src_rows.at(border_offsets.c0())[index]; | |
| 51 | 20470 | const auto *src_1 = &src_rows.at(border_offsets.c1())[index]; | |
| 52 | 20470 | const auto *src_2 = &src_rows.at(border_offsets.c2())[index]; | |
| 53 | 20470 | const auto *src_3 = &src_rows.at(border_offsets.c3())[index]; | |
| 54 | 20470 | const auto *src_4 = &src_rows.at(border_offsets.c4())[index]; | |
| 55 | |||
| 56 | 20470 | SourceVectorType src_a[5], src_b[5]; | |
| 57 | 20470 | src_a[0] = vld1q(&src_0[0]); | |
| 58 | 20470 | src_b[0] = vld1q(&src_0[SourceVecTraits::num_lanes()]); | |
| 59 | 20470 | src_a[1] = vld1q(&src_1[0]); | |
| 60 | 20470 | src_b[1] = vld1q(&src_1[SourceVecTraits::num_lanes()]); | |
| 61 | 20470 | src_a[2] = vld1q(&src_2[0]); | |
| 62 | 20470 | src_b[2] = vld1q(&src_2[SourceVecTraits::num_lanes()]); | |
| 63 | 20470 | src_a[3] = vld1q(&src_3[0]); | |
| 64 | 20470 | src_b[3] = vld1q(&src_3[SourceVecTraits::num_lanes()]); | |
| 65 | 20470 | src_a[4] = vld1q(&src_4[0]); | |
| 66 | 20470 | src_b[4] = vld1q(&src_4[SourceVecTraits::num_lanes()]); | |
| 67 | 20470 | vertical_vector_path(src_a, &dst_rows[index]); | |
| 68 | 20470 | vertical_vector_path( | |
| 69 | 40940 | src_b, &dst_rows[index + static_cast<ptrdiff_t>( | |
| 70 | 20470 | SourceVecTraits::num_lanes())]); | |
| 71 | 20470 | }); | |
| 72 | |||
| 73 | 3769 | loop.unroll_once([&](ptrdiff_t index) { | |
| 74 | 1258 | SourceVectorType src[5]; | |
| 75 | 1258 | src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]); | |
| 76 | 1258 | src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]); | |
| 77 | 1258 | src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]); | |
| 78 | 1258 | src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]); | |
| 79 | 1258 | src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]); | |
| 80 | 1258 | vertical_vector_path(src, &dst_rows[index]); | |
| 81 | 1258 | }); | |
| 82 | |||
| 83 | 6630 | loop.tail([&](ptrdiff_t index) { | |
| 84 | 4119 | SourceType src[5]; | |
| 85 | 4119 | src[0] = src_rows.at(border_offsets.c0())[index]; | |
| 86 | 4119 | src[1] = src_rows.at(border_offsets.c1())[index]; | |
| 87 | 4119 | src[2] = src_rows.at(border_offsets.c2())[index]; | |
| 88 | 4119 | src[3] = src_rows.at(border_offsets.c3())[index]; | |
| 89 | 4119 | src[4] = src_rows.at(border_offsets.c4())[index]; | |
| 90 | 4119 | vertical_scalar_path(src, &dst_rows[index]); | |
| 91 | 4119 | }); | |
| 92 | 2511 | } | |
| 93 | |||
| 94 | 2511 | void process_horizontal(size_t width, Rows<const BufferType> src_rows, | |
| 95 | Rows<DestinationType> dst_rows, | ||
| 96 | BorderOffsets border_offsets) const { | ||
| 97 |
4/4✓ Branch 0 taken 740 times.
✓ Branch 1 taken 760 times.
✓ Branch 2 taken 499 times.
✓ Branch 3 taken 512 times.
|
2511 | switch (src_rows.channels()) { |
| 98 | case 1: | ||
| 99 | 499 | process_horizontal<1>(width, src_rows, dst_rows, border_offsets); | |
| 100 | 499 | break; | |
| 101 | case 2: | ||
| 102 | 740 | process_horizontal<2>(width, src_rows, dst_rows, border_offsets); | |
| 103 | 740 | break; | |
| 104 | case 3: | ||
| 105 | 512 | process_horizontal_3channels(width, src_rows, dst_rows, border_offsets); | |
| 106 | 512 | break; | |
| 107 | default /* channel == 4 */: | ||
| 108 | 760 | process_horizontal<4>(width, src_rows, dst_rows, border_offsets); | |
| 109 | 760 | break; | |
| 110 | } | ||
| 111 | 2511 | } | |
| 112 | |||
| 113 | 5022 | void process_horizontal_borders(Rows<const BufferType> src_rows, | |
| 114 | Rows<DestinationType> dst_rows, | ||
| 115 | BorderOffsets border_offsets) const { | ||
| 116 | 5022 | const ptrdiff_t channels = static_cast<ptrdiff_t>(src_rows.channels()); | |
| 117 |
2/2✓ Branch 0 taken 5022 times.
✓ Branch 1 taken 13110 times.
|
18132 | for (ptrdiff_t channel = 0; channel < channels; ++channel) { |
| 118 | 13110 | disable_loop_vectorization(); | |
| 119 | 26220 | process_horizontal_scalar(src_rows, dst_rows, border_offsets, channel, | |
| 120 | 13110 | channel); | |
| 121 | 13110 | } | |
| 122 | 5022 | } | |
| 123 | |||
| 124 | private: | ||
| 125 | // Applies vertical filtering vector using SIMD operations. | ||
| 126 | // | ||
| 127 | // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T | ||
| 128 | 42198 | void vertical_vector_path(uint8x16_t src[5], BufferType *dst) const { | |
| 129 | 42198 | uint16x8_t acc_0_4_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[4])); | |
| 130 | 42198 | uint16x8_t acc_0_4_h = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[4])); | |
| 131 | 42198 | uint16x8_t acc_1_3_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[3])); | |
| 132 | 42198 | uint16x8_t acc_1_3_h = vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[3])); | |
| 133 | 84396 | uint16x8_t acc_l = | |
| 134 | 42198 | vmlal_u8(acc_0_4_l, vget_low_u8(src[2]), const_6_u8_half_); | |
| 135 | 84396 | uint16x8_t acc_h = | |
| 136 | 42198 | vmlal_u8(acc_0_4_h, vget_high_u8(src[2]), const_6_u8_half_); | |
| 137 | 42198 | acc_l = vmlaq_u16(acc_l, acc_1_3_l, const_4_u16_); | |
| 138 | 42198 | acc_h = vmlaq_u16(acc_h, acc_1_3_h, const_4_u16_); | |
| 139 | 42198 | vst1q(&dst[0], acc_l); | |
| 140 | 42198 | vst1q(&dst[8], acc_h); | |
| 141 | 42198 | } | |
| 142 | |||
| 143 | // Applies vertical filtering vector using scalar operations. | ||
| 144 | // | ||
| 145 | // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T | ||
| 146 | 4119 | void vertical_scalar_path(const SourceType src[5], BufferType *dst) const { | |
| 147 | 4119 | dst[0] = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2]; | |
| 148 | 4119 | } | |
| 149 | |||
| 150 | // Applies horizontal filtering vector using SIMD operations. | ||
| 151 | // | ||
| 152 | // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T | ||
| 153 | 77684 | uint8x8_t horizontal_vector_path(uint16x8_t src[5]) const { | |
| 154 | 77684 | uint16x8_t acc_0_4 = vaddq_u16(src[0], src[4]); | |
| 155 | 77684 | uint16x8_t acc_1_3 = vaddq_u16(src[1], src[3]); | |
| 156 | 77684 | uint16x8_t acc_u16 = vmlaq_u16(acc_0_4, src[2], const_6_u16_); | |
| 157 | 77684 | acc_u16 = vmlaq_u16(acc_u16, acc_1_3, const_4_u16_); | |
| 158 | 155368 | return vrshrn_n_u16(acc_u16, 8); | |
| 159 | 77684 | } | |
| 160 | |||
| 161 | // Applies horizontal filtering vector using scalar operations. | ||
| 162 | // | ||
| 163 | // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T | ||
| 164 | 24753 | void process_horizontal_scalar(Rows<const BufferType> src_rows, | |
| 165 | Rows<DestinationType> dst_rows, | ||
| 166 | BorderOffsets border_offsets, ptrdiff_t index, | ||
| 167 | ptrdiff_t dst_index) const { | ||
| 168 | 24753 | BufferType src[5]; | |
| 169 | 24753 | src[0] = src_rows.at(0, border_offsets.c0())[index]; | |
| 170 | 24753 | src[1] = src_rows.at(0, border_offsets.c1())[index]; | |
| 171 | 24753 | src[2] = src_rows.at(0, border_offsets.c2())[index]; | |
| 172 | 24753 | src[3] = src_rows.at(0, border_offsets.c3())[index]; | |
| 173 | 24753 | src[4] = src_rows.at(0, border_offsets.c4())[index]; | |
| 174 | |||
| 175 | 24753 | auto acc = src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2]; | |
| 176 | 24753 | dst_rows[dst_index] = rounding_shift_right(acc, 8); | |
| 177 | 24753 | } | |
| 178 | |||
| 179 | template <ptrdiff_t Channels> | ||
| 180 | 1999 | void process_horizontal(size_t width, Rows<const BufferType> src_rows, | |
| 181 | Rows<DestinationType> dst_rows, | ||
| 182 | BorderOffsets border_offsets) const { | ||
| 183 | 3998 | LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(), | |
| 184 | 1999 | BufferVecTraits::num_lanes()}; | |
| 185 | |||
| 186 | 33725 | loop.unroll_twice([&](ptrdiff_t index) { | |
| 187 | 31726 | const auto *src_0 = &src_rows.at(0, border_offsets.c0())[index]; | |
| 188 | 31726 | const auto *src_1 = &src_rows.at(0, border_offsets.c1())[index]; | |
| 189 | 31726 | const auto *src_2 = &src_rows.at(0, border_offsets.c2())[index]; | |
| 190 | 31726 | const auto *src_3 = &src_rows.at(0, border_offsets.c3())[index]; | |
| 191 | 31726 | const auto *src_4 = &src_rows.at(0, border_offsets.c4())[index]; | |
| 192 | |||
| 193 | 31726 | BufferVectorType src_a[5], src_b[5]; | |
| 194 | 31726 | src_a[0] = vld1q(&src_0[0]); | |
| 195 | 31726 | src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]); | |
| 196 | 31726 | src_a[1] = vld1q(&src_1[0]); | |
| 197 | 31726 | src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]); | |
| 198 | 31726 | src_a[2] = vld1q(&src_2[0]); | |
| 199 | 31726 | src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]); | |
| 200 | 31726 | src_a[3] = vld1q(&src_3[0]); | |
| 201 | 31726 | src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]); | |
| 202 | 31726 | src_a[4] = vld1q(&src_4[0]); | |
| 203 | 31726 | src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]); | |
| 204 | |||
| 205 | 31726 | uint8x8_t res_a = horizontal_vector_path(src_a); | |
| 206 | 31726 | uint8x8_t res_b = horizontal_vector_path(src_b); | |
| 207 | |||
| 208 | // Only store even indices. | ||
| 209 | if constexpr (Channels == 1) { | ||
| 210 | 2228 | vst1(&dst_rows[index / 2], vuzp1_u8(res_a, res_b)); | |
| 211 | } else if constexpr (Channels == 2) { | ||
| 212 | 18800 | vst1(&dst_rows[index / 2], | |
| 213 | 18800 | vreinterpret_u8_u16(vuzp1_u16(vreinterpret_u16_u8(res_a), | |
| 214 | 9400 | vreinterpret_u16_u8(res_b)))); | |
| 215 | } else { | ||
| 216 | static_assert(Channels == 4); | ||
| 217 | 40196 | vst1(&dst_rows[index / 2], | |
| 218 | 40196 | vreinterpret_u8_u32(vuzp1_u32(vreinterpret_u32_u8(res_a), | |
| 219 | 20098 | vreinterpret_u32_u8(res_b)))); | |
| 220 | } | ||
| 221 | 31726 | }); | |
| 222 | |||
| 223 | 3314 | loop.remaining([&](ptrdiff_t index, size_t max_index) { | |
| 224 | 1315 | ptrdiff_t pixel = index / Channels; | |
| 225 | 1315 | pixel = align_up(pixel, static_cast<ptrdiff_t>(2)); | |
| 226 | 1315 | index = pixel * Channels; | |
| 227 |
6/6✓ Branch 0 taken 1851 times.
✓ Branch 1 taken 413 times.
✓ Branch 2 taken 1348 times.
✓ Branch 3 taken 586 times.
✓ Branch 4 taken 316 times.
✓ Branch 5 taken 316 times.
|
4830 | while (index < static_cast<ptrdiff_t>(max_index)) { |
| 228 |
6/6✓ Branch 0 taken 1851 times.
✓ Branch 1 taken 1851 times.
✓ Branch 2 taken 2696 times.
✓ Branch 3 taken 1348 times.
✓ Branch 4 taken 1264 times.
✓ Branch 5 taken 316 times.
|
9326 | for (ptrdiff_t channel = 0; channel < Channels; ++channel) { |
| 229 | 11622 | process_horizontal_scalar(src_rows, dst_rows, border_offsets, | |
| 230 | 5811 | index + channel, index / 2 + channel); | |
| 231 | 5811 | } | |
| 232 | 3515 | index += 2 * Channels; | |
| 233 | } | ||
| 234 | 1315 | }); | |
| 235 | 1999 | } | |
| 236 | |||
| 237 | 512 | void process_horizontal_3channels(size_t width, | |
| 238 | Rows<const BufferType> src_rows, | ||
| 239 | Rows<DestinationType> dst_rows, | ||
| 240 | BorderOffsets border_offsets) const { | ||
| 241 | 512 | constexpr ptrdiff_t channels = 3; | |
| 242 | 512 | const ptrdiff_t vec_stride = | |
| 243 | static_cast<ptrdiff_t>(BufferVecTraits::num_lanes()) * channels; | ||
| 244 | 512 | LoopUnroll2<TryToAvoidTailLoop> loop{width, BufferVecTraits::num_lanes()}; | |
| 245 | |||
| 246 | 2884 | loop.unroll_twice([&](ptrdiff_t column) { | |
| 247 | 4744 | const auto *src_0 = | |
| 248 | 2372 | &src_rows.at(0, border_offsets.c0())[column * channels]; | |
| 249 | 4744 | const auto *src_1 = | |
| 250 | 2372 | &src_rows.at(0, border_offsets.c1())[column * channels]; | |
| 251 | 4744 | const auto *src_2 = | |
| 252 | 2372 | &src_rows.at(0, border_offsets.c2())[column * channels]; | |
| 253 | 4744 | const auto *src_3 = | |
| 254 | 2372 | &src_rows.at(0, border_offsets.c3())[column * channels]; | |
| 255 | 4744 | const auto *src_4 = | |
| 256 | 2372 | &src_rows.at(0, border_offsets.c4())[column * channels]; | |
| 257 | |||
| 258 | 2372 | uint16x8x3_t src0_a = vld3q_u16(&src_0[0]); | |
| 259 | 2372 | uint16x8x3_t src1_a = vld3q_u16(&src_1[0]); | |
| 260 | 2372 | uint16x8x3_t src2_a = vld3q_u16(&src_2[0]); | |
| 261 | 2372 | uint16x8x3_t src3_a = vld3q_u16(&src_3[0]); | |
| 262 | 2372 | uint16x8x3_t src4_a = vld3q_u16(&src_4[0]); | |
| 263 | |||
| 264 | 2372 | uint16x8x3_t src0_b = vld3q_u16(&src_0[vec_stride]); | |
| 265 | 2372 | uint16x8x3_t src1_b = vld3q_u16(&src_1[vec_stride]); | |
| 266 | 2372 | uint16x8x3_t src2_b = vld3q_u16(&src_2[vec_stride]); | |
| 267 | 2372 | uint16x8x3_t src3_b = vld3q_u16(&src_3[vec_stride]); | |
| 268 | 2372 | uint16x8x3_t src4_b = vld3q_u16(&src_4[vec_stride]); | |
| 269 | |||
| 270 | 7116 | uint16x8_t ch0_a[5] = {src0_a.val[0], src1_a.val[0], src2_a.val[0], | |
| 271 | 4744 | src3_a.val[0], src4_a.val[0]}; | |
| 272 | 7116 | uint16x8_t ch0_b[5] = {src0_b.val[0], src1_b.val[0], src2_b.val[0], | |
| 273 | 4744 | src3_b.val[0], src4_b.val[0]}; | |
| 274 | 7116 | uint16x8_t ch1_a[5] = {src0_a.val[1], src1_a.val[1], src2_a.val[1], | |
| 275 | 4744 | src3_a.val[1], src4_a.val[1]}; | |
| 276 | 7116 | uint16x8_t ch1_b[5] = {src0_b.val[1], src1_b.val[1], src2_b.val[1], | |
| 277 | 4744 | src3_b.val[1], src4_b.val[1]}; | |
| 278 | 7116 | uint16x8_t ch2_a[5] = {src0_a.val[2], src1_a.val[2], src2_a.val[2], | |
| 279 | 4744 | src3_a.val[2], src4_a.val[2]}; | |
| 280 | 7116 | uint16x8_t ch2_b[5] = {src0_b.val[2], src1_b.val[2], src2_b.val[2], | |
| 281 | 4744 | src3_b.val[2], src4_b.val[2]}; | |
| 282 | |||
| 283 | 2372 | uint8x8_t res0_a = horizontal_vector_path(ch0_a); | |
| 284 | 2372 | uint8x8_t res0_b = horizontal_vector_path(ch0_b); | |
| 285 | 2372 | uint8x8_t res1_a = horizontal_vector_path(ch1_a); | |
| 286 | 2372 | uint8x8_t res1_b = horizontal_vector_path(ch1_b); | |
| 287 | 2372 | uint8x8_t res2_a = horizontal_vector_path(ch2_a); | |
| 288 | 2372 | uint8x8_t res2_b = horizontal_vector_path(ch2_b); | |
| 289 | |||
| 290 | 2372 | uint8x8_t out0 = vuzp1_u8(res0_a, res0_b); | |
| 291 | 2372 | uint8x8_t out1 = vuzp1_u8(res1_a, res1_b); | |
| 292 | 2372 | uint8x8_t out2 = vuzp1_u8(res2_a, res2_b); | |
| 293 | |||
| 294 | 2372 | uint8x8x3_t interleaved{out0, out1, out2}; | |
| 295 | 2372 | vst3_u8(&dst_rows.at(0, column / 2)[0], interleaved); | |
| 296 | 2372 | }); | |
| 297 | |||
| 298 | 938 | loop.remaining([&](ptrdiff_t column, size_t max_column) { | |
| 299 | 426 | column = align_up(column, 2); | |
| 300 |
2/2✓ Branch 0 taken 1944 times.
✓ Branch 1 taken 426 times.
|
2370 | while (column < static_cast<ptrdiff_t>(max_column)) { |
| 301 | 1944 | Rows<const BufferType> src_row = src_rows.at(0, column); | |
| 302 | 1944 | Rows<DestinationType> dst_row = dst_rows.at(0, column / 2); | |
| 303 |
2/2✓ Branch 0 taken 5832 times.
✓ Branch 1 taken 1944 times.
|
7776 | for (ptrdiff_t channel = 0; channel < channels; ++channel) { |
| 304 | 11664 | process_horizontal_scalar(src_row, dst_row, border_offsets, channel, | |
| 305 | 5832 | channel); | |
| 306 | 5832 | } | |
| 307 | 1944 | column += 2; | |
| 308 | 1944 | } | |
| 309 | 426 | }); | |
| 310 | 512 | } | |
| 311 | |||
| 312 | uint8x8_t const_6_u8_half_; | ||
| 313 | uint16x8_t const_6_u16_; | ||
| 314 | uint16x8_t const_4_u16_; | ||
| 315 | }; // end of class BlurAndDownsample | ||
| 316 | |||
| 317 | KLEIDICV_TARGET_FN_ATTRS | ||
| 318 | 142 | kleidicv_error_t kleidicv_blur_and_downsample_stripe_u8( | |
| 319 | const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height, | ||
| 320 | uint8_t *dst, size_t dst_stride, size_t y_begin, size_t y_end, | ||
| 321 | size_t channels, FixedBorderType fixed_border_type) { | ||
| 322 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 141 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 141 times.
|
142 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
| 323 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 140 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 140 times.
|
141 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, (src_height + 1) / 2); |
| 324 |
6/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 139 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 138 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 138 times.
|
140 | CHECK_IMAGE_SIZE(src_width, src_height); |
| 325 | |||
| 326 | 138 | Rectangle rect{src_width, src_height}; | |
| 327 | 138 | constexpr size_t intermediate_size{ | |
| 328 | sizeof(typename BlurAndDownsample::BufferType)}; | ||
| 329 | |||
| 330 | 276 | auto workspace_variant = BlurAndDownsampleFilterWorkspace::create( | |
| 331 | 138 | rect, channels, intermediate_size); | |
| 332 |
4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 137 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 137 times.
|
139 | if (auto *err = std::get_if<kleidicv_error_t>(&workspace_variant)) { |
| 333 | 1 | return *err; | |
| 334 | } | ||
| 335 | 274 | auto &workspace = | |
| 336 | 137 | *std::get_if<BlurAndDownsampleFilterWorkspace>(&workspace_variant); | |
| 337 | |||
| 338 | 137 | Rows<const uint8_t> src_rows{src, src_stride, channels}; | |
| 339 | 137 | Rows<uint8_t> dst_rows{dst, dst_stride, channels}; | |
| 340 | 274 | workspace.process(y_begin, y_end, src_rows, dst_rows, fixed_border_type, | |
| 341 | 137 | BlurAndDownsample{}); | |
| 342 | |||
| 343 | 137 | return KLEIDICV_OK; | |
| 344 | 142 | } | |
| 345 | |||
| 346 | } // namespace kleidicv::neon | ||
| 347 |