| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #ifndef KLEIDICV_FILTER_2D_NEON_H | ||
| 6 | #define KLEIDICV_FILTER_2D_NEON_H | ||
| 7 | |||
| 8 | #include "filter_2d_window_loader_3x3.h" | ||
| 9 | #include "filter_2d_window_loader_5x5.h" | ||
| 10 | #include "filter_2d_window_loader_7x7.h" | ||
| 11 | #include "kleidicv/neon.h" | ||
| 12 | #include "process_filter_2d.h" | ||
| 13 | |||
| 14 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
| 15 | |||
| 16 | template <typename InnerFilterType, size_t KSize, typename WindowLoaderType> | ||
| 17 | class Filter2d { | ||
| 18 | public: | ||
| 19 | using SourceType = typename InnerFilterType::SourceType; | ||
| 20 | using DestinationType = typename InnerFilterType::DestinationType; | ||
| 21 | using SourceVecTraits = typename neon::VecTraits<SourceType>; | ||
| 22 | using DestinationVecTraits = typename neon::VecTraits<DestinationType>; | ||
| 23 | using SourceVectorType = typename SourceVecTraits::VectorType; | ||
| 24 | using DestinationVectorType = typename DestinationVecTraits::VectorType; | ||
| 25 | using BorderType = FixedBorderType; | ||
| 26 | static constexpr size_t kMargin = KSize / 2; | ||
| 27 | using BorderInfoType = | ||
| 28 | typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo<SourceType, KSize>; | ||
| 29 | using BorderOffsets = typename BorderInfoType::Offsets; | ||
| 30 | |||
| 31 | 1610 | explicit Filter2d(InnerFilterType filter) : filter_{filter} {} | |
| 32 | |||
| 33 | 8676 | void process_pixels_without_horizontal_borders( | |
| 34 | size_t width, Rows<const SourceType> src_rows, | ||
| 35 | Rows<DestinationType> dst_rows, BorderOffsets window_row_offsets, | ||
| 36 | BorderOffsets window_col_offsets) const { | ||
| 37 | 17352 | LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(), | |
| 38 | 8676 | SourceVecTraits::num_lanes()}; | |
| 39 | |||
| 40 | 34632 | loop.unroll_once([&](size_t index) { | |
| 41 | 25956 | SourceVectorType src[KSize][KSize]; | |
| 42 | 25956 | DestinationVectorType dst_vec; | |
| 43 | |||
| 44 | 11224924 | auto KernelWindow = [&](size_t row, size_t col) -> SourceVectorType& { | |
| 45 | 11198968 | return src[row][col]; | |
| 46 | }; | ||
| 47 | |||
| 48 | 983368 | auto load_array_element = [](const SourceType& x) { return vld1q(&x); }; | |
| 49 | 51912 | WindowLoaderType::load_window(KernelWindow, load_array_element, src_rows, | |
| 50 | 25956 | window_row_offsets, window_col_offsets, | |
| 51 | 25956 | index); | |
| 52 | |||
| 53 | 25956 | filter_.vector_path(KernelWindow, dst_vec); | |
| 54 | 25956 | vst1q(&dst_rows[index], dst_vec); | |
| 55 | 25956 | }); | |
| 56 | |||
| 57 | 13556 | loop.tail([&](size_t index) { | |
| 58 | 4880 | process_one_element_with_horizontal_borders( | |
| 59 | 4880 | src_rows, dst_rows, window_row_offsets, window_col_offsets, index); | |
| 60 | 4880 | }); | |
| 61 | 8676 | } | |
| 62 | |||
| 63 | 2272 | void process_pixels_of_dual_rows_without_horizontal_borders( | |
| 64 | size_t width, Rows<const SourceType> src_rows, | ||
| 65 | Rows<DestinationType> dst_rows, BorderOffsets window_row_offsets_0, | ||
| 66 | BorderOffsets window_row_offsets_1, | ||
| 67 | BorderOffsets window_col_offsets) const { | ||
| 68 | 4544 | LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(), | |
| 69 | 2272 | SourceVecTraits::num_lanes()}; | |
| 70 | |||
| 71 | 5750 | loop.unroll_once([&](size_t index) { | |
| 72 | 3478 | SourceVectorType src[KSize + 1][KSize]; | |
| 73 | 3478 | DestinationVectorType dst_vec_0; | |
| 74 | 3478 | DestinationVectorType dst_vec_1; | |
| 75 | 198246 | auto KernelWindow = [&](size_t row, size_t col) -> SourceVectorType& { | |
| 76 | 194768 | return src[row][col]; | |
| 77 | }; | ||
| 78 | |||
| 79 | 45214 | auto load_array_element = [](const SourceType& x) { return vld1q(&x); }; | |
| 80 | 3478 | WindowLoaderType::load_window_to_handle_dual_rows( | |
| 81 | 3478 | KernelWindow, load_array_element, src_rows, window_row_offsets_0, | |
| 82 | 3478 | window_row_offsets_1, window_col_offsets, index); | |
| 83 | |||
| 84 | 3478 | filter_.vector_path_for_dual_row_handling(KernelWindow, dst_vec_0, | |
| 85 | dst_vec_1); | ||
| 86 | 3478 | vst1q(&dst_rows.at(0, 0)[index], dst_vec_0); | |
| 87 | 3478 | vst1q(&dst_rows.at(1, 0)[index], dst_vec_1); | |
| 88 | 3478 | }); | |
| 89 | |||
| 90 | 4440 | loop.tail([&](size_t index) { | |
| 91 | 2168 | process_two_element_vertically_with_or_without_horizontal_borders( | |
| 92 | 2168 | src_rows, dst_rows, window_row_offsets_0, window_row_offsets_1, | |
| 93 | 2168 | window_col_offsets, index); | |
| 94 | 2168 | }); | |
| 95 | 2272 | } | |
| 96 | |||
| 97 | 42704 | void process_one_pixel_with_horizontal_borders( | |
| 98 | Rows<const SourceType> src_rows, Rows<DestinationType> dst_rows, | ||
| 99 | BorderOffsets window_row_offsets, | ||
| 100 | BorderOffsets window_col_offsets) const KLEIDICV_STREAMING { | ||
| 101 |
42/42✓ Branch 0 taken 64 times.
✓ Branch 1 taken 160 times.
✓ Branch 2 taken 1232 times.
✓ Branch 3 taken 3080 times.
✓ Branch 4 taken 2496 times.
✓ Branch 5 taken 6240 times.
✓ Branch 6 taken 72 times.
✓ Branch 7 taken 168 times.
✓ Branch 8 taken 3272 times.
✓ Branch 9 taken 5120 times.
✓ Branch 10 taken 4488 times.
✓ Branch 11 taken 8232 times.
✓ Branch 12 taken 72 times.
✓ Branch 13 taken 168 times.
✓ Branch 14 taken 3272 times.
✓ Branch 15 taken 5120 times.
✓ Branch 16 taken 4488 times.
✓ Branch 17 taken 8232 times.
✓ Branch 18 taken 72 times.
✓ Branch 19 taken 168 times.
✓ Branch 20 taken 3272 times.
✓ Branch 21 taken 5120 times.
✓ Branch 22 taken 4488 times.
✓ Branch 23 taken 8232 times.
✓ Branch 24 taken 64 times.
✓ Branch 25 taken 160 times.
✓ Branch 26 taken 1232 times.
✓ Branch 27 taken 3080 times.
✓ Branch 28 taken 2496 times.
✓ Branch 29 taken 6240 times.
✓ Branch 30 taken 64 times.
✓ Branch 31 taken 160 times.
✓ Branch 32 taken 1232 times.
✓ Branch 33 taken 3080 times.
✓ Branch 34 taken 2496 times.
✓ Branch 35 taken 6240 times.
✓ Branch 36 taken 72 times.
✓ Branch 37 taken 168 times.
✓ Branch 38 taken 3272 times.
✓ Branch 39 taken 5120 times.
✓ Branch 40 taken 4488 times.
✓ Branch 41 taken 8232 times.
|
125224 | for (size_t index = 0; index < src_rows.channels(); ++index) { |
| 102 | 82520 | disable_loop_vectorization(); | |
| 103 | 82520 | process_one_element_with_horizontal_borders( | |
| 104 | 82520 | src_rows, dst_rows, window_row_offsets, window_col_offsets, index); | |
| 105 | 82520 | } | |
| 106 | 42704 | } | |
| 107 | |||
| 108 | // Processes two vertically adjacent pixels in a single column | ||
| 109 | 4544 | void process_two_pixels_with_horizontal_borders( | |
| 110 | Rows<const SourceType> src_rows, Rows<DestinationType> dst_rows, | ||
| 111 | BorderOffsets window_row_offsets_0, BorderOffsets window_row_offsets_1, | ||
| 112 | BorderOffsets window_col_offsets) const { | ||
| 113 |
14/14✓ Branch 0 taken 128 times.
✓ Branch 1 taken 320 times.
✓ Branch 2 taken 1040 times.
✓ Branch 3 taken 1232 times.
✓ Branch 4 taken 1040 times.
✓ Branch 5 taken 1232 times.
✓ Branch 6 taken 1040 times.
✓ Branch 7 taken 1232 times.
✓ Branch 8 taken 128 times.
✓ Branch 9 taken 320 times.
✓ Branch 10 taken 128 times.
✓ Branch 11 taken 320 times.
✓ Branch 12 taken 1040 times.
✓ Branch 13 taken 1232 times.
|
10432 | for (size_t index = 0; index < src_rows.channels(); ++index) { |
| 114 | 5888 | disable_loop_vectorization(); | |
| 115 | 5888 | process_two_element_vertically_with_or_without_horizontal_borders( | |
| 116 | 5888 | src_rows, dst_rows, window_row_offsets_0, window_row_offsets_1, | |
| 117 | 5888 | window_col_offsets, index); | |
| 118 | 5888 | } | |
| 119 | 4544 | } | |
| 120 | |||
| 121 | private: | ||
| 122 | 87400 | void process_one_element_with_horizontal_borders( | |
| 123 | Rows<const SourceType> src_rows, Rows<DestinationType> dst_rows, | ||
| 124 | BorderOffsets window_row_offsets, BorderOffsets window_col_offsets, | ||
| 125 | size_t index) const KLEIDICV_STREAMING { | ||
| 126 | 87400 | SourceType src[KSize][KSize]; | |
| 127 | |||
| 128 | 87400 | auto KernelWindow = | |
| 129 | 41533720 | [&](size_t row, size_t col) | |
| 130 | 41446320 | KLEIDICV_STREAMING -> SourceType& { return src[row][col]; }; | |
| 131 | |||
| 132 | 3552016 | auto load_array_element = [&](const SourceType& x) | |
| 133 | 3464616 | KLEIDICV_STREAMING { return x; }; | |
| 134 | |||
| 135 | 174800 | WindowLoaderType::load_window(KernelWindow, load_array_element, src_rows, | |
| 136 | 87400 | window_row_offsets, window_col_offsets, | |
| 137 | 87400 | index); | |
| 138 | |||
| 139 | 87400 | filter_.scalar_path(KernelWindow, dst_rows[index]); | |
| 140 | 87400 | } | |
| 141 | |||
| 142 | 8056 | void process_two_element_vertically_with_or_without_horizontal_borders( | |
| 143 | Rows<const SourceType> src_rows, Rows<DestinationType> dst_rows, | ||
| 144 | BorderOffsets window_row_offsets_0, BorderOffsets window_row_offsets_1, | ||
| 145 | BorderOffsets window_col_offsets, size_t index) const { | ||
| 146 | 8056 | SourceType src[KSize + 1][KSize]; | |
| 147 | 459192 | auto KernelWindow = [&](size_t row, size_t col) -> SourceType& { | |
| 148 | 451136 | return src[row][col]; | |
| 149 | }; | ||
| 150 | 104728 | auto load_array_element = [](const SourceType& x) { return x; }; | |
| 151 | 8056 | WindowLoaderType::load_window_to_handle_dual_rows( | |
| 152 | 8056 | KernelWindow, load_array_element, src_rows, window_row_offsets_0, | |
| 153 | 8056 | window_row_offsets_1, window_col_offsets, index); | |
| 154 | |||
| 155 | 8056 | filter_.scalar_path_for_dual_row_handling( | |
| 156 | 8056 | KernelWindow, dst_rows.at(0, 0)[index], dst_rows.at(1, 0)[index]); | |
| 157 | 8056 | } | |
| 158 | |||
| 159 | InnerFilterType filter_; | ||
| 160 | }; | ||
| 161 | |||
| 162 | template <typename InnerFilterType> | ||
| 163 | using Filter2D3x3 = | ||
| 164 | Filter2d<InnerFilterType, 3UL, | ||
| 165 | Filter2dWindowLoader3x3<typename InnerFilterType::SourceType>>; | ||
| 166 | |||
| 167 | template <typename InnerFilterType> | ||
| 168 | using Filter2D5x5 = | ||
| 169 | Filter2d<InnerFilterType, 5UL, | ||
| 170 | Filter2dWindowLoader5x5<typename InnerFilterType::SourceType>>; | ||
| 171 | |||
| 172 | template <typename InnerFilterType> | ||
| 173 | using Filter2D7x7 = | ||
| 174 | Filter2d<InnerFilterType, 7UL, | ||
| 175 | Filter2dWindowLoader7x7<typename InnerFilterType::SourceType>>; | ||
| 176 | |||
| 177 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
| 178 | |||
| 179 | #endif // KLEIDICV_FILTER_2D_NEON_H | ||
| 180 |