Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #ifndef KLEIDICV_FILTER_2D_NEON_H | ||
6 | #define KLEIDICV_FILTER_2D_NEON_H | ||
7 | |||
8 | #include "filter_2d_window_loader_3x3.h" | ||
9 | #include "filter_2d_window_loader_5x5.h" | ||
10 | #include "filter_2d_window_loader_7x7.h" | ||
11 | #include "kleidicv/neon.h" | ||
12 | #include "process_filter_2d.h" | ||
13 | |||
14 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
15 | |||
16 | template <typename InnerFilterType, size_t KSize, typename WindowLoaderType> | ||
17 | class Filter2d { | ||
18 | public: | ||
19 | using SourceType = typename InnerFilterType::SourceType; | ||
20 | using DestinationType = typename InnerFilterType::DestinationType; | ||
21 | using SourceVecTraits = typename neon::VecTraits<SourceType>; | ||
22 | using DestinationVecTraits = typename neon::VecTraits<DestinationType>; | ||
23 | using SourceVectorType = typename SourceVecTraits::VectorType; | ||
24 | using DestinationVectorType = typename DestinationVecTraits::VectorType; | ||
25 | using BorderType = FixedBorderType; | ||
26 | static constexpr size_t kMargin = KSize / 2; | ||
27 | using BorderInfoType = | ||
28 | typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo<SourceType, KSize>; | ||
29 | using BorderOffsets = typename BorderInfoType::Offsets; | ||
30 | |||
31 | 1610 | explicit Filter2d(InnerFilterType filter) : filter_{filter} {} | |
32 | |||
33 | 8676 | void process_pixels_without_horizontal_borders( | |
34 | size_t width, Rows<const SourceType> src_rows, | ||
35 | Rows<DestinationType> dst_rows, BorderOffsets window_row_offsets, | ||
36 | BorderOffsets window_col_offsets) const { | ||
37 | 17352 | LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(), | |
38 | 8676 | SourceVecTraits::num_lanes()}; | |
39 | |||
40 | 34632 | loop.unroll_once([&](size_t index) { | |
41 | 25956 | SourceVectorType src[KSize][KSize]; | |
42 | 25956 | DestinationVectorType dst_vec; | |
43 | |||
44 | 11224924 | auto KernelWindow = [&](size_t row, size_t col) -> SourceVectorType& { | |
45 | 11198968 | return src[row][col]; | |
46 | }; | ||
47 | |||
48 | 983368 | auto load_array_element = [](const SourceType& x) { return vld1q(&x); }; | |
49 | 51912 | WindowLoaderType::load_window(KernelWindow, load_array_element, src_rows, | |
50 | 25956 | window_row_offsets, window_col_offsets, | |
51 | 25956 | index); | |
52 | |||
53 | 25956 | filter_.vector_path(KernelWindow, dst_vec); | |
54 | 25956 | vst1q(&dst_rows[index], dst_vec); | |
55 | 25956 | }); | |
56 | |||
57 | 13556 | loop.tail([&](size_t index) { | |
58 | 4880 | process_one_element_with_horizontal_borders( | |
59 | 4880 | src_rows, dst_rows, window_row_offsets, window_col_offsets, index); | |
60 | 4880 | }); | |
61 | 8676 | } | |
62 | |||
63 | 2272 | void process_pixels_of_dual_rows_without_horizontal_borders( | |
64 | size_t width, Rows<const SourceType> src_rows, | ||
65 | Rows<DestinationType> dst_rows, BorderOffsets window_row_offsets_0, | ||
66 | BorderOffsets window_row_offsets_1, | ||
67 | BorderOffsets window_col_offsets) const { | ||
68 | 4544 | LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(), | |
69 | 2272 | SourceVecTraits::num_lanes()}; | |
70 | |||
71 | 5750 | loop.unroll_once([&](size_t index) { | |
72 | 3478 | SourceVectorType src[KSize + 1][KSize]; | |
73 | 3478 | DestinationVectorType dst_vec_0; | |
74 | 3478 | DestinationVectorType dst_vec_1; | |
75 | 198246 | auto KernelWindow = [&](size_t row, size_t col) -> SourceVectorType& { | |
76 | 194768 | return src[row][col]; | |
77 | }; | ||
78 | |||
79 | 45214 | auto load_array_element = [](const SourceType& x) { return vld1q(&x); }; | |
80 | 3478 | WindowLoaderType::load_window_to_handle_dual_rows( | |
81 | 3478 | KernelWindow, load_array_element, src_rows, window_row_offsets_0, | |
82 | 3478 | window_row_offsets_1, window_col_offsets, index); | |
83 | |||
84 | 3478 | filter_.vector_path_for_dual_row_handling(KernelWindow, dst_vec_0, | |
85 | dst_vec_1); | ||
86 | 3478 | vst1q(&dst_rows.at(0, 0)[index], dst_vec_0); | |
87 | 3478 | vst1q(&dst_rows.at(1, 0)[index], dst_vec_1); | |
88 | 3478 | }); | |
89 | |||
90 | 4440 | loop.tail([&](size_t index) { | |
91 | 2168 | process_two_element_vertically_with_or_without_horizontal_borders( | |
92 | 2168 | src_rows, dst_rows, window_row_offsets_0, window_row_offsets_1, | |
93 | 2168 | window_col_offsets, index); | |
94 | 2168 | }); | |
95 | 2272 | } | |
96 | |||
97 | 42704 | void process_one_pixel_with_horizontal_borders( | |
98 | Rows<const SourceType> src_rows, Rows<DestinationType> dst_rows, | ||
99 | BorderOffsets window_row_offsets, | ||
100 | BorderOffsets window_col_offsets) const KLEIDICV_STREAMING { | ||
101 |
42/42✓ Branch 0 taken 64 times.
✓ Branch 1 taken 160 times.
✓ Branch 2 taken 1232 times.
✓ Branch 3 taken 3080 times.
✓ Branch 4 taken 2496 times.
✓ Branch 5 taken 6240 times.
✓ Branch 6 taken 72 times.
✓ Branch 7 taken 168 times.
✓ Branch 8 taken 3272 times.
✓ Branch 9 taken 5120 times.
✓ Branch 10 taken 4488 times.
✓ Branch 11 taken 8232 times.
✓ Branch 12 taken 72 times.
✓ Branch 13 taken 168 times.
✓ Branch 14 taken 3272 times.
✓ Branch 15 taken 5120 times.
✓ Branch 16 taken 4488 times.
✓ Branch 17 taken 8232 times.
✓ Branch 18 taken 72 times.
✓ Branch 19 taken 168 times.
✓ Branch 20 taken 3272 times.
✓ Branch 21 taken 5120 times.
✓ Branch 22 taken 4488 times.
✓ Branch 23 taken 8232 times.
✓ Branch 24 taken 64 times.
✓ Branch 25 taken 160 times.
✓ Branch 26 taken 1232 times.
✓ Branch 27 taken 3080 times.
✓ Branch 28 taken 2496 times.
✓ Branch 29 taken 6240 times.
✓ Branch 30 taken 64 times.
✓ Branch 31 taken 160 times.
✓ Branch 32 taken 1232 times.
✓ Branch 33 taken 3080 times.
✓ Branch 34 taken 2496 times.
✓ Branch 35 taken 6240 times.
✓ Branch 36 taken 72 times.
✓ Branch 37 taken 168 times.
✓ Branch 38 taken 3272 times.
✓ Branch 39 taken 5120 times.
✓ Branch 40 taken 4488 times.
✓ Branch 41 taken 8232 times.
|
125224 | for (size_t index = 0; index < src_rows.channels(); ++index) { |
102 | 82520 | disable_loop_vectorization(); | |
103 | 82520 | process_one_element_with_horizontal_borders( | |
104 | 82520 | src_rows, dst_rows, window_row_offsets, window_col_offsets, index); | |
105 | 82520 | } | |
106 | 42704 | } | |
107 | |||
108 | // Processes two vertically adjacent pixels in a single column | ||
109 | 4544 | void process_two_pixels_with_horizontal_borders( | |
110 | Rows<const SourceType> src_rows, Rows<DestinationType> dst_rows, | ||
111 | BorderOffsets window_row_offsets_0, BorderOffsets window_row_offsets_1, | ||
112 | BorderOffsets window_col_offsets) const { | ||
113 |
14/14✓ Branch 0 taken 128 times.
✓ Branch 1 taken 320 times.
✓ Branch 2 taken 1040 times.
✓ Branch 3 taken 1232 times.
✓ Branch 4 taken 1040 times.
✓ Branch 5 taken 1232 times.
✓ Branch 6 taken 1040 times.
✓ Branch 7 taken 1232 times.
✓ Branch 8 taken 128 times.
✓ Branch 9 taken 320 times.
✓ Branch 10 taken 128 times.
✓ Branch 11 taken 320 times.
✓ Branch 12 taken 1040 times.
✓ Branch 13 taken 1232 times.
|
10432 | for (size_t index = 0; index < src_rows.channels(); ++index) { |
114 | 5888 | disable_loop_vectorization(); | |
115 | 5888 | process_two_element_vertically_with_or_without_horizontal_borders( | |
116 | 5888 | src_rows, dst_rows, window_row_offsets_0, window_row_offsets_1, | |
117 | 5888 | window_col_offsets, index); | |
118 | 5888 | } | |
119 | 4544 | } | |
120 | |||
121 | private: | ||
122 | 87400 | void process_one_element_with_horizontal_borders( | |
123 | Rows<const SourceType> src_rows, Rows<DestinationType> dst_rows, | ||
124 | BorderOffsets window_row_offsets, BorderOffsets window_col_offsets, | ||
125 | size_t index) const KLEIDICV_STREAMING { | ||
126 | 87400 | SourceType src[KSize][KSize]; | |
127 | |||
128 | 87400 | auto KernelWindow = | |
129 | 41533720 | [&](size_t row, size_t col) | |
130 | 41446320 | KLEIDICV_STREAMING -> SourceType& { return src[row][col]; }; | |
131 | |||
132 | 3552016 | auto load_array_element = [&](const SourceType& x) | |
133 | 3464616 | KLEIDICV_STREAMING { return x; }; | |
134 | |||
135 | 174800 | WindowLoaderType::load_window(KernelWindow, load_array_element, src_rows, | |
136 | 87400 | window_row_offsets, window_col_offsets, | |
137 | 87400 | index); | |
138 | |||
139 | 87400 | filter_.scalar_path(KernelWindow, dst_rows[index]); | |
140 | 87400 | } | |
141 | |||
142 | 8056 | void process_two_element_vertically_with_or_without_horizontal_borders( | |
143 | Rows<const SourceType> src_rows, Rows<DestinationType> dst_rows, | ||
144 | BorderOffsets window_row_offsets_0, BorderOffsets window_row_offsets_1, | ||
145 | BorderOffsets window_col_offsets, size_t index) const { | ||
146 | 8056 | SourceType src[KSize + 1][KSize]; | |
147 | 459192 | auto KernelWindow = [&](size_t row, size_t col) -> SourceType& { | |
148 | 451136 | return src[row][col]; | |
149 | }; | ||
150 | 104728 | auto load_array_element = [](const SourceType& x) { return x; }; | |
151 | 8056 | WindowLoaderType::load_window_to_handle_dual_rows( | |
152 | 8056 | KernelWindow, load_array_element, src_rows, window_row_offsets_0, | |
153 | 8056 | window_row_offsets_1, window_col_offsets, index); | |
154 | |||
155 | 8056 | filter_.scalar_path_for_dual_row_handling( | |
156 | 8056 | KernelWindow, dst_rows.at(0, 0)[index], dst_rows.at(1, 0)[index]); | |
157 | 8056 | } | |
158 | |||
159 | InnerFilterType filter_; | ||
160 | }; | ||
161 | |||
162 | template <typename InnerFilterType> | ||
163 | using Filter2D3x3 = | ||
164 | Filter2d<InnerFilterType, 3UL, | ||
165 | Filter2dWindowLoader3x3<typename InnerFilterType::SourceType>>; | ||
166 | |||
167 | template <typename InnerFilterType> | ||
168 | using Filter2D5x5 = | ||
169 | Filter2d<InnerFilterType, 5UL, | ||
170 | Filter2dWindowLoader5x5<typename InnerFilterType::SourceType>>; | ||
171 | |||
172 | template <typename InnerFilterType> | ||
173 | using Filter2D7x7 = | ||
174 | Filter2d<InnerFilterType, 7UL, | ||
175 | Filter2dWindowLoader7x7<typename InnerFilterType::SourceType>>; | ||
176 | |||
177 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
178 | |||
179 | #endif // KLEIDICV_FILTER_2D_NEON_H | ||
180 |