KleidiCV Coverage Report


Directory: ./
File: kleidicv/include/kleidicv/filters/separable_filter_21x21_neon.h
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 162 162 100.0%
Functions: 9 9 100.0%
Branches: 2 2 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_SEPARABLE_FILTER_21X21_NEON_H
6 #define KLEIDICV_SEPARABLE_FILTER_21X21_NEON_H
7
8 #include "kleidicv/neon.h"
9 #include "kleidicv/workspace/border_21x21.h"
10
11 namespace KLEIDICV_TARGET_NAMESPACE {
12
13 // Template for drivers of separable NxM filters.
14 template <typename FilterType, const size_t S>
15 class SeparableFilter;
16
17 // Driver for a separable 21x21 filter.
18 template <typename FilterType>
19 class SeparableFilter<FilterType, 21UL> {
20 public:
21 using SourceType = typename FilterType::SourceType;
22 using BufferType = typename FilterType::BufferType;
23 using DestinationType = typename FilterType::DestinationType;
24 using SourceVecTraits = typename neon::VecTraits<SourceType>;
25 using SourceVectorType = typename SourceVecTraits::VectorType;
26 using BufferVecTraits = typename neon::VecTraits<BufferType>;
27 using BufferVectorType = typename BufferVecTraits::VectorType;
28 using BorderInfoType =
29 typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo21x21<SourceType>;
30 using BorderType = FixedBorderType;
31 using BorderOffsets = typename BorderInfoType::Offsets;
32
33 32 explicit SeparableFilter(FilterType filter) : filter_{filter} {}
34
35 static constexpr size_t margin = 10UL;
36
37 648 void process_vertical(size_t width, Rows<const SourceType> src_rows,
38 Rows<BufferType> dst_rows,
39 BorderOffsets border_offsets) const {
40 1296 LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
41 648 SourceVecTraits::num_lanes()};
42
43 2904 loop.unroll_once([&](size_t index) {
44 2256 SourceVectorType src[21];
45 2256 src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]);
46 2256 src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]);
47 2256 src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]);
48 2256 src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]);
49 2256 src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]);
50 2256 src[5] = vld1q(&src_rows.at(border_offsets.c5())[index]);
51 2256 src[6] = vld1q(&src_rows.at(border_offsets.c6())[index]);
52 2256 src[7] = vld1q(&src_rows.at(border_offsets.c7())[index]);
53 2256 src[8] = vld1q(&src_rows.at(border_offsets.c8())[index]);
54 2256 src[9] = vld1q(&src_rows.at(border_offsets.c9())[index]);
55 2256 src[10] = vld1q(&src_rows.at(border_offsets.c10())[index]);
56 2256 src[11] = vld1q(&src_rows.at(border_offsets.c11())[index]);
57 2256 src[12] = vld1q(&src_rows.at(border_offsets.c12())[index]);
58 2256 src[13] = vld1q(&src_rows.at(border_offsets.c13())[index]);
59 2256 src[14] = vld1q(&src_rows.at(border_offsets.c14())[index]);
60 2256 src[15] = vld1q(&src_rows.at(border_offsets.c15())[index]);
61 2256 src[16] = vld1q(&src_rows.at(border_offsets.c16())[index]);
62 2256 src[17] = vld1q(&src_rows.at(border_offsets.c17())[index]);
63 2256 src[18] = vld1q(&src_rows.at(border_offsets.c18())[index]);
64 2256 src[19] = vld1q(&src_rows.at(border_offsets.c19())[index]);
65 2256 src[20] = vld1q(&src_rows.at(border_offsets.c20())[index]);
66 2256 filter_.vertical_vector_path(src, &dst_rows[index]);
67 2256 });
68
69 // No tail path needed in NEON, because TryToAvoidTailLoop works for any
70 // supported size (i.e. the minimum size is kernel_size - 1, which is 20,
71 // and the NEON vector length is 16 which is smaller than that).
72 648 }
73
74 648 void process_horizontal(size_t width, Rows<const BufferType> src_rows,
75 Rows<DestinationType> dst_rows,
76 BorderOffsets border_offsets) const {
77 1296 LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
78 648 BufferVecTraits::num_lanes()};
79
80 808 loop.unroll_twice([&](size_t index) {
81 160 auto src_0 = &src_rows.at(0, border_offsets.c0())[index];
82 160 auto src_1 = &src_rows.at(0, border_offsets.c1())[index];
83 160 auto src_2 = &src_rows.at(0, border_offsets.c2())[index];
84 160 auto src_3 = &src_rows.at(0, border_offsets.c3())[index];
85 160 auto src_4 = &src_rows.at(0, border_offsets.c4())[index];
86 160 auto src_5 = &src_rows.at(0, border_offsets.c5())[index];
87 160 auto src_6 = &src_rows.at(0, border_offsets.c6())[index];
88 160 auto src_7 = &src_rows.at(0, border_offsets.c7())[index];
89 160 auto src_8 = &src_rows.at(0, border_offsets.c8())[index];
90 160 auto src_9 = &src_rows.at(0, border_offsets.c9())[index];
91 160 auto src_10 = &src_rows.at(0, border_offsets.c10())[index];
92 160 auto src_11 = &src_rows.at(0, border_offsets.c11())[index];
93 160 auto src_12 = &src_rows.at(0, border_offsets.c12())[index];
94 160 auto src_13 = &src_rows.at(0, border_offsets.c13())[index];
95 160 auto src_14 = &src_rows.at(0, border_offsets.c14())[index];
96 160 auto src_15 = &src_rows.at(0, border_offsets.c15())[index];
97 160 auto src_16 = &src_rows.at(0, border_offsets.c16())[index];
98 160 auto src_17 = &src_rows.at(0, border_offsets.c17())[index];
99 160 auto src_18 = &src_rows.at(0, border_offsets.c18())[index];
100 160 auto src_19 = &src_rows.at(0, border_offsets.c19())[index];
101 160 auto src_20 = &src_rows.at(0, border_offsets.c20())[index];
102
103 160 BufferVectorType src_a[21], src_b[21];
104 160 src_a[0] = vld1q(&src_0[0]);
105 160 src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]);
106 160 src_a[1] = vld1q(&src_1[0]);
107 160 src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]);
108 160 src_a[2] = vld1q(&src_2[0]);
109 160 src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]);
110 160 src_a[3] = vld1q(&src_3[0]);
111 160 src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]);
112 160 src_a[4] = vld1q(&src_4[0]);
113 160 src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]);
114 160 src_a[5] = vld1q(&src_5[0]);
115 160 src_b[5] = vld1q(&src_5[BufferVecTraits::num_lanes()]);
116 160 src_a[6] = vld1q(&src_6[0]);
117 160 src_b[6] = vld1q(&src_6[BufferVecTraits::num_lanes()]);
118 160 src_a[7] = vld1q(&src_7[0]);
119 160 src_b[7] = vld1q(&src_7[BufferVecTraits::num_lanes()]);
120 160 src_a[8] = vld1q(&src_8[0]);
121 160 src_b[8] = vld1q(&src_8[BufferVecTraits::num_lanes()]);
122 160 src_a[9] = vld1q(&src_9[0]);
123 160 src_b[9] = vld1q(&src_9[BufferVecTraits::num_lanes()]);
124 160 src_a[10] = vld1q(&src_10[0]);
125 160 src_b[10] = vld1q(&src_10[BufferVecTraits::num_lanes()]);
126 160 src_a[11] = vld1q(&src_11[0]);
127 160 src_b[11] = vld1q(&src_11[BufferVecTraits::num_lanes()]);
128 160 src_a[12] = vld1q(&src_12[0]);
129 160 src_b[12] = vld1q(&src_12[BufferVecTraits::num_lanes()]);
130 160 src_a[13] = vld1q(&src_13[0]);
131 160 src_b[13] = vld1q(&src_13[BufferVecTraits::num_lanes()]);
132 160 src_a[14] = vld1q(&src_14[0]);
133 160 src_b[14] = vld1q(&src_14[BufferVecTraits::num_lanes()]);
134 160 src_a[15] = vld1q(&src_15[0]);
135 160 src_b[15] = vld1q(&src_15[BufferVecTraits::num_lanes()]);
136 160 src_a[16] = vld1q(&src_16[0]);
137 160 src_b[16] = vld1q(&src_16[BufferVecTraits::num_lanes()]);
138 160 src_a[17] = vld1q(&src_17[0]);
139 160 src_b[17] = vld1q(&src_17[BufferVecTraits::num_lanes()]);
140 160 src_a[18] = vld1q(&src_18[0]);
141 160 src_b[18] = vld1q(&src_18[BufferVecTraits::num_lanes()]);
142 160 src_a[19] = vld1q(&src_19[0]);
143 160 src_b[19] = vld1q(&src_19[BufferVecTraits::num_lanes()]);
144 160 src_a[20] = vld1q(&src_20[0]);
145 160 src_b[20] = vld1q(&src_20[BufferVecTraits::num_lanes()]);
146
147 160 filter_.horizontal_vector_path(src_a, &dst_rows[index]);
148 320 filter_.horizontal_vector_path(
149 160 src_b, &dst_rows[index + BufferVecTraits::num_lanes()]);
150 160 });
151
152 808 loop.unroll_once([&](size_t index) {
153 160 BufferVectorType src[21];
154 160 src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]);
155 160 src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]);
156 160 src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]);
157 160 src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]);
158 160 src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]);
159 160 src[5] = vld1q(&src_rows.at(0, border_offsets.c5())[index]);
160 160 src[6] = vld1q(&src_rows.at(0, border_offsets.c6())[index]);
161 160 src[7] = vld1q(&src_rows.at(0, border_offsets.c7())[index]);
162 160 src[8] = vld1q(&src_rows.at(0, border_offsets.c8())[index]);
163 160 src[9] = vld1q(&src_rows.at(0, border_offsets.c9())[index]);
164 160 src[10] = vld1q(&src_rows.at(0, border_offsets.c10())[index]);
165 160 src[11] = vld1q(&src_rows.at(0, border_offsets.c11())[index]);
166 160 src[12] = vld1q(&src_rows.at(0, border_offsets.c12())[index]);
167 160 src[13] = vld1q(&src_rows.at(0, border_offsets.c13())[index]);
168 160 src[14] = vld1q(&src_rows.at(0, border_offsets.c14())[index]);
169 160 src[15] = vld1q(&src_rows.at(0, border_offsets.c15())[index]);
170 160 src[16] = vld1q(&src_rows.at(0, border_offsets.c16())[index]);
171 160 src[17] = vld1q(&src_rows.at(0, border_offsets.c17())[index]);
172 160 src[18] = vld1q(&src_rows.at(0, border_offsets.c18())[index]);
173 160 src[19] = vld1q(&src_rows.at(0, border_offsets.c19())[index]);
174 160 src[20] = vld1q(&src_rows.at(0, border_offsets.c20())[index]);
175 160 filter_.horizontal_vector_path(src, &dst_rows[index]);
176 160 });
177
178 1616 loop.tail([&](size_t index) {
179 968 process_horizontal_scalar(src_rows, dst_rows, border_offsets, index);
180 968 });
181 648 }
182
183 12960 void process_horizontal_borders(Rows<const BufferType> src_rows,
184 Rows<DestinationType> dst_rows,
185 BorderOffsets border_offsets) const {
186
2/2
✓ Branch 0 taken 12960 times.
✓ Branch 1 taken 22560 times.
35520 for (size_t index = 0; index < src_rows.channels(); ++index) {
187 22560 disable_loop_vectorization();
188 22560 process_horizontal_scalar(src_rows, dst_rows, border_offsets, index);
189 22560 }
190 12960 }
191
192 private:
193 23528 void process_horizontal_scalar(Rows<const BufferType> src_rows,
194 Rows<DestinationType> dst_rows,
195 BorderOffsets border_offsets,
196 size_t index) const {
197 23528 BufferType src[21];
198 23528 src[0] = src_rows.at(0, border_offsets.c0())[index];
199 23528 src[1] = src_rows.at(0, border_offsets.c1())[index];
200 23528 src[2] = src_rows.at(0, border_offsets.c2())[index];
201 23528 src[3] = src_rows.at(0, border_offsets.c3())[index];
202 23528 src[4] = src_rows.at(0, border_offsets.c4())[index];
203 23528 src[5] = src_rows.at(0, border_offsets.c5())[index];
204 23528 src[6] = src_rows.at(0, border_offsets.c6())[index];
205 23528 src[7] = src_rows.at(0, border_offsets.c7())[index];
206 23528 src[8] = src_rows.at(0, border_offsets.c8())[index];
207 23528 src[9] = src_rows.at(0, border_offsets.c9())[index];
208 23528 src[10] = src_rows.at(0, border_offsets.c10())[index];
209 23528 src[11] = src_rows.at(0, border_offsets.c11())[index];
210 23528 src[12] = src_rows.at(0, border_offsets.c12())[index];
211 23528 src[13] = src_rows.at(0, border_offsets.c13())[index];
212 23528 src[14] = src_rows.at(0, border_offsets.c14())[index];
213 23528 src[15] = src_rows.at(0, border_offsets.c15())[index];
214 23528 src[16] = src_rows.at(0, border_offsets.c16())[index];
215 23528 src[17] = src_rows.at(0, border_offsets.c17())[index];
216 23528 src[18] = src_rows.at(0, border_offsets.c18())[index];
217 23528 src[19] = src_rows.at(0, border_offsets.c19())[index];
218 23528 src[20] = src_rows.at(0, border_offsets.c20())[index];
219 23528 filter_.horizontal_scalar_path(src, &dst_rows[index]);
220 23528 }
221
222 FilterType filter_;
223 }; // end of class SeparableFilter<FilterType, 21UL>
224
225 // Shorthand for 21x21 separable filters driver type.
226 template <class FilterType>
227 using SeparableFilter21x21 = SeparableFilter<FilterType, 21UL>;
228
229 } // namespace KLEIDICV_TARGET_NAMESPACE
230
231 #endif // KLEIDICV_SEPARABLE_FILTER_21X21_NEON_H
232