KleidiCV Coverage Report


Directory: ./
File: kleidicv/include/kleidicv/filters/separable_filter_15x15_neon.h
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 145 145 100.0%
Functions: 10 10 100.0%
Branches: 2 2 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_SEPARABLE_FILTER_15X15_NEON_H
6 #define KLEIDICV_SEPARABLE_FILTER_15X15_NEON_H
7
8 #include "kleidicv/neon.h"
9 #include "kleidicv/workspace/border_15x15.h"
10
11 namespace KLEIDICV_TARGET_NAMESPACE {
12
13 // Template for drivers of separable NxM filters.
14 template <typename FilterType, const size_t S>
15 class SeparableFilter;
16
17 // Driver for a separable 15x15 filter.
18 template <typename FilterType>
19 class SeparableFilter<FilterType, 15UL> {
20 public:
21 using SourceType = typename FilterType::SourceType;
22 using BufferType = typename FilterType::BufferType;
23 using DestinationType = typename FilterType::DestinationType;
24 using SourceVecTraits = typename neon::VecTraits<SourceType>;
25 using SourceVectorType = typename SourceVecTraits::VectorType;
26 using BufferVecTraits = typename neon::VecTraits<BufferType>;
27 using BufferVectorType = typename BufferVecTraits::VectorType;
28 using BorderInfoType =
29 typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo15x15<SourceType>;
30 using BorderType = FixedBorderType;
31 using BorderOffsets = typename BorderInfoType::Offsets;
32
33 32 explicit SeparableFilter(FilterType filter) : filter_{filter} {}
34
35 static constexpr size_t margin = 7UL;
36
37 456 void process_vertical(size_t width, Rows<const SourceType> src_rows,
38 Rows<BufferType> dst_rows,
39 BorderOffsets border_offsets) const {
40 912 LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
41 456 SourceVecTraits::num_lanes()};
42
43 1464 loop.unroll_once([&](size_t index) {
44 1008 SourceVectorType src[15];
45 1008 src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]);
46 1008 src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]);
47 1008 src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]);
48 1008 src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]);
49 1008 src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]);
50 1008 src[5] = vld1q(&src_rows.at(border_offsets.c5())[index]);
51 1008 src[6] = vld1q(&src_rows.at(border_offsets.c6())[index]);
52 1008 src[7] = vld1q(&src_rows.at(border_offsets.c7())[index]);
53 1008 src[8] = vld1q(&src_rows.at(border_offsets.c8())[index]);
54 1008 src[9] = vld1q(&src_rows.at(border_offsets.c9())[index]);
55 1008 src[10] = vld1q(&src_rows.at(border_offsets.c10())[index]);
56 1008 src[11] = vld1q(&src_rows.at(border_offsets.c11())[index]);
57 1008 src[12] = vld1q(&src_rows.at(border_offsets.c12())[index]);
58 1008 src[13] = vld1q(&src_rows.at(border_offsets.c13())[index]);
59 1008 src[14] = vld1q(&src_rows.at(border_offsets.c14())[index]);
60 1008 filter_.vertical_vector_path(src, &dst_rows[index]);
61 1008 });
62
63 2256 loop.tail([&](size_t index) {
64 1800 SourceType src[15];
65 1800 src[0] = src_rows.at(border_offsets.c0())[index];
66 1800 src[1] = src_rows.at(border_offsets.c1())[index];
67 1800 src[2] = src_rows.at(border_offsets.c2())[index];
68 1800 src[3] = src_rows.at(border_offsets.c3())[index];
69 1800 src[4] = src_rows.at(border_offsets.c4())[index];
70 1800 src[5] = src_rows.at(border_offsets.c5())[index];
71 1800 src[6] = src_rows.at(border_offsets.c6())[index];
72 1800 src[7] = src_rows.at(border_offsets.c7())[index];
73 1800 src[8] = src_rows.at(border_offsets.c8())[index];
74 1800 src[9] = src_rows.at(border_offsets.c9())[index];
75 1800 src[10] = src_rows.at(border_offsets.c10())[index];
76 1800 src[11] = src_rows.at(border_offsets.c11())[index];
77 1800 src[12] = src_rows.at(border_offsets.c12())[index];
78 1800 src[13] = src_rows.at(border_offsets.c13())[index];
79 1800 src[14] = src_rows.at(border_offsets.c14())[index];
80 1800 filter_.vertical_scalar_path(src, &dst_rows[index]);
81 1800 });
82 456 }
83
84 456 void process_horizontal(size_t width, Rows<const BufferType> src_rows,
85 Rows<DestinationType> dst_rows,
86 BorderOffsets border_offsets) const {
87 912 LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
88 456 BufferVecTraits::num_lanes()};
89
90 568 loop.unroll_twice([&](size_t index) {
91 112 auto src_0 = &src_rows.at(0, border_offsets.c0())[index];
92 112 auto src_1 = &src_rows.at(0, border_offsets.c1())[index];
93 112 auto src_2 = &src_rows.at(0, border_offsets.c2())[index];
94 112 auto src_3 = &src_rows.at(0, border_offsets.c3())[index];
95 112 auto src_4 = &src_rows.at(0, border_offsets.c4())[index];
96 112 auto src_5 = &src_rows.at(0, border_offsets.c5())[index];
97 112 auto src_6 = &src_rows.at(0, border_offsets.c6())[index];
98 112 auto src_7 = &src_rows.at(0, border_offsets.c7())[index];
99 112 auto src_8 = &src_rows.at(0, border_offsets.c8())[index];
100 112 auto src_9 = &src_rows.at(0, border_offsets.c9())[index];
101 112 auto src_10 = &src_rows.at(0, border_offsets.c10())[index];
102 112 auto src_11 = &src_rows.at(0, border_offsets.c11())[index];
103 112 auto src_12 = &src_rows.at(0, border_offsets.c12())[index];
104 112 auto src_13 = &src_rows.at(0, border_offsets.c13())[index];
105 112 auto src_14 = &src_rows.at(0, border_offsets.c14())[index];
106
107 112 BufferVectorType src_a[15], src_b[15];
108 112 src_a[0] = vld1q(&src_0[0]);
109 112 src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]);
110 112 src_a[1] = vld1q(&src_1[0]);
111 112 src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]);
112 112 src_a[2] = vld1q(&src_2[0]);
113 112 src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]);
114 112 src_a[3] = vld1q(&src_3[0]);
115 112 src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]);
116 112 src_a[4] = vld1q(&src_4[0]);
117 112 src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]);
118 112 src_a[5] = vld1q(&src_5[0]);
119 112 src_b[5] = vld1q(&src_5[BufferVecTraits::num_lanes()]);
120 112 src_a[6] = vld1q(&src_6[0]);
121 112 src_b[6] = vld1q(&src_6[BufferVecTraits::num_lanes()]);
122 112 src_a[7] = vld1q(&src_7[0]);
123 112 src_b[7] = vld1q(&src_7[BufferVecTraits::num_lanes()]);
124 112 src_a[8] = vld1q(&src_8[0]);
125 112 src_b[8] = vld1q(&src_8[BufferVecTraits::num_lanes()]);
126 112 src_a[9] = vld1q(&src_9[0]);
127 112 src_b[9] = vld1q(&src_9[BufferVecTraits::num_lanes()]);
128 112 src_a[10] = vld1q(&src_10[0]);
129 112 src_b[10] = vld1q(&src_10[BufferVecTraits::num_lanes()]);
130 112 src_a[11] = vld1q(&src_11[0]);
131 112 src_b[11] = vld1q(&src_11[BufferVecTraits::num_lanes()]);
132 112 src_a[12] = vld1q(&src_12[0]);
133 112 src_b[12] = vld1q(&src_12[BufferVecTraits::num_lanes()]);
134 112 src_a[13] = vld1q(&src_13[0]);
135 112 src_b[13] = vld1q(&src_13[BufferVecTraits::num_lanes()]);
136 112 src_a[14] = vld1q(&src_14[0]);
137 112 src_b[14] = vld1q(&src_14[BufferVecTraits::num_lanes()]);
138
139 112 filter_.horizontal_vector_path(src_a, &dst_rows[index]);
140 224 filter_.horizontal_vector_path(
141 112 src_b, &dst_rows[index + BufferVecTraits::num_lanes()]);
142 112 });
143
144 568 loop.unroll_once([&](size_t index) {
145 112 BufferVectorType src[15];
146 112 src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]);
147 112 src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]);
148 112 src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]);
149 112 src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]);
150 112 src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]);
151 112 src[5] = vld1q(&src_rows.at(0, border_offsets.c5())[index]);
152 112 src[6] = vld1q(&src_rows.at(0, border_offsets.c6())[index]);
153 112 src[7] = vld1q(&src_rows.at(0, border_offsets.c7())[index]);
154 112 src[8] = vld1q(&src_rows.at(0, border_offsets.c8())[index]);
155 112 src[9] = vld1q(&src_rows.at(0, border_offsets.c9())[index]);
156 112 src[10] = vld1q(&src_rows.at(0, border_offsets.c10())[index]);
157 112 src[11] = vld1q(&src_rows.at(0, border_offsets.c11())[index]);
158 112 src[12] = vld1q(&src_rows.at(0, border_offsets.c12())[index]);
159 112 src[13] = vld1q(&src_rows.at(0, border_offsets.c13())[index]);
160 112 src[14] = vld1q(&src_rows.at(0, border_offsets.c14())[index]);
161 112 filter_.horizontal_vector_path(src, &dst_rows[index]);
162 112 });
163
164 1136 loop.tail([&](size_t index) {
165 680 process_horizontal_scalar(src_rows, dst_rows, border_offsets, index);
166 680 });
167 456 }
168
169 6384 void process_horizontal_borders(Rows<const BufferType> src_rows,
170 Rows<DestinationType> dst_rows,
171 BorderOffsets border_offsets) const {
172
2/2
✓ Branch 0 taken 6384 times.
✓ Branch 1 taken 11088 times.
17472 for (size_t index = 0; index < src_rows.channels(); ++index) {
173 11088 disable_loop_vectorization();
174 11088 process_horizontal_scalar(src_rows, dst_rows, border_offsets, index);
175 11088 }
176 6384 }
177
178 private:
179 11768 void process_horizontal_scalar(Rows<const BufferType> src_rows,
180 Rows<DestinationType> dst_rows,
181 BorderOffsets border_offsets,
182 size_t index) const {
183 11768 BufferType src[15];
184 11768 src[0] = src_rows.at(0, border_offsets.c0())[index];
185 11768 src[1] = src_rows.at(0, border_offsets.c1())[index];
186 11768 src[2] = src_rows.at(0, border_offsets.c2())[index];
187 11768 src[3] = src_rows.at(0, border_offsets.c3())[index];
188 11768 src[4] = src_rows.at(0, border_offsets.c4())[index];
189 11768 src[5] = src_rows.at(0, border_offsets.c5())[index];
190 11768 src[6] = src_rows.at(0, border_offsets.c6())[index];
191 11768 src[7] = src_rows.at(0, border_offsets.c7())[index];
192 11768 src[8] = src_rows.at(0, border_offsets.c8())[index];
193 11768 src[9] = src_rows.at(0, border_offsets.c9())[index];
194 11768 src[10] = src_rows.at(0, border_offsets.c10())[index];
195 11768 src[11] = src_rows.at(0, border_offsets.c11())[index];
196 11768 src[12] = src_rows.at(0, border_offsets.c12())[index];
197 11768 src[13] = src_rows.at(0, border_offsets.c13())[index];
198 11768 src[14] = src_rows.at(0, border_offsets.c14())[index];
199 11768 filter_.horizontal_scalar_path(src, &dst_rows[index]);
200 11768 }
201
202 FilterType filter_;
203 }; // end of class SeparableFilter<FilterType, 15UL>
204
205 // Shorthand for 15x15 separable filters driver type.
206 template <class FilterType>
207 using SeparableFilter15x15 = SeparableFilter<FilterType, 15UL>;
208
209 } // namespace KLEIDICV_TARGET_NAMESPACE
210
211 #endif // KLEIDICV_SEPARABLE_FILTER_15X15_NEON_H
212