KleidiCV Coverage Report


Directory: ./
File: kleidicv/include/kleidicv/filters/separable_filter_9x9_neon.h
Date: 2026-03-05 15:57:40
Exec Total Coverage
Lines: 103 103 100.0%
Functions: 20 20 100.0%
Branches: 4 4 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2026 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_SEPARABLE_FILTER_9X9_NEON_H
6 #define KLEIDICV_SEPARABLE_FILTER_9X9_NEON_H
7
8 #include "kleidicv/neon.h"
9 #include "kleidicv/workspace/border_9x9.h"
10
11 namespace KLEIDICV_TARGET_NAMESPACE {
12
13 // Template for drivers of separable NxM filters.
14 template <typename FilterType, const size_t S>
15 class SeparableFilter;
16
17 // Driver for a separable 9x9 filter.
18 template <typename FilterType>
19 class SeparableFilter<FilterType, 9UL> {
20 public:
21 using SourceType = typename FilterType::SourceType;
22 using BufferType = typename FilterType::BufferType;
23 using DestinationType = typename FilterType::DestinationType;
24 using SourceVecTraits = typename neon::VecTraits<SourceType>;
25 using SourceVectorType = typename SourceVecTraits::VectorType;
26 using BufferVecTraits = typename neon::VecTraits<BufferType>;
27 using BufferVectorType = typename BufferVecTraits::VectorType;
28 using BorderInfoType =
29 typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo9x9<SourceType>;
30 using BorderType = FixedBorderType;
31 using BorderOffsets = typename BorderInfoType::Offsets;
32
33 68 explicit SeparableFilter(FilterType filter) : filter_{filter} {}
34
35 static constexpr size_t margin = 4UL;
36
37 612 void process_vertical(size_t width, Rows<const SourceType> src_rows,
38 Rows<BufferType> dst_rows,
39 BorderOffsets border_offsets) const {
40 1224 LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
41 612 SourceVecTraits::num_lanes()};
42
43 1424 loop.unroll_once([&](size_t index) {
44 812 SourceVectorType src[9];
45 812 src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]);
46 812 src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]);
47 812 src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]);
48 812 src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]);
49 812 src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]);
50 812 src[5] = vld1q(&src_rows.at(border_offsets.c5())[index]);
51 812 src[6] = vld1q(&src_rows.at(border_offsets.c6())[index]);
52 812 src[7] = vld1q(&src_rows.at(border_offsets.c7())[index]);
53 812 src[8] = vld1q(&src_rows.at(border_offsets.c8())[index]);
54 812 filter_.vertical_vector_path(src, &dst_rows[index]);
55 812 });
56
57 3276 loop.tail([&](size_t index) {
58 2664 SourceType src[9];
59 2664 src[0] = src_rows.at(border_offsets.c0())[index];
60 2664 src[1] = src_rows.at(border_offsets.c1())[index];
61 2664 src[2] = src_rows.at(border_offsets.c2())[index];
62 2664 src[3] = src_rows.at(border_offsets.c3())[index];
63 2664 src[4] = src_rows.at(border_offsets.c4())[index];
64 2664 src[5] = src_rows.at(border_offsets.c5())[index];
65 2664 src[6] = src_rows.at(border_offsets.c6())[index];
66 2664 src[7] = src_rows.at(border_offsets.c7())[index];
67 2664 src[8] = src_rows.at(border_offsets.c8())[index];
68 2664 filter_.vertical_scalar_path(src, &dst_rows[index]);
69 2664 });
70 612 }
71
72 612 void process_horizontal(size_t width, Rows<const BufferType> src_rows,
73 Rows<DestinationType> dst_rows,
74 BorderOffsets border_offsets) const {
75 1224 LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
76 612 BufferVecTraits::num_lanes()};
77
78 812 loop.unroll_twice([&](size_t index) {
79 200 auto src_0 = &src_rows.at(0, border_offsets.c0())[index];
80 200 auto src_1 = &src_rows.at(0, border_offsets.c1())[index];
81 200 auto src_2 = &src_rows.at(0, border_offsets.c2())[index];
82 200 auto src_3 = &src_rows.at(0, border_offsets.c3())[index];
83 200 auto src_4 = &src_rows.at(0, border_offsets.c4())[index];
84 200 auto src_5 = &src_rows.at(0, border_offsets.c5())[index];
85 200 auto src_6 = &src_rows.at(0, border_offsets.c6())[index];
86 200 auto src_7 = &src_rows.at(0, border_offsets.c7())[index];
87 200 auto src_8 = &src_rows.at(0, border_offsets.c8())[index];
88
89 200 BufferVectorType src_a[9], src_b[9];
90 200 src_a[0] = vld1q(&src_0[0]);
91 200 src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]);
92 200 src_a[1] = vld1q(&src_1[0]);
93 200 src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]);
94 200 src_a[2] = vld1q(&src_2[0]);
95 200 src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]);
96 200 src_a[3] = vld1q(&src_3[0]);
97 200 src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]);
98 200 src_a[4] = vld1q(&src_4[0]);
99 200 src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]);
100 200 src_a[5] = vld1q(&src_5[0]);
101 200 src_b[5] = vld1q(&src_5[BufferVecTraits::num_lanes()]);
102 200 src_a[6] = vld1q(&src_6[0]);
103 200 src_b[6] = vld1q(&src_6[BufferVecTraits::num_lanes()]);
104 200 src_a[7] = vld1q(&src_7[0]);
105 200 src_b[7] = vld1q(&src_7[BufferVecTraits::num_lanes()]);
106 200 src_a[8] = vld1q(&src_8[0]);
107 200 src_b[8] = vld1q(&src_8[BufferVecTraits::num_lanes()]);
108
109 200 filter_.horizontal_vector_path(src_a, &dst_rows[index]);
110 400 filter_.horizontal_vector_path(
111 200 src_b, &dst_rows[index + BufferVecTraits::num_lanes()]);
112 200 });
113
114 836 loop.unroll_once([&](size_t index) {
115 224 BufferVectorType src[9];
116 224 src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]);
117 224 src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]);
118 224 src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]);
119 224 src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]);
120 224 src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]);
121 224 src[5] = vld1q(&src_rows.at(0, border_offsets.c5())[index]);
122 224 src[6] = vld1q(&src_rows.at(0, border_offsets.c6())[index]);
123 224 src[7] = vld1q(&src_rows.at(0, border_offsets.c7())[index]);
124 224 src[8] = vld1q(&src_rows.at(0, border_offsets.c8())[index]);
125 224 filter_.horizontal_vector_path(src, &dst_rows[index]);
126 224 });
127
128 1388 loop.tail([&](size_t index) {
129 776 process_horizontal_scalar(src_rows, dst_rows, border_offsets, index);
130 776 });
131 612 }
132
133 4896 void process_horizontal_borders(Rows<const BufferType> src_rows,
134 Rows<DestinationType> dst_rows,
135 BorderOffsets border_offsets) const {
136
4/4
✓ Branch 0 taken 3776 times.
✓ Branch 1 taken 4544 times.
✓ Branch 2 taken 1120 times.
✓ Branch 3 taken 1888 times.
11328 for (size_t index = 0; index < src_rows.channels(); ++index) {
137 6432 disable_loop_vectorization();
138 6432 process_horizontal_scalar(src_rows, dst_rows, border_offsets, index);
139 6432 }
140 4896 }
141
142 private:
143 7208 void process_horizontal_scalar(Rows<const BufferType> src_rows,
144 Rows<DestinationType> dst_rows,
145 BorderOffsets border_offsets,
146 size_t index) const {
147 7208 BufferType src[9];
148 7208 src[0] = src_rows.at(0, border_offsets.c0())[index];
149 7208 src[1] = src_rows.at(0, border_offsets.c1())[index];
150 7208 src[2] = src_rows.at(0, border_offsets.c2())[index];
151 7208 src[3] = src_rows.at(0, border_offsets.c3())[index];
152 7208 src[4] = src_rows.at(0, border_offsets.c4())[index];
153 7208 src[5] = src_rows.at(0, border_offsets.c5())[index];
154 7208 src[6] = src_rows.at(0, border_offsets.c6())[index];
155 7208 src[7] = src_rows.at(0, border_offsets.c7())[index];
156 7208 src[8] = src_rows.at(0, border_offsets.c8())[index];
157 7208 filter_.horizontal_scalar_path(src, &dst_rows[index]);
158 7208 }
159
160 FilterType filter_;
161 }; // end of class SeparableFilter<FilterType, 9UL>
162
163 // Shorthand for 9x9 separable filters driver type.
164 template <class FilterType>
165 using SeparableFilter9x9 = SeparableFilter<FilterType, 9UL>;
166
167 } // namespace KLEIDICV_TARGET_NAMESPACE
168
169 #endif // KLEIDICV_SEPARABLE_FILTER_9X9_NEON_H
170