KleidiCV Coverage Report


Directory: ./
File: kleidicv/include/kleidicv/filters/separable_filter_7x7_neon.h
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 89 89 100.0%
Functions: 20 20 100.0%
Branches: 4 4 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H
6 #define KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H
7
8 #include "kleidicv/neon.h"
9 #include "kleidicv/workspace/border_7x7.h"
10
11 namespace KLEIDICV_TARGET_NAMESPACE {
12
13 // Template for drivers of separable NxM filters.
// Primary template: only declared in this listing, not defined. FilterType is
// the filter implementation to be driven; S selects the driver specialization
// (presumably the kernel size -- the specialization below uses 7UL).
14 template <typename FilterType, const size_t S>
15 class SeparableFilter;
16
17 // Driver for a separable 7x7 filter.
//
// Holds a FilterType by value and feeds it seven input values (or seven input
// vectors) per output position. The actual arithmetic is delegated to
// FilterType's vertical_vector_path / vertical_scalar_path /
// horizontal_vector_path / horizontal_scalar_path members; this class only
// gathers the seven taps and chooses between the vector and scalar paths.
18 template <typename FilterType>
19 class SeparableFilter<FilterType, 7UL> {
20 public:
// Element types are taken from the filter: the vertical pass reads SourceType
// and writes BufferType; the horizontal pass reads BufferType and writes
// DestinationType.
21 using SourceType = typename FilterType::SourceType;
22 using BufferType = typename FilterType::BufferType;
23 using DestinationType = typename FilterType::DestinationType;
// NEON vector traits and vector types matching the source and buffer elements.
24 using SourceVecTraits = typename neon::VecTraits<SourceType>;
25 using SourceVectorType = typename SourceVecTraits::VectorType;
26 using BufferVecTraits = typename neon::VecTraits<BufferType>;
27 using BufferVectorType = typename BufferVecTraits::VectorType;
// Border handling types for a 7x7 window; BorderOffsets supplies the seven
// offsets c0()..c6() consumed by the process_* methods below.
28 using BorderInfoType =
29 typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo7x7<SourceType>;
30 using BorderType = FixedBorderType;
31 using BorderOffsets = typename BorderInfoType::Offsets;
32
// Stores a copy of the given filter in filter_.
33 50 explicit SeparableFilter(FilterType filter) : filter_{filter} {}
34
// NOTE(review): presumably the number of border elements on each side of the
// centre tap for a 7-wide kernel (7 / 2) -- confirm against the users of margin.
35 static constexpr size_t margin = 3UL;
36
// Vertical pass. For every element index covered by the loop, reads one value
// (or one vector) from each of the seven rows src_rows.at(c0()) ..
// src_rows.at(c6()) and hands them to the filter, which writes its result at
// &dst_rows[index].
// The loop is constructed from width * src_rows.channels() and
// SourceVecTraits::num_lanes() -- presumably the total element count and the
// vector step; see LoopUnroll2 to confirm.
37 308 void process_vertical(size_t width, Rows<const SourceType> src_rows,
38 Rows<BufferType> dst_rows,
39 BorderOffsets border_offsets) const {
40 616 LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
41 308 SourceVecTraits::num_lanes()};
42
// Vector path: one full vector loaded from each of the seven rows.
43 692 loop.unroll_once([&](size_t index) {
44 384 SourceVectorType src[7];
45 384 src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]);
46 384 src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]);
47 384 src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]);
48 384 src[3] = vld1q(&src_rows.at(border_offsets.c3())[index]);
49 384 src[4] = vld1q(&src_rows.at(border_offsets.c4())[index]);
50 384 src[5] = vld1q(&src_rows.at(border_offsets.c5())[index]);
51 384 src[6] = vld1q(&src_rows.at(border_offsets.c6())[index]);
52 384 filter_.vertical_vector_path(src, &dst_rows[index]);
53 384 });
54
// Scalar path: same seven-row gather, one element at a time.
55 1540 loop.tail([&](size_t index) {
56 1232 SourceType src[7];
57 1232 src[0] = src_rows.at(border_offsets.c0())[index];
58 1232 src[1] = src_rows.at(border_offsets.c1())[index];
59 1232 src[2] = src_rows.at(border_offsets.c2())[index];
60 1232 src[3] = src_rows.at(border_offsets.c3())[index];
61 1232 src[4] = src_rows.at(border_offsets.c4())[index];
62 1232 src[5] = src_rows.at(border_offsets.c5())[index];
63 1232 src[6] = src_rows.at(border_offsets.c6())[index];
64 1232 filter_.vertical_scalar_path(src, &dst_rows[index]);
65 1232 });
66 308 }
67
// Horizontal pass. Unlike the vertical pass, the seven taps all come from
// src_rows.at(0, cN()): the offset is passed as the second argument of at()
// (presumably a column offset within the current row -- confirm in Rows).
// Three paths are registered on the loop: two vectors per step, one vector per
// step, and a scalar tail.
68 308 void process_horizontal(size_t width, Rows<const BufferType> src_rows,
69 Rows<DestinationType> dst_rows,
70 BorderOffsets border_offsets) const {
71 616 LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
72 308 BufferVecTraits::num_lanes()};
73
// Two-vector path: for each tap, load the vector at `index` (src_a) and the
// next one at `index + num_lanes()` (src_b), then filter each set separately.
74 356 loop.unroll_twice([&](size_t index) {
75 48 auto src_0 = &src_rows.at(0, border_offsets.c0())[index];
76 48 auto src_1 = &src_rows.at(0, border_offsets.c1())[index];
77 48 auto src_2 = &src_rows.at(0, border_offsets.c2())[index];
78 48 auto src_3 = &src_rows.at(0, border_offsets.c3())[index];
79 48 auto src_4 = &src_rows.at(0, border_offsets.c4())[index];
80 48 auto src_5 = &src_rows.at(0, border_offsets.c5())[index];
81 48 auto src_6 = &src_rows.at(0, border_offsets.c6())[index];
82
83 48 BufferVectorType src_a[7], src_b[7];
84 48 src_a[0] = vld1q(&src_0[0]);
85 48 src_b[0] = vld1q(&src_0[BufferVecTraits::num_lanes()]);
86 48 src_a[1] = vld1q(&src_1[0]);
87 48 src_b[1] = vld1q(&src_1[BufferVecTraits::num_lanes()]);
88 48 src_a[2] = vld1q(&src_2[0]);
89 48 src_b[2] = vld1q(&src_2[BufferVecTraits::num_lanes()]);
90 48 src_a[3] = vld1q(&src_3[0]);
91 48 src_b[3] = vld1q(&src_3[BufferVecTraits::num_lanes()]);
92 48 src_a[4] = vld1q(&src_4[0]);
93 48 src_b[4] = vld1q(&src_4[BufferVecTraits::num_lanes()]);
94 48 src_a[5] = vld1q(&src_5[0]);
95 48 src_b[5] = vld1q(&src_5[BufferVecTraits::num_lanes()]);
96 48 src_a[6] = vld1q(&src_6[0]);
97 48 src_b[6] = vld1q(&src_6[BufferVecTraits::num_lanes()]);
98
// First result goes to dst_rows[index], second one num_lanes() elements later.
99 48 filter_.horizontal_vector_path(src_a, &dst_rows[index]);
100 96 filter_.horizontal_vector_path(
101 48 src_b, &dst_rows[index + BufferVecTraits::num_lanes()]);
102 48 });
103
// Single-vector path: one vector per tap.
104 500 loop.unroll_once([&](size_t index) {
105 192 BufferVectorType src[7];
106 192 src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]);
107 192 src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]);
108 192 src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]);
109 192 src[3] = vld1q(&src_rows.at(0, border_offsets.c3())[index]);
110 192 src[4] = vld1q(&src_rows.at(0, border_offsets.c4())[index]);
111 192 src[5] = vld1q(&src_rows.at(0, border_offsets.c5())[index]);
112 192 src[6] = vld1q(&src_rows.at(0, border_offsets.c6())[index]);
113 192 filter_.horizontal_vector_path(src, &dst_rows[index]);
114 192 });
115
// Scalar tail: shares its body with process_horizontal_borders via the
// private helper.
116 484 loop.tail([&](size_t index) {
117 176 process_horizontal_scalar(src_rows, dst_rows, border_offsets, index);
118 176 });
119 308 }
120
// Horizontal pass for a single border position: runs the scalar horizontal
// path once per channel (index ranges over [0, src_rows.channels())).
121 1848 void process_horizontal_borders(Rows<const BufferType> src_rows,
122 Rows<DestinationType> dst_rows,
123 BorderOffsets border_offsets) const {
124
4/4
✓ Branch 0 taken 1212 times.
✓ Branch 1 taken 1644 times.
✓ Branch 2 taken 636 times.
✓ Branch 3 taken 1068 times.
4560 for (size_t index = 0; index < src_rows.channels(); ++index) {
// NOTE(review): helper defined elsewhere; by its name it keeps the compiler
// from auto-vectorizing this per-channel loop -- confirm at its definition.
125 2712 disable_loop_vectorization();
126 2712 process_horizontal_scalar(src_rows, dst_rows, border_offsets, index);
127 2712 }
128 1848 }
129
130 private:
// Scalar horizontal step for one element: gathers the seven taps
// src_rows.at(0, c0())[index] .. src_rows.at(0, c6())[index] and passes them
// to the filter, which writes its result at &dst_rows[index].
131 2888 void process_horizontal_scalar(Rows<const BufferType> src_rows,
132 Rows<DestinationType> dst_rows,
133 BorderOffsets border_offsets,
134 size_t index) const {
135 2888 BufferType src[7];
136 2888 src[0] = src_rows.at(0, border_offsets.c0())[index];
137 2888 src[1] = src_rows.at(0, border_offsets.c1())[index];
138 2888 src[2] = src_rows.at(0, border_offsets.c2())[index];
139 2888 src[3] = src_rows.at(0, border_offsets.c3())[index];
140 2888 src[4] = src_rows.at(0, border_offsets.c4())[index];
141 2888 src[5] = src_rows.at(0, border_offsets.c5())[index];
142 2888 src[6] = src_rows.at(0, border_offsets.c6())[index];
143 2888 filter_.horizontal_scalar_path(src, &dst_rows[index]);
144 2888 }
145
// The filter implementation, copied in by the constructor.
146 FilterType filter_;
147 }; // end of class SeparableFilter<FilterType, 7UL>
148
149 // Shorthand for 7x7 separable filters driver type.
// Alias template: SeparableFilter7x7<F> names SeparableFilter<F, 7UL>, so the
// size argument does not have to be spelled out at the use site.
150 template <class FilterType>
151 using SeparableFilter7x7 = SeparableFilter<FilterType, 7UL>;
152
153 } // namespace KLEIDICV_TARGET_NAMESPACE
154
155 #endif // KLEIDICV_SEPARABLE_FILTER_7X7_NEON_H
156