KleidiCV Coverage Report


Directory: ./
File: kleidicv/include/kleidicv/filters/separable_filter_3x3_neon.h
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 88 88 100.0%
Functions: 44 44 100.0%
Branches: 4 4 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H
6 #define KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H
7
8 #include "kleidicv/config.h"
9 #include "kleidicv/neon.h"
10 #include "kleidicv/workspace/border_3x3.h"
11
12 namespace KLEIDICV_TARGET_NAMESPACE {
13
14 // Template for drivers of separable NxM filters.
15 template <typename FilterType, const size_t S>
16 class SeparableFilter;
17
18 // Driver for a separable 3x3 filter.
19 template <typename FilterType>
20 class SeparableFilter<FilterType, 3UL> {
21 public:
22 using SourceType = typename FilterType::SourceType;
23 using BufferType = typename FilterType::BufferType;
24 using DestinationType = typename FilterType::DestinationType;
25 using SourceVecTraits = typename neon::VecTraits<SourceType>;
26 using SourceVectorType = typename SourceVecTraits::VectorType;
27 using BufferVecTraits = typename neon::VecTraits<BufferType>;
28 using BufferVectorType = typename BufferVecTraits::VectorType;
29 using BorderInfoType =
30 typename ::KLEIDICV_TARGET_NAMESPACE::FixedBorderInfo3x3<SourceType>;
31 using BorderType = FixedBorderType;
32 using BorderOffsets = typename BorderInfoType::Offsets;
33
34 230 explicit SeparableFilter(FilterType filter) : filter_{filter} {}
35
36 static constexpr size_t margin = 1UL;
37
38 3968 void process_vertical(size_t width, Rows<const SourceType> src_rows,
39 Rows<BufferType> dst_rows,
40 BorderOffsets border_offsets) const {
41 7936 LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
42 3968 SourceVecTraits::num_lanes()};
43
44 4852 loop.unroll_twice([&](size_t index) {
45 884 auto src_0 = &src_rows.at(border_offsets.c0())[index];
46 884 auto src_1 = &src_rows.at(border_offsets.c1())[index];
47 884 auto src_2 = &src_rows.at(border_offsets.c2())[index];
48
49 884 typename SourceVecTraits::Vector2Type src_0_x2;
50 884 SourceVecTraits::load(&src_0[0], src_0_x2);
51 884 typename SourceVecTraits::Vector2Type src_1_x2;
52 884 SourceVecTraits::load(&src_1[0], src_1_x2);
53 884 typename SourceVecTraits::Vector2Type src_2_x2;
54 884 SourceVecTraits::load(&src_2[0], src_2_x2);
55
56 884 SourceVectorType src_a[3], src_b[3];
57 884 src_a[0] = src_0_x2.val[0];
58 884 src_b[0] = src_0_x2.val[1];
59 884 src_a[1] = src_1_x2.val[0];
60 884 src_b[1] = src_1_x2.val[1];
61 884 src_a[2] = src_2_x2.val[0];
62 884 src_b[2] = src_2_x2.val[1];
63
64 884 filter_.vertical_vector_path(src_a, &dst_rows[index]);
65 892 filter_.vertical_vector_path(
66 884 src_b, &dst_rows[index + SourceVecTraits::num_lanes()]);
67 884 });
68
69 5334 loop.unroll_once([&](size_t index) {
70 1366 SourceVectorType src[3];
71 1366 src[0] = vld1q(&src_rows.at(border_offsets.c0())[index]);
72 1366 src[1] = vld1q(&src_rows.at(border_offsets.c1())[index]);
73 1366 src[2] = vld1q(&src_rows.at(border_offsets.c2())[index]);
74 1366 filter_.vertical_vector_path(src, &dst_rows[index]);
75 1366 });
76
77 21560 loop.tail([&](size_t index) {
78 17592 SourceType src[3];
79 17592 src[0] = src_rows.at(border_offsets.c0())[index];
80 17592 src[1] = src_rows.at(border_offsets.c1())[index];
81 17592 src[2] = src_rows.at(border_offsets.c2())[index];
82 17592 filter_.vertical_scalar_path(src, &dst_rows[index]);
83 17592 });
84 3968 }
85
86 3968 void process_horizontal(size_t width, Rows<const BufferType> src_rows,
87 Rows<DestinationType> dst_rows,
88 BorderOffsets border_offsets) const {
89 7936 LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
90 3968 BufferVecTraits::num_lanes()};
91
92 5608 loop.unroll_twice([&](size_t index) {
93 1640 auto src_0 = &src_rows.at(0, border_offsets.c0())[index];
94 1640 auto src_1 = &src_rows.at(0, border_offsets.c1())[index];
95 1640 auto src_2 = &src_rows.at(0, border_offsets.c2())[index];
96
97 1640 typename BufferVecTraits::Vector2Type src_0_x2;
98 1640 BufferVecTraits::load(&src_0[0], src_0_x2);
99 1640 typename BufferVecTraits::Vector2Type src_1_x2;
100 1640 BufferVecTraits::load(&src_1[0], src_1_x2);
101 1640 typename BufferVecTraits::Vector2Type src_2_x2;
102 1640 BufferVecTraits::load(&src_2[0], src_2_x2);
103
104 1640 BufferVectorType src_a[3], src_b[3];
105 1640 src_a[0] = src_0_x2.val[0];
106 1640 src_b[0] = src_0_x2.val[1];
107 1640 src_a[1] = src_1_x2.val[0];
108 1640 src_b[1] = src_1_x2.val[1];
109 1640 src_a[2] = src_2_x2.val[0];
110 1640 src_b[2] = src_2_x2.val[1];
111
112 1640 filter_.horizontal_vector_path(src_a, &dst_rows[index]);
113 1648 filter_.horizontal_vector_path(
114 1640 src_b, &dst_rows[index + BufferVecTraits::num_lanes()]);
115 1640 });
116
117 6008 loop.unroll_once([&](size_t index) {
118 2040 BufferVectorType src[3];
119 2040 src[0] = vld1q(&src_rows.at(0, border_offsets.c0())[index]);
120 2040 src[1] = vld1q(&src_rows.at(0, border_offsets.c1())[index]);
121 2040 src[2] = vld1q(&src_rows.at(0, border_offsets.c2())[index]);
122 2040 filter_.horizontal_vector_path(src, &dst_rows[index]);
123 2040 });
124
125 7872 loop.tail([&](size_t index) {
126 3904 process_horizontal_scalar(src_rows, dst_rows, border_offsets, index);
127 3904 });
128 3968 }
129
130 7936 void process_horizontal_borders(Rows<const BufferType> src_rows,
131 Rows<DestinationType> dst_rows,
132 BorderOffsets border_offsets) const {
133
4/4
✓ Branch 0 taken 4064 times.
✓ Branch 1 taken 7960 times.
✓ Branch 2 taken 3872 times.
✓ Branch 3 taken 7676 times.
23572 for (size_t index = 0; index < src_rows.channels(); ++index) {
134 15636 disable_loop_vectorization();
135 15636 process_horizontal_scalar(src_rows, dst_rows, border_offsets, index);
136 15636 }
137 7936 }
138
139 private:
140 19540 void process_horizontal_scalar(Rows<const BufferType> src_rows,
141 Rows<DestinationType> dst_rows,
142 BorderOffsets border_offsets,
143 size_t index) const {
144 19540 BufferType src[3];
145 19540 src[0] = src_rows.at(0, border_offsets.c0())[index];
146 19540 src[1] = src_rows.at(0, border_offsets.c1())[index];
147 19540 src[2] = src_rows.at(0, border_offsets.c2())[index];
148 19540 filter_.horizontal_scalar_path(src, &dst_rows[index]);
149 19540 }
150
151 FilterType filter_;
152 }; // end of class SeparableFilter<FilterType, 3UL>
153
154 // Shorthand for 3x3 separable filters driver type.
155 template <class FilterType>
156 using SeparableFilter3x3 = SeparableFilter<FilterType, 3UL>;
157
158 } // namespace KLEIDICV_TARGET_NAMESPACE
159
160 #endif // KLEIDICV_SEPARABLE_FILTER_3X3_NEON_H
161