KleidiCV Coverage Report


Directory: ./
File: kleidicv/include/kleidicv/workspace/separable.h
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 96 96 100.0%
Functions: 93 93 100.0%
Branches: 56 56 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_WORKSPACE_SEPARABLE_H
6 #define KLEIDICV_WORKSPACE_SEPARABLE_H
7
8 #include <algorithm>
9 #include <cstddef>
10 #include <cstdlib>
11 #include <memory>
12
13 #include "kleidicv/types.h"
14
15 namespace KLEIDICV_TARGET_NAMESPACE {
16
17 // Forward declarations.
18 class SeparableFilterWorkspace;
19
20 // Deleter for SeparableFilterWorkspace instances.
//
// Releases the workspace with std::free(), which pairs with the std::malloc()
// call in SeparableFilterWorkspace::create(). No destructor is invoked; the
// memory is simply handed back to the allocator.
21 class SeparableFilterWorkspaceDeleter {
22 public:
// Frees the allocation behind 'ptr'.
23 2568 void operator()(SeparableFilterWorkspace *ptr) const KLEIDICV_STREAMING {
24 2568 std::free(ptr);
25 2568 };
26 };
27
28 // Workspace for separable fixed-size filters.
29 //
30 // Theory of operation
31 //
32 // Given an NxM input matrix and a separable filter AxB = V x H, this workspace
33 // first processes N rows vertically into a separate horizontal buffer. Right
34 // after the vertical operation, the horizontal operation is applied and the
35 // result is written to the destination.
36 //
37 // Limitations
38 //
39 // 1. In-place operations are not supported.
40 // 2. The input's width and height have to be at least `filter's width - 1` and
41 // `filter's height - 1`, respectively.
42 //
43 // Example
44 //
45 // N = 3, M = 3, A = B = 3, border type replicate and 'x' is multiplication.
46 //
47 // Input: Separated filters:
48 // [ M00, M01, M02 ] V = [ V0 ] H = [H0, H1, H2 ]
49 // [ M10, M11, M12 ] [ V1 ]
50 // [ M20, M21, M22 ] [ V2 ]
51 //
52 // Buffer contents in iteration 0 after applying the vertical operation
53 // taking "replicate" border type into account:
54 //
55 // [ B0, B1, B2 ] =
56 // [ M{0, 0, 1}0 x V, M{0, 0, 1}1 x V, M{0, 0, 1}2 x V ]
57 //
58 // The horizontal operation is then semantically performed on the following
59 // input taking "replicate" border type into account:
60 //
61 // [ B0, B0, B1, B2, B2 ]
62 //
63 // The destination contents after the 0th iteration is then:
64 //
65 // [ D00, D01, D02 ] =
66 // [ B{0, 0, 1} x H, B{0, 1, 2} x H, B{1, 2, 2} x H]
67 //
68 // Handling of borders is calculated based on offsets rather than setting up
69 // suitably-sized buffers which could hold both borders and data.
70 class SeparableFilterWorkspace {
71 public:
72 // To avoid load/store penalties.
73 static constexpr size_t kAlignment = 16UL;
74
75 // Shorthand for std::unique_ptr<> holding a workspace.
76 using Pointer = std::unique_ptr<SeparableFilterWorkspace,
77 SeparableFilterWorkspaceDeleter>;
78
79 // Workspace is only constructible with create().
80 SeparableFilterWorkspace() = delete;
81
82 // Creates a workspace on the heap.
//
// The workspace object and its intermediate row buffer share one
// std::malloc() allocation: the buffer lives in the trailing data_ area,
// aligned up to kAlignment.
//
// rect: stored as image_size_; its width determines the buffer row length.
// channels: multiplied with the width to get the number of elements in one
// buffer row; also stored as channels_.
// intermediate_size: bytes per intermediate element (it scales the element
// count into the byte size of the allocation); also stored as
// intermediate_size_.
//
// Returns an empty Pointer if the allocation fails, so callers must check the
// result before use.
//
// NOTE(review): the size arithmetic below is not checked for overflow, and
// neither rect nor channels is validated here.
83 2577 static Pointer create(Rectangle rect, size_t channels,
84 size_t intermediate_size) KLEIDICV_STREAMING {
85 2577 size_t buffer_rows_number_of_elements = rect.width() * channels;
86 // Adding more elements because of SVE, where interleaving stores are
87 // governed by one predicate. For example, if a predicate requires 7 uint8_t
88 // elements and an algorithm performs widening to 16 bits, the resulting
89 // interleaving store will still be governed by the same predicate, thus
90 // storing 8 elements. Choosing '3' to account for svst4().
91 2577 buffer_rows_number_of_elements += 3;
92
// Byte size of one buffer row. The allocation holds exactly one such row.
93 5154 size_t buffer_rows_stride =
94 2577 buffer_rows_number_of_elements * intermediate_size;
95 2577 size_t buffer_rows_size = buffer_rows_stride;
// Slack so that the buffer start can be aligned up inside the allocation.
96 2577 buffer_rows_size += kAlignment - 1;
97
98 // Try to allocate workspace at once.
99 5154 size_t allocation_size =
100 2577 sizeof(SeparableFilterWorkspace) + buffer_rows_size;
101 2577 void *allocation = std::malloc(allocation_size);
// Ownership moves into the smart pointer straight away; its deleter calls
// std::free().
102 5154 auto workspace = SeparableFilterWorkspace::Pointer{
103 2577 reinterpret_cast<SeparableFilterWorkspace *>(allocation)};
104
105
2/2
✓ Branch 0 taken 2568 times.
✓ Branch 1 taken 9 times.
// Allocation failed: hand the empty pointer back to the caller.
2577 if (!workspace) {
106 9 return workspace;
107 }
108
// No constructor runs on the malloc'ed memory (the default constructor is
// deleted), so every field is assigned by hand below.
// align_up() is declared outside this file; presumably it rounds the address
// up to the next multiple of kAlignment - confirm.
109 2568 auto *buffer_rows_address = &workspace->data_[0];
110 2568 buffer_rows_address = align_up(buffer_rows_address, kAlignment);
// Keep the aligned start as an offset relative to data_.
111 2568 workspace->buffer_rows_offset_ = buffer_rows_address - &workspace->data_[0];
112 2568 workspace->buffer_rows_stride_ = buffer_rows_stride;
113 2568 workspace->image_size_ = rect;
114 2568 workspace->channels_ = channels;
115 2568 workspace->intermediate_size_ = intermediate_size;
116
117 2568 return workspace;
118 2577 }
119
// Read-only accessors for the values that were passed to create().
120 1713 size_t channels() const { return channels_; }
121 1707 Rectangle image_size() const { return image_size_; }
122 size_t intermediate_size() const { return intermediate_size_; }
123
124 // Processes rows vertically first along the full width
//
// For every row index in the half-open range [y_begin, y_end) the vertical
// pass is run into the workspace buffer, immediately followed by the
// horizontal pass from the buffer into the matching destination row.
//
// rect: height and width feed the vertical and horizontal border helpers;
// the width is also passed to both passes.
// y_begin, y_end: first row to process and one past the last row.
// src_rows, dst_rows: source and destination rows, indexed by row.
// channels: channel count given to the buffer Rows view. Note that this
// argument is used here, not the channels_ member.
// border_type: used for both the vertical and the horizontal border helper.
// filter: provides process_vertical() and the horizontal operations.
125 template <typename FilterType>
126 1728 void process(Rectangle rect, size_t y_begin, size_t y_end,
127 Rows<const typename FilterType::SourceType> src_rows,
128 Rows<typename FilterType::DestinationType> dst_rows,
129 size_t channels, typename FilterType::BorderType border_type,
130 FilterType filter) KLEIDICV_STREAMING {
131 // Border helper which calculates border offsets.
132 3456 typename FilterType::BorderInfoType vertical_border{rect.height(),
133 1728 border_type};
134 3456 typename FilterType::BorderInfoType horizontal_border{rect.width(),
135 1728 border_type};
136
137 // Buffer rows which hold intermediate widened data.
// The view starts at the aligned offset computed in create(). The same
// buffer is reused by every iteration of the loop below.
138 3456 auto buffer_rows = Rows{reinterpret_cast<typename FilterType::BufferType *>(
139 1728 &data_[buffer_rows_offset_]),
140 1728 buffer_rows_stride_, channels};
141
142 // Vertical processing loop.
143
16/16
✓ Branch 0 taken 555 times.
✓ Branch 1 taken 8292 times.
✓ Branch 2 taken 567 times.
✓ Branch 3 taken 9759 times.
✓ Branch 4 taken 261 times.
✓ Branch 5 taken 2622 times.
✓ Branch 6 taken 96 times.
✓ Branch 7 taken 1368 times.
✓ Branch 8 taken 96 times.
✓ Branch 9 taken 1944 times.
✓ Branch 10 taken 51 times.
✓ Branch 11 taken 114 times.
✓ Branch 12 taken 51 times.
✓ Branch 13 taken 216 times.
✓ Branch 14 taken 51 times.
✓ Branch 15 taken 318 times.
26361 for (size_t vertical_index = y_begin; vertical_index < y_end;
144 24633 ++vertical_index) {
145 // Recalculate vertical border offsets.
146 24633 auto offsets = vertical_border.offsets_with_border(vertical_index);
147 // Process in the vertical direction first.
148 49266 filter.process_vertical(rect.width(), src_rows.at(vertical_index),
149 24633 buffer_rows, offsets);
150 // Process in the horizontal direction last.
151 49266 process_horizontal(rect.width(), buffer_rows, dst_rows.at(vertical_index),
152 24633 filter, horizontal_border);
153 24633 }
154 1728 }
155
156 // Processes rows vertically first along the full width
//
// Variant of process() for a kernel size that is only known at run time.
// With margin = kernel_size / 2, the requested rows are handled in three
// consecutive loops:
// top: [y_begin, max(y_begin, margin))
// middle: [max(y_begin, margin), min(y_end, rect.height() - margin))
// bottom: [min(y_end, rect.height() - margin), y_end)
// The top and bottom loops call filter.process_arbitrary_border_vertical()
// with all source rows and the row index; the middle loop calls
// filter.process_arbitrary_vertical() with just the current source row. Each
// row is then finished with filter.process_arbitrary_horizontal().
//
// The border type argument is accepted but not used in this function.
//
// NOTE(review): the upper bound of the top loop does not involve y_end, so a
// call with y_end < margin would also process rows in [y_end, margin) -
// confirm that callers never pass such a range.
// NOTE(review): if rect.height() is unsigned and smaller than margin,
// rect.height() - margin wraps around - presumably excluded by the input size
// limitation, confirm.
157 template <typename FilterType>
158 39 void process_arbitrary(Rectangle rect, size_t kernel_size, size_t y_begin,
159 size_t y_end,
160 Rows<const typename FilterType::SourceType> src_rows,
161 Rows<typename FilterType::DestinationType> dst_rows,
162 size_t channels,
163 typename FilterType::BorderType /* border_type */,
164 FilterType filter) KLEIDICV_STREAMING {
165 // Buffer rows which hold intermediate widened data.
166 78 auto buffer_rows = Rows{reinterpret_cast<typename FilterType::BufferType *>(
167 39 &data_[buffer_rows_offset_]),
168 39 buffer_rows_stride_, channels};
// Number of rows at the top and at the bottom that get border treatment.
169 39 size_t margin = kernel_size / 2;
170
171 // Process top rows, affected by border
172
2/2
✓ Branch 0 taken 180 times.
✓ Branch 1 taken 39 times.
219 for (size_t row_index = y_begin; row_index < std::max(y_begin, margin);
173 180 ++row_index) {
174 360 filter.process_arbitrary_border_vertical(rect.width(), src_rows,
175 180 row_index, buffer_rows);
176 360 filter.process_arbitrary_horizontal(rect.width(), kernel_size,
177 180 buffer_rows, dst_rows.at(row_index));
178 180 }
179
180 // Process middle rows that are not affected by any borders
181
2/2
✓ Branch 0 taken 156 times.
✓ Branch 1 taken 39 times.
195 for (size_t row_index = std::max(y_begin, margin);
182 195 row_index < std::min(y_end, rect.height() - margin); ++row_index) {
183 312 filter.process_arbitrary_vertical(rect.width(), src_rows.at(row_index),
184 156 buffer_rows);
185 312 filter.process_arbitrary_horizontal(rect.width(), kernel_size,
186 156 buffer_rows, dst_rows.at(row_index));
187 156 }
188
189 // Process bottom rows, affected by border
190
2/2
✓ Branch 0 taken 39 times.
✓ Branch 1 taken 180 times.
219 for (size_t row_index = std::min(y_end, rect.height() - margin);
191 219 row_index < y_end; ++row_index) {
192 360 filter.process_arbitrary_border_vertical(rect.width(), src_rows,
193 180 row_index, buffer_rows);
194 360 filter.process_arbitrary_horizontal(rect.width(), kernel_size,
195 180 buffer_rows, dst_rows.at(row_index));
196 180 }
197 39 }
198
199 protected:
// Runs the horizontal pass for one row: reads from the buffer row and writes
// to the destination row.
//
// With margin = filter.margin the columns are handled in three steps:
// left border: [0, margin), one call per column
// bulk: width - 2 * margin columns starting at column margin
// right border: [width - margin, width), one call per column
//
// width: number of columns passed in by process() (rect.width()).
// buffer_rows: intermediate data to read.
// dst_rows: destination row to write.
// filter: provides margin, process_horizontal_borders() and
// process_horizontal().
// horizontal_border: supplies the offsets for each of the three steps.
//
// NOTE(review): width and margin are both size_t, so width - 2 * margin and
// width - margin wrap around for inputs narrower than that - presumably
// excluded by the input size limitation, confirm.
200 template <typename FilterType>
201 24633 void process_horizontal(size_t width,
202 Rows<typename FilterType::BufferType> buffer_rows,
203 Rows<typename FilterType::DestinationType> dst_rows,
204 FilterType filter,
205 typename FilterType::BorderInfoType horizontal_border)
206 KLEIDICV_STREAMING {
207 // Margin associated with the filter.
// Used in a constant expression although 'filter' is a function parameter,
// so margin is presumably a static constexpr member of FilterType - confirm.
208 24633 constexpr size_t margin = filter.margin;
209
210 // Process data affected by left border.
211 KLEIDICV_FORCE_LOOP_UNROLL
212
16/16
✓ Branch 0 taken 10296 times.
✓ Branch 1 taken 8292 times.
✓ Branch 2 taken 13728 times.
✓ Branch 3 taken 9759 times.
✓ Branch 4 taken 5850 times.
✓ Branch 5 taken 2622 times.
✓ Branch 6 taken 9576 times.
✓ Branch 7 taken 1368 times.
✓ Branch 8 taken 19440 times.
✓ Branch 9 taken 1944 times.
✓ Branch 10 taken 114 times.
✓ Branch 11 taken 114 times.
✓ Branch 12 taken 432 times.
✓ Branch 13 taken 216 times.
✓ Branch 14 taken 954 times.
✓ Branch 15 taken 318 times.
85023 for (size_t horizontal_index = 0; horizontal_index < margin;
213 60390 ++horizontal_index) {
214 60390 auto offsets =
215 60390 horizontal_border.offsets_with_left_border(horizontal_index);
216 120780 filter.process_horizontal_borders(buffer_rows.at(0, horizontal_index),
217 60390 dst_rows.at(0, horizontal_index),
218 60390 offsets);
219 60390 }
220
221 // Process data which is not affected by any borders in bulk.
222 {
223 24633 size_t width_without_borders = width - (2 * margin);
224 24633 auto offsets = horizontal_border.offsets_without_border();
225 49266 filter.process_horizontal(width_without_borders,
226 24633 buffer_rows.at(0, margin),
227 24633 dst_rows.at(0, margin), offsets);
228 24633 }
229
230 // Process data affected by right border.
231 KLEIDICV_FORCE_LOOP_UNROLL
232
16/16
✓ Branch 0 taken 8292 times.
✓ Branch 1 taken 10296 times.
✓ Branch 2 taken 9759 times.
✓ Branch 3 taken 13728 times.
✓ Branch 4 taken 2622 times.
✓ Branch 5 taken 5850 times.
✓ Branch 6 taken 1368 times.
✓ Branch 7 taken 9576 times.
✓ Branch 8 taken 1944 times.
✓ Branch 9 taken 19440 times.
✓ Branch 10 taken 114 times.
✓ Branch 11 taken 114 times.
✓ Branch 12 taken 216 times.
✓ Branch 13 taken 432 times.
✓ Branch 14 taken 318 times.
✓ Branch 15 taken 954 times.
85023 for (size_t horizontal_index = 0; horizontal_index < margin;
233 60390 ++horizontal_index) {
// Column index of the current right-border element.
234 60390 size_t index = width - margin + horizontal_index;
235 60390 auto offsets = horizontal_border.offsets_with_right_border(index);
236 120780 filter.process_horizontal_borders(buffer_rows.at(0, index),
237 60390 dst_rows.at(0, index), offsets);
238 60390 }
239 24633 }
240
// None of these fields has an initializer; create() assigns all of them
// after allocating the workspace.
241 // Offset in bytes to the buffer rows from &data_[0].
242 size_t buffer_rows_offset_;
243 // Stride of the buffer rows.
244 size_t buffer_rows_stride_;
245
// Values passed to create(), returned by the accessors of the same names.
246 Rectangle image_size_;
247 size_t channels_;
248 size_t intermediate_size_;
249
250 // Workspace area begins here.
// Zero-length trailing array (a compiler extension, not standard C++). The
// storage behind it is the extra space that create() allocates after
// sizeof(SeparableFilterWorkspace).
251 uint8_t data_[0] KLEIDICV_ATTR_ALIGNED(kAlignment);
252 }; // end of class SeparableFilterWorkspace
253
254 } // namespace KLEIDICV_TARGET_NAMESPACE
255
256 #endif // KLEIDICV_WORKSPACE_SEPARABLE_H
257