KleidiCV Coverage Report


Directory: ./
File: kleidicv/include/kleidicv/workspace/separable.h
Date: 2025-11-25 17:23:32
Exec Total Coverage
Lines: 96 96 100.0%
Functions: 93 93 100.0%
Branches: 56 56 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_WORKSPACE_SEPARABLE_H
6 #define KLEIDICV_WORKSPACE_SEPARABLE_H
7
8 #include <algorithm>
9 #include <cstddef>
10 #include <cstdlib>
11 #include <memory>
12
13 #include "kleidicv/types.h"
14
15 namespace KLEIDICV_TARGET_NAMESPACE {
16
17 // Forward declarations.
18 class SeparableFilterWorkspace;
19
20 // Deleter for SeparableFilterWorkspace instances.
21 class SeparableFilterWorkspaceDeleter {
22 public:
23 3424 void operator()(SeparableFilterWorkspace *ptr) const KLEIDICV_STREAMING {
24 3424 std::free(ptr);
25 3424 };
26 };
27
28 // Workspace for separable fixed-size filters.
29 //
30 // Theory of operation
31 //
32 // Given an NxM input matrix and a separable filter AxB = V x H, this workspace
33 // first processes N rows vertically into a separate horizontal buffer. Right
34 // after the vertical operation, the horizontal operation is applied and the
35 // result is written to the destination.
36 //
37 // Limitations
38 //
39 // 1. In-place operations are not supported.
40 // 2. The input's width and height have to be at least `filter's width - 1` and
41 // `filter's height - 1`, respectively.
42 //
43 // Example
44 //
45 // N = 2, M = 3, A = B = 3, border type replicate and 'x' is multiplication.
46 //
47 // Input: Separated filters:
48 // [ M00, M01, M02 ] V = [ V0 ] H = [H0, H1, H2 ]
49 // [ M10, M11, M12 ] [ V1 ]
50 // [ M20, M21, M22 ] [ V2 ]
51 //
52 // Buffer contents in iteration 0 after applying the vertical operation
53 // taking "replicate" border type into account:
54 //
55 // [ B0, B1, B2 ] =
56 // [ M{0, 0, 1}0 x V, M{0, 0, 1}1 x V, M{0, 0, 1}2 x V ]
57 //
58 // The horizontal operation is then semantically performed on the following
59 // input taking "replicate" border type into account:
60 //
61 // [ B0, B0, B1, B2, B2 ]
62 //
63 // The destination contents after the 0th iteration is then:
64 //
65 // [ D00, D01, D02 ] =
66 // [ B{0, 0, 1} x H, B{0, 1, 2} x H, B{1, 2, 2} x H]
67 //
68 // Handling of borders is calculated based on offsets rather than setting up
69 // suitably-sized buffers which could hold both borders and data.
70 class SeparableFilterWorkspace {
71 public:
72 // To avoid load/store penalties.
73 static constexpr size_t kAlignment = 16UL;
74
75 // Shorthand for std::unique_ptr<> holding a workspace.
76 using Pointer = std::unique_ptr<SeparableFilterWorkspace,
77 SeparableFilterWorkspaceDeleter>;
78
79 // Workspace is only constructible with create().
80 SeparableFilterWorkspace() = delete;
81
82 // Creates a workspace on the heap.
83 3436 static Pointer create(Rectangle rect, size_t channels,
84 size_t intermediate_size) KLEIDICV_STREAMING {
85 3436 size_t buffer_rows_number_of_elements = rect.width() * channels;
86 // Adding more elements because of SVE, where interleaving stores are
87 // governed by one predicate. For example, if a predicate requires 7 uint8_t
88 // elements and an algorithm performs widening to 16 bits, the resulting
89 // interleaving store will still be governed by the same predicate, thus
90 // storing 8 elements. Choosing '3' to account for svst4().
91 3436 buffer_rows_number_of_elements += 3;
92
93 6872 size_t buffer_rows_stride =
94 3436 buffer_rows_number_of_elements * intermediate_size;
95 3436 size_t buffer_rows_size = buffer_rows_stride;
96 3436 buffer_rows_size += kAlignment - 1;
97
98 // Try to allocate workspace at once.
99 6872 size_t allocation_size =
100 3436 sizeof(SeparableFilterWorkspace) + buffer_rows_size;
101 3436 void *allocation = std::malloc(allocation_size);
102 6872 auto workspace = SeparableFilterWorkspace::Pointer{
103 3436 reinterpret_cast<SeparableFilterWorkspace *>(allocation)};
104
105
2/2
✓ Branch 0 taken 3424 times.
✓ Branch 1 taken 12 times.
3436 if (!workspace) {
106 12 return workspace;
107 }
108
109 3424 auto *buffer_rows_address = &workspace->data_[0];
110 3424 buffer_rows_address = align_up(buffer_rows_address, kAlignment);
111 3424 workspace->buffer_rows_offset_ = buffer_rows_address - &workspace->data_[0];
112 3424 workspace->buffer_rows_stride_ = buffer_rows_stride;
113 3424 workspace->image_size_ = rect;
114 3424 workspace->channels_ = channels;
115 3424 workspace->intermediate_size_ = intermediate_size;
116
117 3424 return workspace;
118 3436 }
119
120 2284 size_t channels() const { return channels_; }
121 2276 Rectangle image_size() const { return image_size_; }
122 size_t intermediate_size() const { return intermediate_size_; }
123
124 // Processes rows vertically first along the full width
125 template <typename FilterType>
126 2304 void process(Rectangle rect, size_t y_begin, size_t y_end,
127 Rows<const typename FilterType::SourceType> src_rows,
128 Rows<typename FilterType::DestinationType> dst_rows,
129 size_t channels, typename FilterType::BorderType border_type,
130 FilterType filter) KLEIDICV_STREAMING {
131 // Border helper which calculates border offsets.
132 4608 typename FilterType::BorderInfoType vertical_border{rect.height(),
133 2304 border_type};
134 4608 typename FilterType::BorderInfoType horizontal_border{rect.width(),
135 2304 border_type};
136
137 // Buffer rows which hold intermediate widened data.
138 4608 auto buffer_rows = Rows{reinterpret_cast<typename FilterType::BufferType *>(
139 2304 &data_[buffer_rows_offset_]),
140 2304 buffer_rows_stride_, channels};
141
142 // Vertical processing loop.
143
16/16
✓ Branch 0 taken 740 times.
✓ Branch 1 taken 11184 times.
✓ Branch 2 taken 756 times.
✓ Branch 3 taken 13076 times.
✓ Branch 4 taken 348 times.
✓ Branch 5 taken 3496 times.
✓ Branch 6 taken 128 times.
✓ Branch 7 taken 1824 times.
✓ Branch 8 taken 128 times.
✓ Branch 9 taken 2592 times.
✓ Branch 10 taken 68 times.
✓ Branch 11 taken 152 times.
✓ Branch 12 taken 68 times.
✓ Branch 13 taken 288 times.
✓ Branch 14 taken 68 times.
✓ Branch 15 taken 424 times.
35340 for (size_t vertical_index = y_begin; vertical_index < y_end;
144 33036 ++vertical_index) {
145 // Recalculate vertical border offsets.
146 33036 auto offsets = vertical_border.offsets_with_border(vertical_index);
147 // Process in the vertical direction first.
148 66072 filter.process_vertical(rect.width(), src_rows.at(vertical_index),
149 33036 buffer_rows, offsets);
150 // Process in the horizontal direction last.
151 66072 process_horizontal(rect.width(), buffer_rows, dst_rows.at(vertical_index),
152 33036 filter, horizontal_border);
153 33036 }
154 2304 }
155
156 // Processes rows vertically first along the full width
157 template <typename FilterType>
158 52 void process_arbitrary(Rectangle rect, size_t kernel_size, size_t y_begin,
159 size_t y_end,
160 Rows<const typename FilterType::SourceType> src_rows,
161 Rows<typename FilterType::DestinationType> dst_rows,
162 size_t channels,
163 typename FilterType::BorderType /* border_type */,
164 FilterType filter) KLEIDICV_STREAMING {
165 // Buffer rows which hold intermediate widened data.
166 104 auto buffer_rows = Rows{reinterpret_cast<typename FilterType::BufferType *>(
167 52 &data_[buffer_rows_offset_]),
168 52 buffer_rows_stride_, channels};
169 52 size_t margin = kernel_size / 2;
170
171 // Process top rows, affected by border
172
2/2
✓ Branch 0 taken 240 times.
✓ Branch 1 taken 52 times.
292 for (size_t row_index = y_begin; row_index < std::max(y_begin, margin);
173 240 ++row_index) {
174 480 filter.process_arbitrary_border_vertical(rect.width(), src_rows,
175 240 row_index, buffer_rows);
176 480 filter.process_arbitrary_horizontal(rect.width(), kernel_size,
177 240 buffer_rows, dst_rows.at(row_index));
178 240 }
179
180 // Process middle rows that are not affected by any borders
181
2/2
✓ Branch 0 taken 208 times.
✓ Branch 1 taken 52 times.
260 for (size_t row_index = std::max(y_begin, margin);
182 260 row_index < std::min(y_end, rect.height() - margin); ++row_index) {
183 416 filter.process_arbitrary_vertical(rect.width(), src_rows.at(row_index),
184 208 buffer_rows);
185 416 filter.process_arbitrary_horizontal(rect.width(), kernel_size,
186 208 buffer_rows, dst_rows.at(row_index));
187 208 }
188
189 // Process bottom rows, affected by border
190
2/2
✓ Branch 0 taken 52 times.
✓ Branch 1 taken 240 times.
292 for (size_t row_index = std::min(y_end, rect.height() - margin);
191 292 row_index < y_end; ++row_index) {
192 480 filter.process_arbitrary_border_vertical(rect.width(), src_rows,
193 240 row_index, buffer_rows);
194 480 filter.process_arbitrary_horizontal(rect.width(), kernel_size,
195 240 buffer_rows, dst_rows.at(row_index));
196 240 }
197 52 }
198
199 protected:
200 template <typename FilterType>
201 33036 void process_horizontal(size_t width,
202 Rows<typename FilterType::BufferType> buffer_rows,
203 Rows<typename FilterType::DestinationType> dst_rows,
204 FilterType filter,
205 typename FilterType::BorderInfoType horizontal_border)
206 KLEIDICV_STREAMING {
207 // Margin associated with the filter.
208 33036 constexpr size_t margin = filter.margin;
209
210 // Process data affected by left border.
211 KLEIDICV_FORCE_LOOP_UNROLL
212
16/16
✓ Branch 0 taken 13856 times.
✓ Branch 1 taken 11184 times.
✓ Branch 2 taken 18368 times.
✓ Branch 3 taken 13076 times.
✓ Branch 4 taken 7800 times.
✓ Branch 5 taken 3496 times.
✓ Branch 6 taken 12768 times.
✓ Branch 7 taken 1824 times.
✓ Branch 8 taken 25920 times.
✓ Branch 9 taken 2592 times.
✓ Branch 10 taken 152 times.
✓ Branch 11 taken 152 times.
✓ Branch 12 taken 576 times.
✓ Branch 13 taken 288 times.
✓ Branch 14 taken 1272 times.
✓ Branch 15 taken 424 times.
113748 for (size_t horizontal_index = 0; horizontal_index < margin;
213 80712 ++horizontal_index) {
214 80712 auto offsets =
215 80712 horizontal_border.offsets_with_left_border(horizontal_index);
216 161424 filter.process_horizontal_borders(buffer_rows.at(0, horizontal_index),
217 80712 dst_rows.at(0, horizontal_index),
218 80712 offsets);
219 80712 }
220
221 // Process data which is not affected by any borders in bulk.
222 {
223 33036 size_t width_without_borders = width - (2 * margin);
224 33036 auto offsets = horizontal_border.offsets_without_border();
225 66072 filter.process_horizontal(width_without_borders,
226 33036 buffer_rows.at(0, margin),
227 33036 dst_rows.at(0, margin), offsets);
228 33036 }
229
230 // Process data affected by right border.
231 KLEIDICV_FORCE_LOOP_UNROLL
232
16/16
✓ Branch 0 taken 11184 times.
✓ Branch 1 taken 13856 times.
✓ Branch 2 taken 13076 times.
✓ Branch 3 taken 18368 times.
✓ Branch 4 taken 3496 times.
✓ Branch 5 taken 7800 times.
✓ Branch 6 taken 1824 times.
✓ Branch 7 taken 12768 times.
✓ Branch 8 taken 2592 times.
✓ Branch 9 taken 25920 times.
✓ Branch 10 taken 152 times.
✓ Branch 11 taken 152 times.
✓ Branch 12 taken 288 times.
✓ Branch 13 taken 576 times.
✓ Branch 14 taken 424 times.
✓ Branch 15 taken 1272 times.
113748 for (size_t horizontal_index = 0; horizontal_index < margin;
233 80712 ++horizontal_index) {
234 80712 size_t index = width - margin + horizontal_index;
235 80712 auto offsets = horizontal_border.offsets_with_right_border(index);
236 161424 filter.process_horizontal_borders(buffer_rows.at(0, index),
237 80712 dst_rows.at(0, index), offsets);
238 80712 }
239 33036 }
240
241 // Offset in bytes to the buffer rows from &data_[0].
242 size_t buffer_rows_offset_;
243 // Stride of the buffer rows.
244 size_t buffer_rows_stride_;
245
246 Rectangle image_size_;
247 size_t channels_;
248 size_t intermediate_size_;
249
250 // Workspace area begins here.
251 uint8_t data_[0] KLEIDICV_ATTR_ALIGNED(kAlignment);
252 }; // end of class SeparableFilterWorkspace
253
254 } // namespace KLEIDICV_TARGET_NAMESPACE
255
256 #endif // KLEIDICV_WORKSPACE_SEPARABLE_H
257