Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #ifndef KLEIDICV_WORKSPACE_SEPARABLE_H | ||
6 | #define KLEIDICV_WORKSPACE_SEPARABLE_H | ||
7 | |||
8 | #include <algorithm> | ||
9 | #include <cstddef> | ||
10 | #include <cstdlib> | ||
11 | #include <memory> | ||
12 | |||
13 | #include "kleidicv/types.h" | ||
14 | |||
15 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
16 | |||
17 | // Forward declarations. | ||
18 | class SeparableFilterWorkspace; | ||
19 | |||
20 | // Deleter for SeparableFilterWorkspace instances. Releases the memory with std::free(), matching the std::malloc() call in SeparableFilterWorkspace::create(); no destructor is run. | ||
21 | class SeparableFilterWorkspaceDeleter { | ||
22 | public: | ||
23 | 2568 | void operator()(SeparableFilterWorkspace *ptr) const KLEIDICV_STREAMING { | |
24 | 2568 | std::free(ptr); | |
25 | 2568 | }; | |
26 | }; | ||
27 | |||
28 | // Workspace for separable fixed-size filters. | ||
29 | // | ||
30 | // Theory of operation | ||
31 | // | ||
32 | // Given an NxM input matrix and a separable filter AxB = V x H, this workspace | ||
33 | // first processes N rows vertically into a separate horizontal buffer. Right | ||
34 | // after the vertical operation, the horizontal operation is applied and the | ||
35 | // result is written to the destination. | ||
36 | // | ||
37 | // Limitations | ||
38 | // | ||
39 | // 1. In-place operations are not supported. | ||
40 | // 2. The input's width and height have to be at least `filter's width - 1` and | ||
41 | // `filter's height - 1`, respectively. | ||
42 | // | ||
43 | // Example | ||
44 | // | ||
45 | // N = 3, M = 3, A = B = 3, border type replicate and 'x' is multiplication. | ||
46 | // | ||
47 | // Input: Separated filters: | ||
48 | // [ M00, M01, M02 ] V = [ V0 ] H = [H0, H1, H2 ] | ||
49 | // [ M10, M11, M12 ] [ V1 ] | ||
50 | // [ M20, M21, M22 ] [ V2 ] | ||
51 | // | ||
52 | // Buffer contents in iteration 0 after applying the vertical operation | ||
53 | // taking "replicate" border type into account: | ||
54 | // | ||
55 | // [ B0, B1, B2 ] = | ||
56 | // [ M{0, 0, 1}0 x V, M{0, 0, 1}1 x V, M{0, 0, 1}2 x V ] | ||
57 | // | ||
58 | // The horizontal operation is then semantically performed on the following | ||
59 | // input taking "replicate" border type into account: | ||
60 | // | ||
61 | // [ B0, B0, B1, B2, B2 ] | ||
62 | // | ||
63 | // The destination contents after the 0th iteration is then: | ||
64 | // | ||
65 | // [ D00, D01, D02 ] = | ||
66 | // [ B{0, 0, 1} x H, B{0, 1, 2} x H, B{1, 2, 2} x H] | ||
67 | // | ||
68 | // Handling of borders is calculated based on offsets rather than setting up | ||
69 | // suitably-sized buffers which could hold both borders and data. | ||
70 | class SeparableFilterWorkspace { | ||
71 | public: | ||
72 | // Alignment in bytes applied to the start of the buffer rows, to avoid load/store penalties. | ||
73 | static constexpr size_t kAlignment = 16UL; | ||
74 | |||
75 | // Shorthand for std::unique_ptr<> holding a workspace; released through SeparableFilterWorkspaceDeleter. | ||
76 | using Pointer = std::unique_ptr<SeparableFilterWorkspace, | ||
77 | SeparableFilterWorkspaceDeleter>; | ||
78 | |||
79 | // Workspace is only constructible with create(). | ||
80 | SeparableFilterWorkspace() = delete; | ||
81 | |||
82 | // Creates a workspace on the heap with a single std::malloc() call. Returns an empty Pointer if the allocation fails. | ||
83 | 2577 | static Pointer create(Rectangle rect, size_t channels, | |
84 | size_t intermediate_size) KLEIDICV_STREAMING { | ||
85 | 2577 | size_t buffer_rows_number_of_elements = rect.width() * channels; | |
86 | // Adding more elements because of SVE, where interleaving stores are | ||
87 | // governed by one predicate. For example, if a predicate requires 7 uint8_t | ||
88 | // elements and an algorithm performs widening to 16 bits, the resulting | ||
89 | // interleaving store will still be governed by the same predicate, thus | ||
90 | // storing 8 elements. Choosing '3' to account for svst4(). | ||
91 | 2577 | buffer_rows_number_of_elements += 3; | |
92 | |||
93 | 5154 | size_t buffer_rows_stride = | |
94 | 2577 | buffer_rows_number_of_elements * intermediate_size; | |
95 | 2577 | size_t buffer_rows_size = buffer_rows_stride; | |
96 | 2577 | buffer_rows_size += kAlignment - 1; | |
97 | |||
98 | // Allocate the workspace object and its buffer rows in one block. NOTE(review): the size arithmetic above is unchecked - confirm callers bound width, channels and intermediate_size. | ||
99 | 5154 | size_t allocation_size = | |
100 | 2577 | sizeof(SeparableFilterWorkspace) + buffer_rows_size; | |
101 | 2577 | void *allocation = std::malloc(allocation_size); | |
102 | 5154 | auto workspace = SeparableFilterWorkspace::Pointer{ | |
103 | 2577 | reinterpret_cast<SeparableFilterWorkspace *>(allocation)}; | |
104 | |||
105 |
2/2✓ Branch 0 taken 2568 times.
✓ Branch 1 taken 9 times.
|
2577 | if (!workspace) { |
106 | 9 | return workspace; | |
107 | } | ||
108 | |||
109 | 2568 | auto *buffer_rows_address = &workspace->data_[0]; | |
110 | 2568 | buffer_rows_address = align_up(buffer_rows_address, kAlignment); | |
111 | 2568 | workspace->buffer_rows_offset_ = buffer_rows_address - &workspace->data_[0]; | |
112 | 2568 | workspace->buffer_rows_stride_ = buffer_rows_stride; | |
113 | 2568 | workspace->image_size_ = rect; | |
114 | 2568 | workspace->channels_ = channels; | |
115 | 2568 | workspace->intermediate_size_ = intermediate_size; | |
116 | |||
117 | 2568 | return workspace; | |
118 | 2577 | } | |
119 | |||
120 | 1713 | size_t channels() const { return channels_; } | |
121 | 1707 | Rectangle image_size() const { return image_size_; } | |
122 | size_t intermediate_size() const { return intermediate_size_; } | ||
123 | |||
124 | // For each row in [y_begin, y_end): runs the vertical pass into the buffer rows, then the horizontal pass into the matching destination row. | ||
125 | template <typename FilterType> | ||
126 | 1728 | void process(Rectangle rect, size_t y_begin, size_t y_end, | |
127 | Rows<const typename FilterType::SourceType> src_rows, | ||
128 | Rows<typename FilterType::DestinationType> dst_rows, | ||
129 | size_t channels, typename FilterType::BorderType border_type, | ||
130 | FilterType filter) KLEIDICV_STREAMING { | ||
131 | // Border helpers which calculate border offsets, one per direction. | ||
132 | 3456 | typename FilterType::BorderInfoType vertical_border{rect.height(), | |
133 | 1728 | border_type}; | |
134 | 3456 | typename FilterType::BorderInfoType horizontal_border{rect.width(), | |
135 | 1728 | border_type}; | |
136 | |||
137 | // Buffer rows which hold intermediate widened data. | ||
138 | 3456 | auto buffer_rows = Rows{reinterpret_cast<typename FilterType::BufferType *>( | |
139 | 1728 | &data_[buffer_rows_offset_]), | |
140 | 1728 | buffer_rows_stride_, channels}; | |
141 | |||
142 | // Vertical processing loop. | ||
143 |
16/16✓ Branch 0 taken 555 times.
✓ Branch 1 taken 8292 times.
✓ Branch 2 taken 567 times.
✓ Branch 3 taken 9759 times.
✓ Branch 4 taken 261 times.
✓ Branch 5 taken 2622 times.
✓ Branch 6 taken 96 times.
✓ Branch 7 taken 1368 times.
✓ Branch 8 taken 96 times.
✓ Branch 9 taken 1944 times.
✓ Branch 10 taken 51 times.
✓ Branch 11 taken 114 times.
✓ Branch 12 taken 51 times.
✓ Branch 13 taken 216 times.
✓ Branch 14 taken 51 times.
✓ Branch 15 taken 318 times.
|
26361 | for (size_t vertical_index = y_begin; vertical_index < y_end; |
144 | 24633 | ++vertical_index) { | |
145 | // Recalculate vertical border offsets. | ||
146 | 24633 | auto offsets = vertical_border.offsets_with_border(vertical_index); | |
147 | // Process in the vertical direction first. | ||
148 | 49266 | filter.process_vertical(rect.width(), src_rows.at(vertical_index), | |
149 | 24633 | buffer_rows, offsets); | |
150 | // Process in the horizontal direction last. | ||
151 | 49266 | process_horizontal(rect.width(), buffer_rows, dst_rows.at(vertical_index), | |
152 | 24633 | filter, horizontal_border); | |
153 | 24633 | } | |
154 | 1728 | } | |
155 | |||
156 | // Variant taking a runtime kernel_size (margin = kernel_size / 2): rows inside the top or bottom margin go through process_arbitrary_border_vertical(), the rest through process_arbitrary_vertical(). The border_type argument is accepted but not used here. | ||
157 | template <typename FilterType> | ||
158 | 39 | void process_arbitrary(Rectangle rect, size_t kernel_size, size_t y_begin, | |
159 | size_t y_end, | ||
160 | Rows<const typename FilterType::SourceType> src_rows, | ||
161 | Rows<typename FilterType::DestinationType> dst_rows, | ||
162 | size_t channels, | ||
163 | typename FilterType::BorderType /* border_type */, | ||
164 | FilterType filter) KLEIDICV_STREAMING { | ||
165 | // Buffer rows which hold intermediate widened data. | ||
166 | 78 | auto buffer_rows = Rows{reinterpret_cast<typename FilterType::BufferType *>( | |
167 | 39 | &data_[buffer_rows_offset_]), | |
168 | 39 | buffer_rows_stride_, channels}; | |
169 | 39 | size_t margin = kernel_size / 2; | |
170 | |||
171 | // Process top rows, affected by border. NOTE(review): the loop bound is max(y_begin, margin) and does not consider y_end - confirm callers never pass y_end < margin. | ||
172 |
2/2✓ Branch 0 taken 180 times.
✓ Branch 1 taken 39 times.
|
219 | for (size_t row_index = y_begin; row_index < std::max(y_begin, margin); |
173 | 180 | ++row_index) { | |
174 | 360 | filter.process_arbitrary_border_vertical(rect.width(), src_rows, | |
175 | 180 | row_index, buffer_rows); | |
176 | 360 | filter.process_arbitrary_horizontal(rect.width(), kernel_size, | |
177 | 180 | buffer_rows, dst_rows.at(row_index)); | |
178 | 180 | } | |
179 | |||
180 | // Process middle rows that are not affected by any borders. Assumes rect.height() >= margin so the subtraction cannot wrap - TODO confirm. | ||
181 |
2/2✓ Branch 0 taken 156 times.
✓ Branch 1 taken 39 times.
|
195 | for (size_t row_index = std::max(y_begin, margin); |
182 | 195 | row_index < std::min(y_end, rect.height() - margin); ++row_index) { | |
183 | 312 | filter.process_arbitrary_vertical(rect.width(), src_rows.at(row_index), | |
184 | 156 | buffer_rows); | |
185 | 312 | filter.process_arbitrary_horizontal(rect.width(), kernel_size, | |
186 | 156 | buffer_rows, dst_rows.at(row_index)); | |
187 | 156 | } | |
188 | |||
189 | // Process bottom rows, affected by border | ||
190 |
2/2✓ Branch 0 taken 39 times.
✓ Branch 1 taken 180 times.
|
219 | for (size_t row_index = std::min(y_end, rect.height() - margin); |
191 | 219 | row_index < y_end; ++row_index) { | |
192 | 360 | filter.process_arbitrary_border_vertical(rect.width(), src_rows, | |
193 | 180 | row_index, buffer_rows); | |
194 | 360 | filter.process_arbitrary_horizontal(rect.width(), kernel_size, | |
195 | 180 | buffer_rows, dst_rows.at(row_index)); | |
196 | 180 | } | |
197 | 39 | } | |
198 | |||
199 | protected: | ||
200 | template <typename FilterType> | ||
201 | 24633 | void process_horizontal(size_t width, | |
202 | Rows<typename FilterType::BufferType> buffer_rows, | ||
203 | Rows<typename FilterType::DestinationType> dst_rows, | ||
204 | FilterType filter, | ||
205 | typename FilterType::BorderInfoType horizontal_border) | ||
206 | KLEIDICV_STREAMING { | ||
207 | // Margin associated with the filter. | ||
208 | 24633 | constexpr size_t margin = filter.margin; | |
209 | |||
210 | // Process data affected by left border. | ||
211 | KLEIDICV_FORCE_LOOP_UNROLL | ||
212 |
16/16✓ Branch 0 taken 10296 times.
✓ Branch 1 taken 8292 times.
✓ Branch 2 taken 13728 times.
✓ Branch 3 taken 9759 times.
✓ Branch 4 taken 5850 times.
✓ Branch 5 taken 2622 times.
✓ Branch 6 taken 9576 times.
✓ Branch 7 taken 1368 times.
✓ Branch 8 taken 19440 times.
✓ Branch 9 taken 1944 times.
✓ Branch 10 taken 114 times.
✓ Branch 11 taken 114 times.
✓ Branch 12 taken 432 times.
✓ Branch 13 taken 216 times.
✓ Branch 14 taken 954 times.
✓ Branch 15 taken 318 times.
|
85023 | for (size_t horizontal_index = 0; horizontal_index < margin; |
213 | 60390 | ++horizontal_index) { | |
214 | 60390 | auto offsets = | |
215 | 60390 | horizontal_border.offsets_with_left_border(horizontal_index); | |
216 | 120780 | filter.process_horizontal_borders(buffer_rows.at(0, horizontal_index), | |
217 | 60390 | dst_rows.at(0, horizontal_index), | |
218 | 60390 | offsets); | |
219 | 60390 | } | |
220 | |||
221 | // Process data which is not affected by any borders in bulk. Assumes width >= 2 * margin so the subtraction cannot wrap - TODO confirm. | ||
222 | { | ||
223 | 24633 | size_t width_without_borders = width - (2 * margin); | |
224 | 24633 | auto offsets = horizontal_border.offsets_without_border(); | |
225 | 49266 | filter.process_horizontal(width_without_borders, | |
226 | 24633 | buffer_rows.at(0, margin), | |
227 | 24633 | dst_rows.at(0, margin), offsets); | |
228 | 24633 | } | |
229 | |||
230 | // Process data affected by right border. | ||
231 | KLEIDICV_FORCE_LOOP_UNROLL | ||
232 |
16/16✓ Branch 0 taken 8292 times.
✓ Branch 1 taken 10296 times.
✓ Branch 2 taken 9759 times.
✓ Branch 3 taken 13728 times.
✓ Branch 4 taken 2622 times.
✓ Branch 5 taken 5850 times.
✓ Branch 6 taken 1368 times.
✓ Branch 7 taken 9576 times.
✓ Branch 8 taken 1944 times.
✓ Branch 9 taken 19440 times.
✓ Branch 10 taken 114 times.
✓ Branch 11 taken 114 times.
✓ Branch 12 taken 216 times.
✓ Branch 13 taken 432 times.
✓ Branch 14 taken 318 times.
✓ Branch 15 taken 954 times.
|
85023 | for (size_t horizontal_index = 0; horizontal_index < margin; |
233 | 60390 | ++horizontal_index) { | |
234 | 60390 | size_t index = width - margin + horizontal_index; | |
235 | 60390 | auto offsets = horizontal_border.offsets_with_right_border(index); | |
236 | 120780 | filter.process_horizontal_borders(buffer_rows.at(0, index), | |
237 | 60390 | dst_rows.at(0, index), offsets); | |
238 | 60390 | } | |
239 | 24633 | } | |
240 | |||
241 | // Offset in bytes to the buffer rows from &data_[0]. | ||
242 | size_t buffer_rows_offset_; | ||
243 | // Stride of the buffer rows in bytes, as computed in create(). | ||
244 | size_t buffer_rows_stride_; | ||
245 | |||
246 | Rectangle image_size_; | ||
247 | size_t channels_; | ||
248 | size_t intermediate_size_; | ||
249 | |||
250 | // Workspace area begins here. | ||
251 | uint8_t data_[0] KLEIDICV_ATTR_ALIGNED(kAlignment); | ||
252 | }; // end of class SeparableFilterWorkspace | ||
253 | |||
254 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
255 | |||
256 | #endif // KLEIDICV_WORKSPACE_SEPARABLE_H | ||
257 |