| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #ifndef KLEIDICV_WORKSPACE_SEPARABLE_H | ||
| 6 | #define KLEIDICV_WORKSPACE_SEPARABLE_H | ||
| 7 | |||
| 8 | #include <algorithm> | ||
| 9 | #include <cstddef> | ||
| 10 | #include <cstdlib> | ||
| 11 | #include <memory> | ||
| 12 | |||
| 13 | #include "kleidicv/types.h" | ||
| 14 | |||
| 15 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
| 16 | |||
| 17 | // Forward declarations. | ||
| 18 | class SeparableFilterWorkspace; | ||
| 19 | |||
| 20 | // Deleter for SeparableFilterWorkspace instances. | ||
| 21 | class SeparableFilterWorkspaceDeleter { | ||
| 22 | public: | ||
| 23 | 3424 | void operator()(SeparableFilterWorkspace *ptr) const KLEIDICV_STREAMING { | |
| 24 | 3424 | std::free(ptr); | |
| 25 | 3424 | }; | |
| 26 | }; | ||
| 27 | |||
| 28 | // Workspace for separable fixed-size filters. | ||
| 29 | // | ||
| 30 | // Theory of operation | ||
| 31 | // | ||
| 32 | // Given an NxM input matrix and a separable filter AxB = V x H, this workspace | ||
| 33 | // first processes N rows vertically into a separate horizontal buffer. Right | ||
| 34 | // after the vertical operation, the horizontal operation is applied and the | ||
| 35 | // result is written to the destination. | ||
| 36 | // | ||
| 37 | // Limitations | ||
| 38 | // | ||
| 39 | // 1. In-place operations are not supported. | ||
| 40 | // 2. The input's width and height have to be at least `filter's width - 1` and | ||
| 41 | // `filter's height - 1`, respectively. | ||
| 42 | // | ||
| 43 | // Example | ||
| 44 | // | ||
| 45 | // N = 2, M = 3, A = B = 3, border type replicate and 'x' is multiplication. | ||
| 46 | // | ||
| 47 | // Input: Separated filters: | ||
| 48 | // [ M00, M01, M02 ] V = [ V0 ] H = [H0, H1, H2 ] | ||
| 49 | // [ M10, M11, M12 ] [ V1 ] | ||
| 50 | // [ M20, M21, M22 ] [ V2 ] | ||
| 51 | // | ||
| 52 | // Buffer contents in iteration 0 after applying the vertical operation | ||
| 53 | // taking "replicate" border type into account: | ||
| 54 | // | ||
| 55 | // [ B0, B1, B2 ] = | ||
| 56 | // [ M{0, 0, 1}0 x V, M{0, 0, 1}1 x V, M{0, 0, 1}2 x V ] | ||
| 57 | // | ||
| 58 | // The horizontal operation is then semantically performed on the following | ||
| 59 | // input taking "replicate" border type into account: | ||
| 60 | // | ||
| 61 | // [ B0, B0, B1, B2, B2 ] | ||
| 62 | // | ||
| 63 | // The destination contents after the 0th iteration is then: | ||
| 64 | // | ||
| 65 | // [ D00, D01, D02 ] = | ||
| 66 | // [ B{0, 0, 1} x H, B{0, 1, 2} x H, B{1, 2, 2} x H] | ||
| 67 | // | ||
| 68 | // Handling of borders is calculated based on offsets rather than setting up | ||
| 69 | // suitably-sized buffers which could hold both borders and data. | ||
| 70 | class SeparableFilterWorkspace { | ||
| 71 | public: | ||
| 72 | // To avoid load/store penalties. | ||
| 73 | static constexpr size_t kAlignment = 16UL; | ||
| 74 | |||
| 75 | // Shorthand for std::unique_ptr<> holding a workspace. | ||
| 76 | using Pointer = std::unique_ptr<SeparableFilterWorkspace, | ||
| 77 | SeparableFilterWorkspaceDeleter>; | ||
| 78 | |||
| 79 | // Workspace is only constructible with create(). | ||
| 80 | SeparableFilterWorkspace() = delete; | ||
| 81 | |||
| 82 | // Creates a workspace on the heap. | ||
| 83 | 3436 | static Pointer create(Rectangle rect, size_t channels, | |
| 84 | size_t intermediate_size) KLEIDICV_STREAMING { | ||
| 85 | 3436 | size_t buffer_rows_number_of_elements = rect.width() * channels; | |
| 86 | // Adding more elements because of SVE, where interleaving stores are | ||
| 87 | // governed by one predicate. For example, if a predicate requires 7 uint8_t | ||
| 88 | // elements and an algorithm performs widening to 16 bits, the resulting | ||
| 89 | // interleaving store will still be governed by the same predicate, thus | ||
| 90 | // storing 8 elements. Choosing '3' to account for svst4(). | ||
| 91 | 3436 | buffer_rows_number_of_elements += 3; | |
| 92 | |||
| 93 | 6872 | size_t buffer_rows_stride = | |
| 94 | 3436 | buffer_rows_number_of_elements * intermediate_size; | |
| 95 | 3436 | size_t buffer_rows_size = buffer_rows_stride; | |
| 96 | 3436 | buffer_rows_size += kAlignment - 1; | |
| 97 | |||
| 98 | // Try to allocate workspace at once. | ||
| 99 | 6872 | size_t allocation_size = | |
| 100 | 3436 | sizeof(SeparableFilterWorkspace) + buffer_rows_size; | |
| 101 | 3436 | void *allocation = std::malloc(allocation_size); | |
| 102 | 6872 | auto workspace = SeparableFilterWorkspace::Pointer{ | |
| 103 | 3436 | reinterpret_cast<SeparableFilterWorkspace *>(allocation)}; | |
| 104 | |||
| 105 |
2/2✓ Branch 0 taken 3424 times.
✓ Branch 1 taken 12 times.
|
3436 | if (!workspace) { |
| 106 | 12 | return workspace; | |
| 107 | } | ||
| 108 | |||
| 109 | 3424 | auto *buffer_rows_address = &workspace->data_[0]; | |
| 110 | 3424 | buffer_rows_address = align_up(buffer_rows_address, kAlignment); | |
| 111 | 3424 | workspace->buffer_rows_offset_ = buffer_rows_address - &workspace->data_[0]; | |
| 112 | 3424 | workspace->buffer_rows_stride_ = buffer_rows_stride; | |
| 113 | 3424 | workspace->image_size_ = rect; | |
| 114 | 3424 | workspace->channels_ = channels; | |
| 115 | 3424 | workspace->intermediate_size_ = intermediate_size; | |
| 116 | |||
| 117 | 3424 | return workspace; | |
| 118 | 3436 | } | |
| 119 | |||
| 120 | 2284 | size_t channels() const { return channels_; } | |
| 121 | 2276 | Rectangle image_size() const { return image_size_; } | |
| 122 | size_t intermediate_size() const { return intermediate_size_; } | ||
| 123 | |||
| 124 | // Processes rows vertically first along the full width | ||
| 125 | template <typename FilterType> | ||
| 126 | 2304 | void process(Rectangle rect, size_t y_begin, size_t y_end, | |
| 127 | Rows<const typename FilterType::SourceType> src_rows, | ||
| 128 | Rows<typename FilterType::DestinationType> dst_rows, | ||
| 129 | size_t channels, typename FilterType::BorderType border_type, | ||
| 130 | FilterType filter) KLEIDICV_STREAMING { | ||
| 131 | // Border helper which calculates border offsets. | ||
| 132 | 4608 | typename FilterType::BorderInfoType vertical_border{rect.height(), | |
| 133 | 2304 | border_type}; | |
| 134 | 4608 | typename FilterType::BorderInfoType horizontal_border{rect.width(), | |
| 135 | 2304 | border_type}; | |
| 136 | |||
| 137 | // Buffer rows which hold intermediate widened data. | ||
| 138 | 4608 | auto buffer_rows = Rows{reinterpret_cast<typename FilterType::BufferType *>( | |
| 139 | 2304 | &data_[buffer_rows_offset_]), | |
| 140 | 2304 | buffer_rows_stride_, channels}; | |
| 141 | |||
| 142 | // Vertical processing loop. | ||
| 143 |
16/16✓ Branch 0 taken 740 times.
✓ Branch 1 taken 11184 times.
✓ Branch 2 taken 756 times.
✓ Branch 3 taken 13076 times.
✓ Branch 4 taken 348 times.
✓ Branch 5 taken 3496 times.
✓ Branch 6 taken 128 times.
✓ Branch 7 taken 1824 times.
✓ Branch 8 taken 128 times.
✓ Branch 9 taken 2592 times.
✓ Branch 10 taken 68 times.
✓ Branch 11 taken 152 times.
✓ Branch 12 taken 68 times.
✓ Branch 13 taken 288 times.
✓ Branch 14 taken 68 times.
✓ Branch 15 taken 424 times.
|
35340 | for (size_t vertical_index = y_begin; vertical_index < y_end; |
| 144 | 33036 | ++vertical_index) { | |
| 145 | // Recalculate vertical border offsets. | ||
| 146 | 33036 | auto offsets = vertical_border.offsets_with_border(vertical_index); | |
| 147 | // Process in the vertical direction first. | ||
| 148 | 66072 | filter.process_vertical(rect.width(), src_rows.at(vertical_index), | |
| 149 | 33036 | buffer_rows, offsets); | |
| 150 | // Process in the horizontal direction last. | ||
| 151 | 66072 | process_horizontal(rect.width(), buffer_rows, dst_rows.at(vertical_index), | |
| 152 | 33036 | filter, horizontal_border); | |
| 153 | 33036 | } | |
| 154 | 2304 | } | |
| 155 | |||
| 156 | // Processes rows vertically first along the full width | ||
| 157 | template <typename FilterType> | ||
| 158 | 52 | void process_arbitrary(Rectangle rect, size_t kernel_size, size_t y_begin, | |
| 159 | size_t y_end, | ||
| 160 | Rows<const typename FilterType::SourceType> src_rows, | ||
| 161 | Rows<typename FilterType::DestinationType> dst_rows, | ||
| 162 | size_t channels, | ||
| 163 | typename FilterType::BorderType /* border_type */, | ||
| 164 | FilterType filter) KLEIDICV_STREAMING { | ||
| 165 | // Buffer rows which hold intermediate widened data. | ||
| 166 | 104 | auto buffer_rows = Rows{reinterpret_cast<typename FilterType::BufferType *>( | |
| 167 | 52 | &data_[buffer_rows_offset_]), | |
| 168 | 52 | buffer_rows_stride_, channels}; | |
| 169 | 52 | size_t margin = kernel_size / 2; | |
| 170 | |||
| 171 | // Process top rows, affected by border | ||
| 172 |
2/2✓ Branch 0 taken 240 times.
✓ Branch 1 taken 52 times.
|
292 | for (size_t row_index = y_begin; row_index < std::max(y_begin, margin); |
| 173 | 240 | ++row_index) { | |
| 174 | 480 | filter.process_arbitrary_border_vertical(rect.width(), src_rows, | |
| 175 | 240 | row_index, buffer_rows); | |
| 176 | 480 | filter.process_arbitrary_horizontal(rect.width(), kernel_size, | |
| 177 | 240 | buffer_rows, dst_rows.at(row_index)); | |
| 178 | 240 | } | |
| 179 | |||
| 180 | // Process middle rows that are not affected by any borders | ||
| 181 |
2/2✓ Branch 0 taken 208 times.
✓ Branch 1 taken 52 times.
|
260 | for (size_t row_index = std::max(y_begin, margin); |
| 182 | 260 | row_index < std::min(y_end, rect.height() - margin); ++row_index) { | |
| 183 | 416 | filter.process_arbitrary_vertical(rect.width(), src_rows.at(row_index), | |
| 184 | 208 | buffer_rows); | |
| 185 | 416 | filter.process_arbitrary_horizontal(rect.width(), kernel_size, | |
| 186 | 208 | buffer_rows, dst_rows.at(row_index)); | |
| 187 | 208 | } | |
| 188 | |||
| 189 | // Process bottom rows, affected by border | ||
| 190 |
2/2✓ Branch 0 taken 52 times.
✓ Branch 1 taken 240 times.
|
292 | for (size_t row_index = std::min(y_end, rect.height() - margin); |
| 191 | 292 | row_index < y_end; ++row_index) { | |
| 192 | 480 | filter.process_arbitrary_border_vertical(rect.width(), src_rows, | |
| 193 | 240 | row_index, buffer_rows); | |
| 194 | 480 | filter.process_arbitrary_horizontal(rect.width(), kernel_size, | |
| 195 | 240 | buffer_rows, dst_rows.at(row_index)); | |
| 196 | 240 | } | |
| 197 | 52 | } | |
| 198 | |||
| 199 | protected: | ||
| 200 | template <typename FilterType> | ||
| 201 | 33036 | void process_horizontal(size_t width, | |
| 202 | Rows<typename FilterType::BufferType> buffer_rows, | ||
| 203 | Rows<typename FilterType::DestinationType> dst_rows, | ||
| 204 | FilterType filter, | ||
| 205 | typename FilterType::BorderInfoType horizontal_border) | ||
| 206 | KLEIDICV_STREAMING { | ||
| 207 | // Margin associated with the filter. | ||
| 208 | 33036 | constexpr size_t margin = filter.margin; | |
| 209 | |||
| 210 | // Process data affected by left border. | ||
| 211 | KLEIDICV_FORCE_LOOP_UNROLL | ||
| 212 |
16/16✓ Branch 0 taken 13856 times.
✓ Branch 1 taken 11184 times.
✓ Branch 2 taken 18368 times.
✓ Branch 3 taken 13076 times.
✓ Branch 4 taken 7800 times.
✓ Branch 5 taken 3496 times.
✓ Branch 6 taken 12768 times.
✓ Branch 7 taken 1824 times.
✓ Branch 8 taken 25920 times.
✓ Branch 9 taken 2592 times.
✓ Branch 10 taken 152 times.
✓ Branch 11 taken 152 times.
✓ Branch 12 taken 576 times.
✓ Branch 13 taken 288 times.
✓ Branch 14 taken 1272 times.
✓ Branch 15 taken 424 times.
|
113748 | for (size_t horizontal_index = 0; horizontal_index < margin; |
| 213 | 80712 | ++horizontal_index) { | |
| 214 | 80712 | auto offsets = | |
| 215 | 80712 | horizontal_border.offsets_with_left_border(horizontal_index); | |
| 216 | 161424 | filter.process_horizontal_borders(buffer_rows.at(0, horizontal_index), | |
| 217 | 80712 | dst_rows.at(0, horizontal_index), | |
| 218 | 80712 | offsets); | |
| 219 | 80712 | } | |
| 220 | |||
| 221 | // Process data which is not affected by any borders in bulk. | ||
| 222 | { | ||
| 223 | 33036 | size_t width_without_borders = width - (2 * margin); | |
| 224 | 33036 | auto offsets = horizontal_border.offsets_without_border(); | |
| 225 | 66072 | filter.process_horizontal(width_without_borders, | |
| 226 | 33036 | buffer_rows.at(0, margin), | |
| 227 | 33036 | dst_rows.at(0, margin), offsets); | |
| 228 | 33036 | } | |
| 229 | |||
| 230 | // Process data affected by right border. | ||
| 231 | KLEIDICV_FORCE_LOOP_UNROLL | ||
| 232 |
16/16✓ Branch 0 taken 11184 times.
✓ Branch 1 taken 13856 times.
✓ Branch 2 taken 13076 times.
✓ Branch 3 taken 18368 times.
✓ Branch 4 taken 3496 times.
✓ Branch 5 taken 7800 times.
✓ Branch 6 taken 1824 times.
✓ Branch 7 taken 12768 times.
✓ Branch 8 taken 2592 times.
✓ Branch 9 taken 25920 times.
✓ Branch 10 taken 152 times.
✓ Branch 11 taken 152 times.
✓ Branch 12 taken 288 times.
✓ Branch 13 taken 576 times.
✓ Branch 14 taken 424 times.
✓ Branch 15 taken 1272 times.
|
113748 | for (size_t horizontal_index = 0; horizontal_index < margin; |
| 233 | 80712 | ++horizontal_index) { | |
| 234 | 80712 | size_t index = width - margin + horizontal_index; | |
| 235 | 80712 | auto offsets = horizontal_border.offsets_with_right_border(index); | |
| 236 | 161424 | filter.process_horizontal_borders(buffer_rows.at(0, index), | |
| 237 | 80712 | dst_rows.at(0, index), offsets); | |
| 238 | 80712 | } | |
| 239 | 33036 | } | |
| 240 | |||
| 241 | // Offset in bytes to the buffer rows from &data_[0]. | ||
| 242 | size_t buffer_rows_offset_; | ||
| 243 | // Stride of the buffer rows. | ||
| 244 | size_t buffer_rows_stride_; | ||
| 245 | |||
| 246 | Rectangle image_size_; | ||
| 247 | size_t channels_; | ||
| 248 | size_t intermediate_size_; | ||
| 249 | |||
| 250 | // Workspace area begins here. | ||
| 251 | uint8_t data_[0] KLEIDICV_ATTR_ALIGNED(kAlignment); | ||
| 252 | }; // end of class SeparableFilterWorkspace | ||
| 253 | |||
| 254 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
| 255 | |||
| 256 | #endif // KLEIDICV_WORKSPACE_SEPARABLE_H | ||
| 257 |