| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2023 - 2026 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #ifndef KLEIDICV_WORKSPACE_SEPARABLE_H | ||
| 6 | #define KLEIDICV_WORKSPACE_SEPARABLE_H | ||
| 7 | |||
| 8 | #include <algorithm> | ||
| 9 | #include <cstddef> | ||
| 10 | #include <cstdint> | ||
| 11 | #include <cstdlib> | ||
| 12 | #include <memory> | ||
| 13 | #include <utility> | ||
| 14 | #include <variant> | ||
| 15 | |||
| 16 | #include "kleidicv/types.h" | ||
| 17 | |||
| 18 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
| 19 | |||
| 20 | // Workspace for separable fixed-size filters. | ||
| 21 | // | ||
| 22 | // Theory of operation | ||
| 23 | // | ||
| 24 | // Given an NxM input matrix and a separable filter AxB = V x H, this workspace | ||
| 25 | // handles one row at a time: the vertical operation first writes a single row of | ||
| 26 | // intermediate results into a separate buffer. Right after that, the horizontal | ||
| 27 | // operation is applied to the buffer and the result is written to the destination. | ||
| 28 | // | ||
| 29 | // Limitations | ||
| 30 | // | ||
| 31 | // 1. In-place operations are not supported. | ||
| 32 | // 2. The input's width and height have to be at least `filter's width - 1` and | ||
| 33 | // `filter's height - 1`, respectively. | ||
| 34 | // | ||
| 35 | // Example | ||
| 36 | // | ||
| 37 | // N = 3, M = 3, A = B = 3, border type replicate and 'x' is multiplication. | ||
| 38 | // | ||
| 39 | // Input: Separated filters: | ||
| 40 | // [ M00, M01, M02 ] V = [ V0 ] H = [H0, H1, H2 ] | ||
| 41 | // [ M10, M11, M12 ] [ V1 ] | ||
| 42 | // [ M20, M21, M22 ] [ V2 ] | ||
| 43 | // | ||
| 44 | // Buffer contents in iteration 0 after applying the vertical operation | ||
| 45 | // taking "replicate" border type into account: | ||
| 46 | // | ||
| 47 | // [ B0, B1, B2 ] = | ||
| 48 | // [ M{0, 0, 1}0 x V, M{0, 0, 1}1 x V, M{0, 0, 1}2 x V ] | ||
| 49 | // | ||
| 50 | // The horizontal operation is then semantically performed on the following | ||
| 51 | // input taking "replicate" border type into account: | ||
| 52 | // | ||
| 53 | // [ B0, B0, B1, B2, B2 ] | ||
| 54 | // | ||
| 55 | // The destination contents after the 0th iteration are then: | ||
| 56 | // | ||
| 57 | // [ D00, D01, D02 ] = | ||
| 58 | // [ B{0, 0, 1} x H, B{0, 1, 2} x H, B{1, 2, 2} x H] | ||
| 59 | // | ||
| 60 | // Handling of borders is calculated based on offsets rather than setting up | ||
| 61 | // suitably-sized buffers which could hold both borders and data. | ||
| 62 | class SeparableFilterWorkspace { | ||
| 63 | public: | ||
| 64 | // No default construction: instances come from create(); the other constructor is protected. | ||
| 65 | SeparableFilterWorkspace() = delete; | ||
| 66 | |||
| 67 | 3216 | static std::variant<SeparableFilterWorkspace, kleidicv_error_t> create( | |
| 68 | Rectangle rect, size_t channels, | ||
| 69 | size_t intermediate_size) KLEIDICV_STREAMING { | ||
| 70 | 6432 | auto [allocation, buffer_rows_stride] = | |
| 71 | 3216 | allocate(rect, channels, intermediate_size); | |
| 72 | |||
| 73 |
2/2✓ Branch 0 taken 3156 times.
✓ Branch 1 taken 60 times.
|
3216 | if (!allocation) { |
| 74 | 60 | return KLEIDICV_ERROR_ALLOCATION; | |
| 75 | } | ||
| 76 | |||
| 77 | 6312 | return SeparableFilterWorkspace{rect, channels, allocation, | |
| 78 | 3156 | buffer_rows_stride}; | |
| 79 | 3216 | } | |
| 80 | |||
| 81 | protected: | ||
| 82 | 3768 | static std::pair<uint8_t *, size_t> allocate(Rectangle rect, size_t channels, | |
| 83 | size_t intermediate_size) | ||
| 84 | KLEIDICV_STREAMING { | ||
| 85 | 3768 | size_t buffer_rows_number_of_elements = rect.width() * channels; | |
| 86 | // Adding more elements because of SVE, where interleaving stores are | ||
| 87 | // governed by one predicate. For example, if a predicate requires 7 uint8_t | ||
| 88 | // elements and an algorithm performs widening to 16 bits, the resulting | ||
| 89 | // interleaving store will still be governed by the same predicate, thus | ||
| 90 | // storing 8 elements. Choosing '3' to account for svst4(). NOTE(review): the '+= 3' and the multiplication by intermediate_size that follow are not checked for size_t overflow -- confirm callers bound the inputs. | ||
| 91 | 3768 | buffer_rows_number_of_elements += 3; | |
| 92 | |||
| 93 | 7536 | size_t buffer_rows_stride = | |
| 94 | 3768 | buffer_rows_number_of_elements * intermediate_size; | |
| 95 | |||
| 96 | 7536 | uint8_t *allocation = | |
| 97 | 3768 | reinterpret_cast<uint8_t *>(std::malloc(buffer_rows_stride)); | |
| 98 | |||
| 99 | 3768 | return {allocation, buffer_rows_stride}; | |
| 100 | 3768 | } | |
| 101 | |||
| 102 | 3704 | SeparableFilterWorkspace(Rectangle rect, size_t channels, uint8_t *allocation, | |
| 103 | size_t buffer_rows_stride) KLEIDICV_STREAMING | ||
| 104 | 3704 | : rect_{rect}, | |
| 105 | 3704 | channels_{channels}, | |
| 106 | 3704 | buffer_{allocation, &std::free}, | |
| 107 | 3704 | buffer_rows_stride_{buffer_rows_stride} {} | |
| 108 | |||
| 109 | public: | ||
| 110 | // For each row in [y_begin, y_end): runs the vertical pass over the full width into the intermediate buffer, then the horizontal pass into the matching destination row. | ||
| 111 | template <typename FilterType> | ||
| 112 | 2688 | void process(size_t y_begin, size_t y_end, | |
| 113 | Rows<const typename FilterType::SourceType> src_rows, | ||
| 114 | Rows<typename FilterType::DestinationType> dst_rows, | ||
| 115 | typename FilterType::BorderType border_type, | ||
| 116 | FilterType filter) KLEIDICV_STREAMING { | ||
| 117 | // Border helpers which calculate border offsets: one sized by the height, one by the width. | ||
| 118 | 5376 | typename FilterType::BorderInfoType vertical_border{rect_.height(), | |
| 119 | 2688 | border_type}; | |
| 120 | 5376 | typename FilterType::BorderInfoType horizontal_border{rect_.width(), | |
| 121 | 2688 | border_type}; | |
| 122 | |||
| 123 | // Buffer rows which hold intermediate widened data. | ||
| 124 | 2688 | auto buffer_rows = | |
| 125 | 5376 | Rows{reinterpret_cast<typename FilterType::BufferType *>(buffer_.get()), | |
| 126 | 2688 | buffer_rows_stride_, channels_}; | |
| 127 | |||
| 128 | // Vertical processing loop. | ||
| 129 |
20/20✓ Branch 0 taken 884 times.
✓ Branch 1 taken 11184 times.
✓ Branch 2 taken 940 times.
✓ Branch 3 taken 13076 times.
✓ Branch 4 taken 132 times.
✓ Branch 5 taken 808 times.
✓ Branch 6 taken 204 times.
✓ Branch 7 taken 1888 times.
✓ Branch 8 taken 128 times.
✓ Branch 9 taken 1824 times.
✓ Branch 10 taken 128 times.
✓ Branch 11 taken 2592 times.
✓ Branch 12 taken 68 times.
✓ Branch 13 taken 152 times.
✓ Branch 14 taken 68 times.
✓ Branch 15 taken 288 times.
✓ Branch 16 taken 68 times.
✓ Branch 17 taken 424 times.
✓ Branch 18 taken 68 times.
✓ Branch 19 taken 560 times.
|
35484 | for (size_t vertical_index = y_begin; vertical_index < y_end; |
| 130 | 32796 | ++vertical_index) { | |
| 131 | // Recalculate vertical border offsets. | ||
| 132 | 32796 | auto offsets = vertical_border.offsets_with_border(vertical_index); | |
| 133 | // Process in the vertical direction first. | ||
| 134 | 65592 | filter.process_vertical(rect_.width(), src_rows.at(vertical_index), | |
| 135 | 32796 | buffer_rows, offsets); | |
| 136 | // Process in the horizontal direction last. | ||
| 137 | 65592 | process_horizontal(rect_.width(), buffer_rows, | |
| 138 | 32796 | dst_rows.at(vertical_index), filter, | |
| 139 | 32796 | horizontal_border); | |
| 140 | 32796 | } | |
| 141 | 2688 | } | |
| 142 | |||
| 143 | // Variant taking the kernel size at run time: rows closer than kernel_size / 2 to the top or bottom edge use the border-aware vertical pass, all other rows use the plain vertical pass. The border type argument is unused. | ||
| 144 | template <typename FilterType> | ||
| 145 | 64 | void process_arbitrary(size_t kernel_size, size_t y_begin, size_t y_end, | |
| 146 | Rows<const typename FilterType::SourceType> src_rows, | ||
| 147 | Rows<typename FilterType::DestinationType> dst_rows, | ||
| 148 | typename FilterType::BorderType /* border_type */, | ||
| 149 | FilterType filter) KLEIDICV_STREAMING { | ||
| 150 | // Buffer rows which hold intermediate widened data. | ||
| 151 | 64 | auto buffer_rows = | |
| 152 | 128 | Rows{reinterpret_cast<typename FilterType::BufferType *>(buffer_.get()), | |
| 153 | 64 | buffer_rows_stride_, channels_}; | |
| 154 | 64 | size_t margin = kernel_size / 2; | |
| 155 | |||
| 156 | // Process top rows, affected by border. NOTE(review): the loop bound is max(y_begin, margin) and is not clamped to y_end, so rows at or past y_end are processed when y_end < margin -- confirm callers never pass such a range. | ||
| 157 |
2/2✓ Branch 0 taken 240 times.
✓ Branch 1 taken 64 times.
|
304 | for (size_t row_index = y_begin; row_index < std::max(y_begin, margin); |
| 158 | 240 | ++row_index) { | |
| 159 | 480 | filter.process_arbitrary_border_vertical(rect_.width(), src_rows, | |
| 160 | 240 | row_index, buffer_rows); | |
| 161 | 480 | filter.process_arbitrary_horizontal(rect_.width(), kernel_size, | |
| 162 | 240 | buffer_rows, dst_rows.at(row_index)); | |
| 163 | 240 | } | |
| 164 | |||
| 165 | // Process middle rows that are not affected by any borders | ||
| 166 |
2/2✓ Branch 0 taken 208 times.
✓ Branch 1 taken 64 times.
|
272 | for (size_t row_index = std::max(y_begin, margin); |
| 167 | 272 | row_index < std::min(y_end, rect_.height() - margin); ++row_index) { | |
| 168 | 416 | filter.process_arbitrary_vertical(rect_.width(), src_rows.at(row_index), | |
| 169 | 208 | buffer_rows); | |
| 170 | 416 | filter.process_arbitrary_horizontal(rect_.width(), kernel_size, | |
| 171 | 208 | buffer_rows, dst_rows.at(row_index)); | |
| 172 | 208 | } | |
| 173 | |||
| 174 | // Process bottom rows, affected by border. NOTE(review): the start is min(y_end, height - margin) and is not clamped to y_begin, so rows before y_begin are processed when y_begin > height - margin; the subtraction also wraps if height < margin -- confirm callers exclude both. | ||
| 175 |
2/2✓ Branch 0 taken 64 times.
✓ Branch 1 taken 240 times.
|
304 | for (size_t row_index = std::min(y_end, rect_.height() - margin); |
| 176 | 304 | row_index < y_end; ++row_index) { | |
| 177 | 480 | filter.process_arbitrary_border_vertical(rect_.width(), src_rows, | |
| 178 | 240 | row_index, buffer_rows); | |
| 179 | 480 | filter.process_arbitrary_horizontal(rect_.width(), kernel_size, | |
| 180 | 240 | buffer_rows, dst_rows.at(row_index)); | |
| 181 | 240 | } | |
| 182 | 64 | } | |
| 183 | |||
| 184 | private: | ||
| 185 | template <typename FilterType> | ||
| 186 | 32796 | void process_horizontal(size_t width, | |
| 187 | Rows<typename FilterType::BufferType> buffer_rows, | ||
| 188 | Rows<typename FilterType::DestinationType> dst_rows, | ||
| 189 | FilterType filter, | ||
| 190 | typename FilterType::BorderInfoType horizontal_border) | ||
| 191 | KLEIDICV_STREAMING { | ||
| 192 | // Margin associated with the filter. | ||
| 193 | 32796 | constexpr size_t margin = filter.margin; | |
| 194 | |||
| 195 | // Process data affected by left border. | ||
| 196 | KLEIDICV_FORCE_LOOP_UNROLL | ||
| 197 |
20/20✓ Branch 0 taken 13856 times.
✓ Branch 1 taken 11184 times.
✓ Branch 2 taken 18368 times.
✓ Branch 3 taken 13076 times.
✓ Branch 4 taken 2424 times.
✓ Branch 5 taken 808 times.
✓ Branch 6 taken 7552 times.
✓ Branch 7 taken 1888 times.
✓ Branch 8 taken 12768 times.
✓ Branch 9 taken 1824 times.
✓ Branch 10 taken 25920 times.
✓ Branch 11 taken 2592 times.
✓ Branch 12 taken 152 times.
✓ Branch 13 taken 152 times.
✓ Branch 14 taken 576 times.
✓ Branch 15 taken 288 times.
✓ Branch 16 taken 1272 times.
✓ Branch 17 taken 424 times.
✓ Branch 18 taken 2240 times.
✓ Branch 19 taken 560 times.
|
117924 | for (size_t horizontal_index = 0; horizontal_index < margin; |
| 198 | 85128 | ++horizontal_index) { | |
| 199 | 85128 | auto offsets = | |
| 200 | 85128 | horizontal_border.offsets_with_left_border(horizontal_index); | |
| 201 | 170256 | filter.process_horizontal_borders(buffer_rows.at(0, horizontal_index), | |
| 202 | 85128 | dst_rows.at(0, horizontal_index), | |
| 203 | 85128 | offsets); | |
| 204 | 85128 | } | |
| 205 | |||
| 206 | // Process data which is not affected by any borders in bulk. Assumes width >= 2 * margin; otherwise the unsigned subtraction below wraps. | ||
| 207 | { | ||
| 208 | 32796 | size_t width_without_borders = width - (2 * margin); | |
| 209 | 32796 | auto offsets = horizontal_border.offsets_without_border(); | |
| 210 | 65592 | filter.process_horizontal(width_without_borders, | |
| 211 | 32796 | buffer_rows.at(0, margin), | |
| 212 | 32796 | dst_rows.at(0, margin), offsets); | |
| 213 | 32796 | } | |
| 214 | |||
| 215 | // Process data affected by right border. | ||
| 216 | KLEIDICV_FORCE_LOOP_UNROLL | ||
| 217 |
20/20✓ Branch 0 taken 11184 times.
✓ Branch 1 taken 13856 times.
✓ Branch 2 taken 13076 times.
✓ Branch 3 taken 18368 times.
✓ Branch 4 taken 808 times.
✓ Branch 5 taken 2424 times.
✓ Branch 6 taken 1888 times.
✓ Branch 7 taken 7552 times.
✓ Branch 8 taken 1824 times.
✓ Branch 9 taken 12768 times.
✓ Branch 10 taken 2592 times.
✓ Branch 11 taken 25920 times.
✓ Branch 12 taken 152 times.
✓ Branch 13 taken 152 times.
✓ Branch 14 taken 288 times.
✓ Branch 15 taken 576 times.
✓ Branch 16 taken 424 times.
✓ Branch 17 taken 1272 times.
✓ Branch 18 taken 560 times.
✓ Branch 19 taken 2240 times.
|
117924 | for (size_t horizontal_index = 0; horizontal_index < margin; |
| 218 | 85128 | ++horizontal_index) { | |
| 219 | 85128 | size_t index = width - margin + horizontal_index; | |
| 220 | 85128 | auto offsets = horizontal_border.offsets_with_right_border(index); | |
| 221 | 170256 | filter.process_horizontal_borders(buffer_rows.at(0, index), | |
| 222 | 85128 | dst_rows.at(0, index), offsets); | |
| 223 | 85128 | } | |
| 224 | 32796 | } | |
| 225 | |||
| 226 | protected: | ||
| 227 | Rectangle rect_; | ||
| 228 | size_t channels_; | ||
| 229 | std::unique_ptr<uint8_t, decltype(&std::free)> buffer_; | ||
| 230 | size_t buffer_rows_stride_; | ||
| 231 | }; // end of class SeparableFilterWorkspace | ||
| 232 | |||
| 233 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
| 234 | |||
| 235 | #endif // KLEIDICV_WORKSPACE_SEPARABLE_H | ||
| 236 |