KleidiCV Coverage Report


Directory: ./
File: kleidicv/include/kleidicv/morphology/workspace.h
Date: 2025-11-25 17:23:32
Exec Total Coverage
Lines: 198 198 100.0%
Functions: 57 57 100.0%
Branches: 79 83 95.2%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_MORPHOLOGY_WORKSPACE_H
6 #define KLEIDICV_MORPHOLOGY_WORKSPACE_H
7
8 #include <algorithm>
9 #include <array>
10 #include <cstdlib>
11 #include <memory>
12 #include <optional>
13
14 #include "kleidicv/kleidicv.h"
15 #include "kleidicv/types.h"
16
17 #if KLEIDICV_TARGET_SME
18 #include <arm_sme.h>
19 #endif
20
21 namespace KLEIDICV_TARGET_NAMESPACE {
22
23 // Forward declarations.
24 class MorphologyWorkspace;
25
26 // Deleter for MorphologyWorkspace instances.
27 class MorphologyWorkspaceDeleter {
28 public:
29 1312 void operator()(MorphologyWorkspace *ptr) const KLEIDICV_STREAMING {
30 1312 std::free(ptr);
31 1312 };
32 };
33
34 // Workspace for morphological operations.
35 class MorphologyWorkspace final {
36 public:
37 // Shorthand for std::unique_ptr<> holding a workspace.
38 using Pointer =
39 std::unique_ptr<MorphologyWorkspace, MorphologyWorkspaceDeleter>;
40
41 enum class BorderType {
42 CONSTANT,
43 REPLICATE,
44 };
45
46 1352 static std::optional<BorderType> get_border_type(
47 kleidicv_border_type_t border_type) KLEIDICV_STREAMING {
48
3/3
✓ Branch 0 taken 388 times.
✓ Branch 1 taken 20 times.
✓ Branch 2 taken 944 times.
1352 switch (border_type) {
49 case KLEIDICV_BORDER_TYPE_REPLICATE:
50 944 return BorderType::REPLICATE;
51 case KLEIDICV_BORDER_TYPE_CONSTANT:
52 388 return BorderType::CONSTANT;
53 default:
54 20 return std::optional<BorderType>();
55 }
56 1352 }
57
58 template <typename T>
59 class CopyDataMemcpy {
60 public:
61 4740 constexpr void operator()(Rows<const T> src_rows, Rows<T> dst_rows,
62 size_t length) const KLEIDICV_STREAMING {
63 #if KLEIDICV_TARGET_SME
64 __arm_sc_memcpy(static_cast<void *>(&dst_rows[0]),
65 static_cast<const void *>(&src_rows[0]),
66 length * sizeof(T) * dst_rows.channels());
67 #else
68 9480 std::memcpy(static_cast<void *>(&dst_rows[0]),
69 4740 static_cast<const void *>(&src_rows[0]),
70 4740 length * sizeof(T) * dst_rows.channels());
71 #endif
72 4740 }
73 };
74
75 // MorphologyWorkspace is only constructible with create().
76 MorphologyWorkspace() = delete;
77
78 // Creates a workspace on the heap.
79 1332 static kleidicv_error_t create(
80 Pointer &workspace, kleidicv_rectangle_t kernel, kleidicv_point_t anchor,
81 BorderType border_type, const uint8_t *border_value, size_t channels,
82 size_t iterations, size_t type_size,
83 kleidicv_rectangle_t image) KLEIDICV_STREAMING {
84 // These values are arbitrarily chosen.
85 2664 const size_t rows_per_iteration =
86 1332 std::max(2 * kernel.height, static_cast<size_t>(32ULL));
87 // To avoid load/store penalties.
88 1332 const size_t kAlignment = 16;
89
90
4/4
✓ Branch 0 taken 1324 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 1320 times.
1332 if (anchor.x >= kernel.width || anchor.y >= kernel.height) {
91 12 return KLEIDICV_ERROR_RANGE;
92 }
93
94 1320 Rectangle image_size{image};
95 1320 Margin margin{kernel, anchor};
96
97 // A single wide row which can hold one row worth of data in addition
98 // to left and right margins.
99 2640 size_t wide_rows_width =
100 1320 margin.left() + image_size.width() + margin.right();
101 1320 size_t wide_rows_stride = wide_rows_width * channels;
102 1320 wide_rows_stride = align_up(wide_rows_stride, kAlignment);
103 1320 size_t wide_rows_height = 1UL; // There is only one wide row.
104 1320 size_t wide_rows_size = wide_rows_stride * wide_rows_height;
105 1320 wide_rows_size += kAlignment - 1;
106
107 // Multiple buffer rows to hold rows without any borders.
108 1320 size_t buffer_rows_width = type_size * image_size.width();
109 1320 size_t buffer_rows_stride = buffer_rows_width * channels;
110 1320 buffer_rows_stride = align_up(buffer_rows_stride, kAlignment);
111 1320 size_t buffer_rows_height = 2 * rows_per_iteration;
112 1320 size_t buffer_rows_size = 0UL;
113
2/2
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 1316 times.
1320 if (__builtin_mul_overflow(buffer_rows_stride, buffer_rows_height,
114 &buffer_rows_size)) {
115 4 return KLEIDICV_ERROR_RANGE;
116 }
117 1316 buffer_rows_size += kAlignment - 1;
118
119 // Storage for indirect row access.
120 1316 size_t indirect_row_storage_size = 3 * rows_per_iteration * sizeof(void *);
121
122 // Try to allocate workspace at once.
123 3948 size_t allocation_size = sizeof(MorphologyWorkspace) +
124 3948 indirect_row_storage_size + buffer_rows_size +
125 1316 wide_rows_size;
126 1316 void *allocation = std::malloc(allocation_size);
127 2632 workspace = MorphologyWorkspace::Pointer{
128 1316 reinterpret_cast<MorphologyWorkspace *>(allocation)};
129
2/2
✓ Branch 0 taken 1312 times.
✓ Branch 1 taken 4 times.
1316 if (!workspace) {
130 4 return KLEIDICV_ERROR_ALLOCATION;
131 }
132
133 1312 workspace->rows_per_iteration_ = rows_per_iteration;
134 1312 workspace->wide_rows_src_width_ = image_size.width();
135 1312 workspace->channels_ = channels;
136
137 1312 auto *buffer_rows_address = &workspace->data_[indirect_row_storage_size];
138 1312 buffer_rows_address = align_up(buffer_rows_address, kAlignment);
139 1312 workspace->buffer_rows_offset_ = buffer_rows_address - &workspace->data_[0];
140 1312 workspace->buffer_rows_stride_ = buffer_rows_stride;
141
142 2624 auto *wide_rows_address =
143 1312 &workspace->data_[indirect_row_storage_size + buffer_rows_size];
144 1312 wide_rows_address += margin.left() * channels;
145 1312 wide_rows_address = align_up(wide_rows_address, kAlignment);
146 1312 wide_rows_address -= margin.left() * channels;
147 1312 workspace->wide_rows_offset_ = wide_rows_address - &workspace->data_[0];
148 1312 workspace->wide_rows_stride_ = wide_rows_stride;
149
150 1312 workspace->kernel_ = kernel;
151 1312 workspace->anchor_ = anchor;
152 1312 workspace->border_type_ = border_type;
153
2/2
✓ Branch 0 taken 924 times.
✓ Branch 1 taken 388 times.
1312 if (border_type == BorderType::CONSTANT) {
154
2/2
✓ Branch 0 taken 384 times.
✓ Branch 1 taken 4 times.
388 if (border_value == nullptr) {
155 4 return KLEIDICV_ERROR_NULL_POINTER;
156 }
157
2/2
✓ Branch 0 taken 552 times.
✓ Branch 1 taken 384 times.
936 for (size_t i = 0; i < channels; ++i) {
158 552 workspace->border_value_[i] = border_value[i];
159 552 }
160 384 }
161 1308 workspace->channels_ = channels;
162 1308 workspace->iterations_ = iterations;
163 1308 workspace->type_size_ = type_size;
164 1308 workspace->image_size_ = image_size;
165
166 1308 return KLEIDICV_OK;
167 1332 }
168
169 2672 kleidicv_rectangle_t kernel() const { return kernel_; }
170 1312 kleidicv_point_t anchor() const { return anchor_; }
171 1504 BorderType border_type() const { return border_type_; }
172 2624 size_t channels() const { return channels_; }
173 2816 size_t iterations() const { return iterations_; }
174 1384 size_t type_size() const { return type_size_; }
175 1376 Rectangle image_size() const { return image_size_; }
176
177 // This function is too complex, but disable the warning for now.
178 // NOLINTBEGIN(readability-function-cognitive-complexity)
179 template <typename O>
180 1504 void process(Rectangle rect, Rows<const typename O::SourceType> src_rows,
181 Rows<typename O::DestinationType> dst_rows, Margin margin,
182 BorderType border_type, O operation) KLEIDICV_STREAMING {
183 using S = typename O::SourceType;
184 using B = typename O::BufferType;
185 1504 typename O::CopyData copy_data{};
186
187
8/8
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 812 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 808 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 684 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 680 times.
1504 if (KLEIDICV_UNLIKELY(rect.width() == 0 || rect.height() == 0)) {
188 16 return;
189 }
190
191 // Wide rows which can hold data with left and right margins.
192 2976 auto wide_rows = Rows{reinterpret_cast<S *>(&data_[wide_rows_offset_]),
193 1488 wide_rows_stride_, channels_};
194
195 // Double buffered indirect rows to access the buffer rows.
196 2976 auto db_indirect_rows = DoubleBufferedIndirectRows{
197 1488 reinterpret_cast<B **>(&data_[0]), rows_per_iteration_,
198 2976 Rows{reinterpret_cast<B *>(&data_[buffer_rows_offset_]),
199 1488 buffer_rows_stride_, channels_}};
200
201 // [Step 1] Initialize workspace.
202 1488 horizontal_height_ = margin.top() + rect.height() + margin.bottom();
203 1488 vertical_height_ = rect.height();
204 1488 row_index_ = 0;
205
206 // Used by replicate border type.
207 1488 auto first_src_rows = src_rows;
208 1488 auto last_src_rows = src_rows.at(rect.height() - 1);
209
210 1488 size_t horizontal_height = get_next_horizontal_height();
211
4/4
✓ Branch 0 taken 6240 times.
✓ Branch 1 taken 808 times.
✓ Branch 2 taken 4496 times.
✓ Branch 3 taken 680 times.
12224 for (size_t index = 0; index < horizontal_height; ++index) {
212
4/6
✗ Branch 0 not taken.
✓ Branch 1 taken 1268 times.
✓ Branch 2 taken 4972 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 1268 times.
✓ Branch 5 taken 3228 times.
10736 switch (border_type) {
213 case BorderType::CONSTANT: {
214 2536 make_constant_border(wide_rows, 0, margin.left());
215
216
8/8
✓ Branch 0 taken 1076 times.
✓ Branch 1 taken 192 times.
✓ Branch 2 taken 312 times.
✓ Branch 3 taken 764 times.
✓ Branch 4 taken 948 times.
✓ Branch 5 taken 320 times.
✓ Branch 6 taken 184 times.
✓ Branch 7 taken 764 times.
2536 if (row_index_ < margin.top() ||
217 2024 row_index_ >= margin.top() + rect.height()) {
218 2016 make_constant_border(wide_rows, margin.left(),
219 1008 wide_rows_src_width_);
220 1008 } else {
221 3056 copy_data(src_rows, wide_rows.at(0, margin.left()),
222 1528 wide_rows_src_width_);
223 // Advance source rows.
224 1528 ++src_rows;
225 }
226
227 5072 make_constant_border(wide_rows, margin.left() + wide_rows_src_width_,
228 2536 margin.right());
229
230 // Advance counters.
231 2536 ++row_index_;
232 2536 } break;
233
234 case BorderType::REPLICATE: {
235 8200 Rows<const S> current_src_row;
236
237
4/4
✓ Branch 0 taken 1152 times.
✓ Branch 1 taken 3820 times.
✓ Branch 2 taken 608 times.
✓ Branch 3 taken 2620 times.
8200 if (row_index_ < margin.top()) {
238 1760 current_src_row = first_src_rows;
239
4/4
✓ Branch 0 taken 2968 times.
✓ Branch 1 taken 852 times.
✓ Branch 2 taken 2088 times.
✓ Branch 3 taken 532 times.
8200 } else if (row_index_ < (margin.top() + rect.height())) {
240 5056 current_src_row = src_rows;
241 // Advance source rows.
242 5056 ++src_rows;
243 5056 } else {
244 1384 current_src_row = last_src_rows;
245 }
246
247 8200 replicate_border(current_src_row, wide_rows, 0, 0, margin.left());
248 16400 copy_data(current_src_row, wide_rows.at(0, margin.left()),
249 8200 wide_rows_src_width_);
250 16400 replicate_border(current_src_row, wide_rows, wide_rows_src_width_ - 1,
251 8200 margin.left() + wide_rows_src_width_,
252 8200 margin.right());
253
254 // Advance counters.
255 8200 ++row_index_;
256 8200 } break;
257 } // switch (border_type)
258
259 // [Step 2] Process the preloaded data.
260 21472 operation.process_horizontal(Rectangle{rect.width(), 1UL}, wide_rows,
261 10736 db_indirect_rows.write_at().at(index));
262 10736 } // for (...; index < horizontal_height; ...)
263
264 1488 db_indirect_rows.swap();
265
266 // [Step 3] Process any remaining data.
267
4/4
✓ Branch 0 taken 860 times.
✓ Branch 1 taken 808 times.
✓ Branch 2 taken 732 times.
✓ Branch 3 taken 680 times.
3080 while (vertical_height_) {
268 1592 size_t horizontal_height = get_next_horizontal_height();
269
4/4
✓ Branch 0 taken 1228 times.
✓ Branch 1 taken 860 times.
✓ Branch 2 taken 1228 times.
✓ Branch 3 taken 732 times.
4048 for (size_t index = 0; index < horizontal_height; ++index) {
270
4/6
✗ Branch 0 not taken.
✓ Branch 1 taken 408 times.
✓ Branch 2 taken 820 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 408 times.
✓ Branch 5 taken 820 times.
2456 switch (border_type) {
271 case BorderType::CONSTANT: {
272
4/4
✓ Branch 0 taken 400 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 400 times.
✓ Branch 3 taken 8 times.
816 if (row_index_ < (margin.top() + rect.height())) {
273 // Constant left and right borders with source data.
274 1600 copy_data(src_rows, wide_rows.at(0, margin.left()),
275 800 wide_rows_src_width_);
276 // Advance source rows.
277 800 ++src_rows;
278 800 } else {
279 32 make_constant_border(wide_rows, margin.left(),
280 16 wide_rows_src_width_);
281 }
282
283 // Advance row counter.
284 816 ++row_index_;
285 816 } break;
286
287 case BorderType::REPLICATE: {
288 1640 Rows<const S> current_src_row;
289
290
4/4
✓ Branch 0 taken 800 times.
✓ Branch 1 taken 20 times.
✓ Branch 2 taken 800 times.
✓ Branch 3 taken 20 times.
1640 if (row_index_ < (margin.top() + rect.height())) {
291 1600 current_src_row = src_rows;
292 // Advance source rows.
293 1600 ++src_rows;
294 1600 } else {
295 40 current_src_row = last_src_rows;
296 }
297
298 1640 replicate_border(current_src_row, wide_rows, 0, 0, margin.left());
299 3280 copy_data(current_src_row, wide_rows.at(0, margin.left()),
300 1640 wide_rows_src_width_);
301 1640 replicate_border(
302 1640 current_src_row, wide_rows, wide_rows_src_width_ - 1,
303 1640 margin.left() + wide_rows_src_width_, margin.right());
304
305 // Advance counters.
306 1640 ++row_index_;
307 1640 } break;
308 } // switch (border_type)
309
310 4912 operation.process_horizontal(Rectangle{rect.width(), 1UL}, wide_rows,
311 2456 db_indirect_rows.write_at().at(index));
312 2456 } // for (...; index < horizontal_height; ...)
313
314 1592 size_t next_vertical_height = get_next_vertical_height();
315 3184 operation.process_vertical(Rectangle{rect.width(), next_vertical_height},
316 1592 db_indirect_rows.read_at(), dst_rows);
317 1592 dst_rows += next_vertical_height;
318
319 1592 db_indirect_rows.swap();
320 1592 }
321 1504 }
322 // NOLINTEND(readability-function-cognitive-complexity)
323
324 private:
325 // The number of wide rows to process in the next iteration.
326 3080 [[nodiscard]] size_t get_next_horizontal_height() KLEIDICV_STREAMING {
327 3080 size_t height = std::min(horizontal_height_, rows_per_iteration_);
328 3080 horizontal_height_ -= height;
329 6160 return height;
330 3080 }
331
332 // The number of indirect rows to process in the next iteration.
333 1592 [[nodiscard]] size_t get_next_vertical_height() KLEIDICV_STREAMING {
334 1592 size_t height = std::min(vertical_height_, rows_per_iteration_);
335 1592 vertical_height_ -= height;
336 3184 return height;
337 1592 }
338
339 template <typename T>
340 6096 void make_constant_border(Rows<T> dst_rows, size_t dst_index,
341 size_t count) KLEIDICV_STREAMING {
342 6096 auto dst = &dst_rows.at(0, dst_index)[0];
343
2/2
✓ Branch 0 taken 6096 times.
✓ Branch 1 taken 20528 times.
26624 for (size_t index = 0; index < count; ++index) {
344
2/2
✓ Branch 0 taken 22832 times.
✓ Branch 1 taken 20528 times.
43360 for (size_t channel = 0; channel < dst_rows.channels(); ++channel) {
345 22832 dst[index * dst_rows.channels() + channel] = border_value_[channel];
346 22832 }
347 20528 }
348 6096 }
349
350 template <typename T>
351 19680 void replicate_border(Rows<const T> src_rows, Rows<T> dst_rows,
352 size_t src_index, size_t dst_index,
353 size_t count) KLEIDICV_STREAMING {
354
2/2
✓ Branch 0 taken 15256 times.
✓ Branch 1 taken 4424 times.
19680 if (!count) {
355 4424 return;
356 }
357
358
2/2
✓ Branch 0 taken 15256 times.
✓ Branch 1 taken 21912 times.
37168 for (size_t channel = 0; channel < src_rows.channels(); ++channel) {
359
2/2
✓ Branch 0 taken 39300 times.
✓ Branch 1 taken 21912 times.
61212 for (size_t index = dst_index; index < dst_index + count; ++index) {
360 39300 dst_rows.at(0, index)[channel] = src_rows.at(0, src_index)[channel];
361 39300 }
362 21912 }
363 19680 }
364
365 static_assert(sizeof(Pointer) == sizeof(void *), "Unexpected type size");
366
367 kleidicv_rectangle_t kernel_;
368 kleidicv_point_t anchor_;
369 BorderType border_type_;
370 std::array<uint8_t, KLEIDICV_MAXIMUM_CHANNEL_COUNT> border_value_;
371 size_t iterations_;
372 size_t type_size_;
373 Rectangle image_size_;
374
375 // Maximum number of rows processed per iteration.
376 size_t rows_per_iteration_;
377 // Width of the source data within a wide row, in pixels (margins excluded).
378 size_t wide_rows_src_width_;
379 // The number of channels.
380 size_t channels_;
381 // Remaining height to process in horizontal direction.
382 size_t horizontal_height_;
383 // Remaining height to process in vertical direction.
384 size_t vertical_height_;
385 // Index of the processed row.
386 size_t row_index_;
387 // Offset in bytes to the buffer rows from &data_[0].
388 size_t buffer_rows_offset_;
389 // Stride of the buffer rows.
390 size_t buffer_rows_stride_;
391 // Offset in bytes to the wide rows from &data_[0].
392 size_t wide_rows_offset_;
393 // Stride of the wide rows.
394 size_t wide_rows_stride_;
395 // Workspace area begins here.
396 uint8_t data_[0] KLEIDICV_ATTR_ALIGNED(sizeof(void *));
397 }; // end of class MorphologyWorkspace
398
399 } // namespace KLEIDICV_TARGET_NAMESPACE
400
401 #endif // KLEIDICV_MORPHOLOGY_WORKSPACE_H
402