KleidiCV Coverage Report


Directory: ./
File: kleidicv/include/kleidicv/morphology/workspace.h
Date: 2026-03-05 15:57:40
Exec Total Coverage
Lines: 207 207 100.0%
Functions: 47 50 94.0%
Branches: 81 85 95.3%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2026 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_MORPHOLOGY_WORKSPACE_H
6 #define KLEIDICV_MORPHOLOGY_WORKSPACE_H
7
8 #include <algorithm>
9 #include <array>
10 #include <cstddef>
11 #include <cstdint>
12 #include <cstdlib>
13 #include <memory>
14 #include <optional>
15 #include <variant>
16
17 #include "kleidicv/kleidicv.h"
18 #include "kleidicv/types.h"
19
20 #if KLEIDICV_TARGET_SME
21 #include <arm_sme.h>
22 #endif
23
24 namespace KLEIDICV_TARGET_NAMESPACE {
25
26 // Workspace for morphological operations.
27 class MorphologyWorkspace final {
28 public:
29 enum class BorderType {
30 CONSTANT,
31 REPLICATE,
32 };
33
34 1472 static std::optional<BorderType> get_border_type(
35 kleidicv_border_type_t border_type) KLEIDICV_STREAMING {
36
3/3
✓ Branch 0 taken 424 times.
✓ Branch 1 taken 40 times.
✓ Branch 2 taken 1008 times.
1472 switch (border_type) {
37 case KLEIDICV_BORDER_TYPE_REPLICATE:
38 1008 return BorderType::REPLICATE;
39 case KLEIDICV_BORDER_TYPE_CONSTANT:
40 424 return BorderType::CONSTANT;
41 default:
42 40 return std::optional<BorderType>();
43 }
44 1472 }
45
46 template <typename T>
47 class CopyDataMemcpy {
48 public:
49 4744 constexpr void operator()(Rows<const T> src_rows, Rows<T> dst_rows,
50 size_t length) const KLEIDICV_STREAMING {
51 #if KLEIDICV_TARGET_SME
52 __arm_sc_memcpy(static_cast<void *>(&dst_rows[0]),
53 static_cast<const void *>(&src_rows[0]),
54 length * sizeof(T) * dst_rows.channels());
55 #else
56 9488 std::memcpy(static_cast<void *>(&dst_rows[0]),
57 4744 static_cast<const void *>(&src_rows[0]),
58 4744 length * sizeof(T) * dst_rows.channels());
59 #endif
60 4744 }
61 };
62
63 // MorphologyWorkspace is only constructible with create().
64 MorphologyWorkspace() = delete;
65
66 1368 static std::variant<MorphologyWorkspace, kleidicv_error_t> create(
67 Rectangle kernel, Point anchor, BorderType border_type,
68 const uint8_t *border_value, size_t channels, size_t type_size,
69 Rectangle image_size) KLEIDICV_STREAMING {
70
4/4
✓ Branch 0 taken 1352 times.
✓ Branch 1 taken 16 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 1344 times.
1368 if (anchor.x() >= kernel.width() || anchor.y() >= kernel.height()) {
71 24 return KLEIDICV_ERROR_RANGE;
72 }
73
74
4/4
✓ Branch 0 taken 424 times.
✓ Branch 1 taken 920 times.
✓ Branch 2 taken 416 times.
✓ Branch 3 taken 8 times.
1344 if (border_type == BorderType::CONSTANT && border_value == nullptr) {
75 8 return KLEIDICV_ERROR_NULL_POINTER;
76 }
77
78 // These values are arbitrarily chosen.
79 2672 const size_t rows_per_iteration =
80 1336 std::max(2 * kernel.height(), static_cast<size_t>(32ULL));
81 // To avoid load/store penalties.
82 1336 const size_t kAlignment = 16;
83 1336 Margin margin{kernel, anchor};
84
85 // A single wide row which can hold one row worth of data in addition
86 // to left and right margins.
87 2672 size_t wide_rows_width =
88 1336 margin.left() + image_size.width() + margin.right();
89 1336 size_t wide_rows_stride = wide_rows_width * channels;
90 1336 wide_rows_stride = align_up(wide_rows_stride, kAlignment);
91 1336 size_t wide_rows_height = 1UL; // There is only one wide row.
92 1336 size_t wide_rows_size = wide_rows_stride * wide_rows_height;
93 1336 wide_rows_size += kAlignment - 1;
94
95 // Multiple buffer rows to hold rows without any borders.
96 1336 size_t buffer_rows_width = type_size * image_size.width();
97 1336 size_t buffer_rows_stride = buffer_rows_width * channels;
98 1336 buffer_rows_stride = align_up(buffer_rows_stride, kAlignment);
99 1336 size_t buffer_rows_height = 2 * rows_per_iteration;
100 1336 size_t buffer_rows_size = 0UL;
101
2/2
✓ Branch 0 taken 8 times.
✓ Branch 1 taken 1328 times.
1336 if (__builtin_mul_overflow(buffer_rows_stride, buffer_rows_height,
102 &buffer_rows_size)) {
103 8 return KLEIDICV_ERROR_RANGE;
104 }
105 1328 buffer_rows_size += kAlignment - 1;
106
107 // Storage for indirect row access.
108 1328 size_t indirect_row_storage_size = 3 * rows_per_iteration * sizeof(void *);
109
110 // Try to allocate the buffers at once.
111 2656 size_t allocation_size =
112 1328 indirect_row_storage_size + buffer_rows_size + wide_rows_size;
113 2656 uint8_t *allocation =
114 1328 reinterpret_cast<uint8_t *>(std::malloc(allocation_size));
115
2/2
✓ Branch 0 taken 1320 times.
✓ Branch 1 taken 8 times.
1328 if (!allocation) {
116 8 return KLEIDICV_ERROR_ALLOCATION;
117 }
118
119 1320 size_t wide_rows_src_width = image_size.width();
120
121 1320 auto *buffer_rows_address = &allocation[indirect_row_storage_size];
122 1320 buffer_rows_address = align_up(buffer_rows_address, kAlignment);
123 1320 ptrdiff_t buffer_rows_offset = buffer_rows_address - allocation;
124
125 2640 auto *wide_rows_address =
126 1320 &allocation[indirect_row_storage_size + buffer_rows_size];
127 1320 wide_rows_address += margin.left() * channels;
128 1320 wide_rows_address = align_up(wide_rows_address, kAlignment);
129 1320 wide_rows_address -= margin.left() * channels;
130 1320 ptrdiff_t wide_rows_offset = wide_rows_address - allocation;
131
132 1320 std::array<uint8_t, KLEIDICV_MAXIMUM_CHANNEL_COUNT> border_values{};
133
2/2
✓ Branch 0 taken 904 times.
✓ Branch 1 taken 416 times.
1320 if (border_type == BorderType::CONSTANT) {
134
2/2
✓ Branch 0 taken 584 times.
✓ Branch 1 taken 416 times.
1000 for (size_t i = 0; i < channels; ++i) {
135 584 border_values[i] = border_value[i];
136 584 }
137 416 }
138
139 2640 return MorphologyWorkspace{image_size,
140 1320 margin,
141 1320 border_type,
142 1320 border_values,
143 1320 rows_per_iteration,
144 1320 wide_rows_src_width,
145 1320 channels,
146 0,
147 0,
148 1320 buffer_rows_offset,
149 1320 buffer_rows_stride,
150 1320 wide_rows_offset,
151 1320 wide_rows_stride,
152 1320 allocation};
153 1368 }
154
155 private:
156 1320 MorphologyWorkspace(
157 Rectangle image_size, Margin margin, BorderType border_type,
158 std::array<uint8_t, KLEIDICV_MAXIMUM_CHANNEL_COUNT> border_values,
159 size_t rows_per_iteration, size_t wide_rows_src_width, size_t channels,
160 size_t horizontal_height, size_t vertical_height,
161 ptrdiff_t buffer_rows_offset, size_t buffer_rows_stride,
162 ptrdiff_t wide_rows_offset, size_t wide_rows_stride,
163 uint8_t *allocation) KLEIDICV_STREAMING
164 1320 : image_size_{image_size},
165 1320 margin_{margin},
166 1320 border_type_{border_type},
167 1320 border_value_{border_values},
168 1320 rows_per_iteration_{rows_per_iteration},
169 1320 wide_rows_src_width_{wide_rows_src_width},
170 1320 channels_{channels},
171 1320 horizontal_height_{horizontal_height},
172 1320 vertical_height_{vertical_height},
173 1320 buffer_rows_offset_{buffer_rows_offset},
174 1320 buffer_rows_stride_{buffer_rows_stride},
175 1320 wide_rows_offset_{wide_rows_offset},
176 1320 wide_rows_stride_{wide_rows_stride},
177 1320 data_{allocation, &std::free} {}
178
179 public:
180 // This function is too complex, but disable the warning for now.
181 // NOLINTBEGIN(readability-function-cognitive-complexity)
182 template <typename O>
183 1512 void process(Rows<const typename O::SourceType> src_rows,
184 Rows<typename O::DestinationType> dst_rows,
185 O operation) KLEIDICV_STREAMING {
186 using S = typename O::SourceType;
187 using B = typename O::BufferType;
188 1512 typename O::CopyData copy_data{};
189
190
8/8
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 816 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 812 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 688 times.
✓ Branch 6 taken 8 times.
✓ Branch 7 taken 684 times.
1512 if (KLEIDICV_UNLIKELY(image_size_.width() == 0 ||
191 1504 image_size_.height() == 0)) {
192 16 return;
193 }
194
195 // Wide rows which can hold data with left and right margins.
196 1496 auto wide_rows =
197 2992 Rows{reinterpret_cast<S *>(&data_.get()[wide_rows_offset_]),
198 1496 wide_rows_stride_, channels_};
199
200 // Double buffered indirect rows to access the buffer rows.
201 2992 auto db_indirect_rows = DoubleBufferedIndirectRows{
202 1496 reinterpret_cast<B **>(&data_.get()[0]), rows_per_iteration_,
203 2992 Rows{reinterpret_cast<B *>(&data_.get()[buffer_rows_offset_]),
204 1496 buffer_rows_stride_, channels_}};
205
206 // [Step 1] Initialize workspace.
207 1496 horizontal_height_ =
208 1496 margin_.top() + image_size_.height() + margin_.bottom();
209 1496 vertical_height_ = image_size_.height();
210 1496 size_t row_index = 0;
211
212 // Used by replicate border type.
213 1496 auto first_src_rows = src_rows;
214 1496 auto last_src_rows = src_rows.at(image_size_.height() - 1);
215
216 1496 size_t horizontal_height = get_next_horizontal_height();
217
4/4
✓ Branch 0 taken 6244 times.
✓ Branch 1 taken 812 times.
✓ Branch 2 taken 4500 times.
✓ Branch 3 taken 684 times.
12240 for (size_t index = 0; index < horizontal_height; ++index) {
218
4/6
✗ Branch 0 not taken.
✓ Branch 1 taken 1268 times.
✓ Branch 2 taken 4976 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 1268 times.
✓ Branch 5 taken 3232 times.
10744 switch (border_type_) {
219 case BorderType::CONSTANT: {
220 2536 make_constant_border(wide_rows, 0, margin_.left());
221
222
8/8
✓ Branch 0 taken 1076 times.
✓ Branch 1 taken 192 times.
✓ Branch 2 taken 312 times.
✓ Branch 3 taken 764 times.
✓ Branch 4 taken 948 times.
✓ Branch 5 taken 320 times.
✓ Branch 6 taken 184 times.
✓ Branch 7 taken 764 times.
2536 if (row_index < margin_.top() ||
223 2024 row_index >= margin_.top() + image_size_.height()) {
224 2016 make_constant_border(wide_rows, margin_.left(),
225 1008 wide_rows_src_width_);
226 1008 } else {
227 3056 copy_data(src_rows, wide_rows.at(0, margin_.left()),
228 1528 wide_rows_src_width_);
229 // Advance source rows.
230 1528 ++src_rows;
231 }
232
233 5072 make_constant_border(wide_rows, margin_.left() + wide_rows_src_width_,
234 2536 margin_.right());
235
236 // Advance counters.
237 2536 ++row_index;
238 2536 } break;
239
240 case BorderType::REPLICATE: {
241 8208 Rows<const S> current_src_row;
242
243
4/4
✓ Branch 0 taken 1152 times.
✓ Branch 1 taken 3824 times.
✓ Branch 2 taken 608 times.
✓ Branch 3 taken 2624 times.
8208 if (row_index < margin_.top()) {
244 1760 current_src_row = first_src_rows;
245
4/4
✓ Branch 0 taken 2972 times.
✓ Branch 1 taken 852 times.
✓ Branch 2 taken 2092 times.
✓ Branch 3 taken 532 times.
8208 } else if (row_index < (margin_.top() + image_size_.height())) {
246 5064 current_src_row = src_rows;
247 // Advance source rows.
248 5064 ++src_rows;
249 5064 } else {
250 1384 current_src_row = last_src_rows;
251 }
252
253 8208 replicate_border(current_src_row, wide_rows, 0, 0, margin_.left());
254 16416 copy_data(current_src_row, wide_rows.at(0, margin_.left()),
255 8208 wide_rows_src_width_);
256 16416 replicate_border(current_src_row, wide_rows, wide_rows_src_width_ - 1,
257 8208 margin_.left() + wide_rows_src_width_,
258 8208 margin_.right());
259
260 // Advance counters.
261 8208 ++row_index;
262 8208 } break;
263 } // switch (border_type_)
264
265 // [Step 2] Process the preloaded data.
266 21488 operation.process_horizontal(Rectangle{image_size_.width(), 1UL},
267 10744 wide_rows,
268 10744 db_indirect_rows.write_at().at(index));
269 10744 } // for (...; index < horizontal_height; ...)
270
271 1496 db_indirect_rows.swap();
272
273 // [Step 3] Process any remaining data.
274
4/4
✓ Branch 0 taken 864 times.
✓ Branch 1 taken 812 times.
✓ Branch 2 taken 736 times.
✓ Branch 3 taken 684 times.
3096 while (vertical_height_) {
275 1600 size_t horizontal_height = get_next_horizontal_height();
276
4/4
✓ Branch 0 taken 1228 times.
✓ Branch 1 taken 864 times.
✓ Branch 2 taken 1228 times.
✓ Branch 3 taken 736 times.
4056 for (size_t index = 0; index < horizontal_height; ++index) {
277
4/6
✗ Branch 0 not taken.
✓ Branch 1 taken 408 times.
✓ Branch 2 taken 820 times.
✗ Branch 3 not taken.
✓ Branch 4 taken 408 times.
✓ Branch 5 taken 820 times.
2456 switch (border_type_) {
278 case BorderType::CONSTANT: {
279
4/4
✓ Branch 0 taken 400 times.
✓ Branch 1 taken 8 times.
✓ Branch 2 taken 400 times.
✓ Branch 3 taken 8 times.
816 if (row_index < (margin_.top() + image_size_.height())) {
280 // Constant left and right borders with source data.
281 1600 copy_data(src_rows, wide_rows.at(0, margin_.left()),
282 800 wide_rows_src_width_);
283 // Advance source rows.
284 800 ++src_rows;
285 800 } else {
286 32 make_constant_border(wide_rows, margin_.left(),
287 16 wide_rows_src_width_);
288 }
289
290 // Advance row counter.
291 816 ++row_index;
292 816 } break;
293
294 case BorderType::REPLICATE: {
295 1640 Rows<const S> current_src_row;
296
297
4/4
✓ Branch 0 taken 800 times.
✓ Branch 1 taken 20 times.
✓ Branch 2 taken 800 times.
✓ Branch 3 taken 20 times.
1640 if (row_index < (margin_.top() + image_size_.height())) {
298 1600 current_src_row = src_rows;
299 // Advance source rows.
300 1600 ++src_rows;
301 1600 } else {
302 40 current_src_row = last_src_rows;
303 }
304
305 1640 replicate_border(current_src_row, wide_rows, 0, 0, margin_.left());
306 3280 copy_data(current_src_row, wide_rows.at(0, margin_.left()),
307 1640 wide_rows_src_width_);
308 1640 replicate_border(
309 1640 current_src_row, wide_rows, wide_rows_src_width_ - 1,
310 1640 margin_.left() + wide_rows_src_width_, margin_.right());
311
312 // Advance counters.
313 1640 ++row_index;
314 1640 } break;
315 } // switch (border_type_)
316
317 4912 operation.process_horizontal(Rectangle{image_size_.width(), 1UL},
318 2456 wide_rows,
319 2456 db_indirect_rows.write_at().at(index));
320 2456 } // for (...; index < horizontal_height; ...)
321
322 1600 size_t next_vertical_height = get_next_vertical_height();
323 1600 operation.process_vertical(
324 1600 Rectangle{image_size_.width(), next_vertical_height},
325 1600 db_indirect_rows.read_at(), dst_rows);
326 1600 dst_rows += next_vertical_height;
327
328 1600 db_indirect_rows.swap();
329 1600 }
330 1512 }
331 // NOLINTEND(readability-function-cognitive-complexity)
332
333 private:
334 // The number of wide rows to process in the next iteration.
335 3096 [[nodiscard]] size_t get_next_horizontal_height() KLEIDICV_STREAMING {
336 3096 size_t height = std::min(horizontal_height_, rows_per_iteration_);
337 3096 horizontal_height_ -= height;
338 6192 return height;
339 3096 }
340
341 // The number of indirect rows to process in the next iteration.
342 1600 [[nodiscard]] size_t get_next_vertical_height() KLEIDICV_STREAMING {
343 1600 size_t height = std::min(vertical_height_, rows_per_iteration_);
344 1600 vertical_height_ -= height;
345 3200 return height;
346 1600 }
347
348 template <typename T>
349 6096 void make_constant_border(Rows<T> dst_rows, size_t dst_index,
350 size_t count) KLEIDICV_STREAMING {
351 6096 auto dst = &dst_rows.at(0, dst_index)[0];
352
2/2
✓ Branch 0 taken 6096 times.
✓ Branch 1 taken 20528 times.
26624 for (size_t index = 0; index < count; ++index) {
353
2/2
✓ Branch 0 taken 22832 times.
✓ Branch 1 taken 20528 times.
43360 for (size_t channel = 0; channel < dst_rows.channels(); ++channel) {
354 22832 dst[index * dst_rows.channels() + channel] = border_value_[channel];
355 22832 }
356 20528 }
357 6096 }
358
359 template <typename T>
360 19696 void replicate_border(Rows<const T> src_rows, Rows<T> dst_rows,
361 size_t src_index, size_t dst_index,
362 size_t count) KLEIDICV_STREAMING {
363
2/2
✓ Branch 0 taken 15256 times.
✓ Branch 1 taken 4440 times.
19696 if (!count) {
364 4440 return;
365 }
366
367
2/2
✓ Branch 0 taken 15256 times.
✓ Branch 1 taken 21912 times.
37168 for (size_t channel = 0; channel < src_rows.channels(); ++channel) {
368
2/2
✓ Branch 0 taken 39300 times.
✓ Branch 1 taken 21912 times.
61212 for (size_t index = dst_index; index < dst_index + count; ++index) {
369 39300 dst_rows.at(0, index)[channel] = src_rows.at(0, src_index)[channel];
370 39300 }
371 21912 }
372 19696 }
373
374 Rectangle image_size_;
375 Margin margin_;
376 BorderType border_type_;
377 std::array<uint8_t, KLEIDICV_MAXIMUM_CHANNEL_COUNT> border_value_;
378
379 // Maximum number of rows processed in one iteration.
380 size_t rows_per_iteration_;
381 // Width of the source data within a wide row, in pixels (not bytes).
382 size_t wide_rows_src_width_;
383 // The number of channels.
384 size_t channels_;
385 // Remaining height to process in horizontal direction.
386 size_t horizontal_height_;
387 // Remaining height to process in vertical direction.
388 size_t vertical_height_;
389 // Offset in bytes to the buffer rows from &data_[0].
390 ptrdiff_t buffer_rows_offset_;
391 // Stride of the buffer rows.
392 size_t buffer_rows_stride_;
393 // Offset in bytes to the wide rows from &data_[0].
394 ptrdiff_t wide_rows_offset_;
395 // Stride of the wide rows.
396 size_t wide_rows_stride_;
397 // Workspace buffer
398 std::unique_ptr<uint8_t, decltype(&std::free)> data_;
399 }; // end of class MorphologyWorkspace
400
401 } // namespace KLEIDICV_TARGET_NAMESPACE
402
403 #endif // KLEIDICV_MORPHOLOGY_WORKSPACE_H
404