KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/analysis/min_max_loc_neon.cpp
Date: 2025-11-25 17:23:32
Exec Total Coverage
Lines: 123 123 100.0%
Functions: 9 9 100.0%
Branches: 44 44 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <climits>
6 #include <limits>
7
8 #include "kleidicv/kleidicv.h"
9 #include "kleidicv/neon.h"
10
11 // This algorithm calculates the index of element in src array of the global
12 // minimum and maximum values.
13 //
14 // Theory of operation
15 //
16 // At most max_vectors_per_block() number of consecutive vector registers are
17 // processed with the vector path before performing some additional
18 // calculations. In every vector path iteration the algorithm remembers
19 // the following data per lane for the currently processed block:
20 // - [1] offsets of the vector where the last min/max value was seen, and
21 // - [2] overall min/max values.
22 //
23 // Block index
24 //
25 // Since our offsets only contain an element-size value, we need to
26 // extend this offset with a base value, to form a full index.
27 // This base value is the block's starting index:
28 //
29 // final index = block_index + offset * num_lanes_in_vector + lane_number
30 //
31 // Once a block is completed, min/max_block_indices are updated, and the
32 // per-lane offsets and min/max values are also copied to the global vectors,
33 // whichever lane has been updated.
34 //
35 // Which lane has been updated?
36 //
37 // Offsets are relative to block index, and are numbered from 1 to
38 // max_vectors_per_block(). At start of a block, offsets are zeroed,
39 // so after the block a non-zero check is enough to flag if an offset
40 // (and the min/max value) is updated.
41 //
42 // How is the update done?
43 //
44 // From the offset lanes, a single flag is made: are there any updates.
45 // If yes, then the min/max_block_index is copied from running_index
46 // (this is done effectively by a CSEL instruction).
47 // Min/max values and relative lane offsets are copied by counting a
48 // no_update vector lane-by-lane (comparing each lane to zero), so only the
49 // changed lanes are copied to the global min/max and offset vector. This is
50 // important to copy only the values that belong to the actual block index
51 // and are smaller/bigger than last global min/max.
52 // Otherwise we could get false results, see this example:
53 // vec: [ 4, 4, 4, 4] --> base_index: 0 offsets: 1 1 1 1
54 // vmax: 4, 4, 4, 4
55 // vec: [ 1,10, 2, 3] --> base_index: 1 new_offsets: 0 3 0 0
56 // vmax: 4,10, 4, 4 max_base_index: 1 --> offsets: 1 3 1 1
57 // vec: [10, 1, 1, 1] -->
58 // vmax:10,10, 4, 4 --- but we did not update the base index + offset!
59 // we would return base_index 1 offset 1, which is completely wrong
60 //
61 // Min/max values updated
62 //
63 // For all the above to work, we have to make sure we don't overwrite the
64 // global min/max values and offsets with an updated value of a non global
65 // min/max. To ensure this, after each block global min/max are calculated
66 // horizontally across from min/max per lane values, and these new global
67 // min/max values copied to all lanes and are used to compare values with
68 // in the next block.
69 //
70 // Example of 'min' for hypothetical 4 lanes and uint8_t type and
71 // max_block_size() = 2.
72 //
73 // Initial state:
74 // vrunning_offset_: [ 0x01, 0x01, 0x01, 0x01 ]
75 // vmin_offsets_: [ 0x00, 0x00, 0x00, 0x00 ]
76 // vmin_offsets_new_: [ 0x00, 0x00, 0x00, 0x00 ]
77 // vmin_new_: [ 0xFF, 0xFF, 0xFF, 0xFF ]
78 // vmin_: [ 0xFF, 0xFF, 0xFF, 0xFF ]
79 //
80 // 1st block:
81 // -> vector_path()
82 // vec: [ 0x12, 0xFF, 0x20, 0x42 ]
83 // is_smaller: [ 0xFF, 0x00, 0xFF, 0xFF ]
84 // vmin_offsets_new_: [ 0x01, 0x00, 0x01, 0x01 ]
85 // vmin_new_: [ 0x12, 0xFF, 0x20, 0x42 ]
86 // vrunning_offset_: [ 0x02, 0x02, 0x02, 0x02 ]
87 // -> vector_path()
88 // vec: [ 0x22, 0xFF, 0x55, 0x23 ]
89 // is_smaller: [ 0x00, 0x00, 0x00, 0xFF ]
90 // vmin_offsets_new_: [ 0x01, 0x00, 0x01, 0x02 ]
91 // vmin_new_: [ 0x12, 0xFF, 0x00, 0x23 ]
92 // vrunning_offset_: [ 0x03, 0x03, 0x03, 0x03 ]
93 // -> on_block_finished(1)
94 // vno_update: [ 0x00, 0xFF, 0x00, 0x00 ]
95 // vmin_offsets_: [ 0x01, 0x00, 0x01, 0x02 ]
96 // vmin_: [ 0x12, 0xFF, 0x20, 0x23 ]
97 // vmin_new_: [ 0x12, 0x12, 0x12, 0x12 ]
98
99 namespace kleidicv::neon {
100
101 template <typename ScalarType>
102 class MinMaxLoc final : public UnrollTwice {
103 public:
104 using VecTraits = neon::VecTraits<ScalarType>;
105 using VectorType = typename VecTraits::VectorType;
106 using UnsignedVectorType =
107 typename neon::VecTraits<std::make_unsigned_t<ScalarType>>::VectorType;
108
109 // NOLINTBEGIN(cppcoreguidelines-prefer-member-initializer)
110 592 MinMaxLoc() {
111 592 vmin_offsets_ = vmax_offsets_ = vmin_offsets_new_ = vmax_offsets_new_ =
112 592 vdupq_n(ScalarType{0});
113 592 vrunning_offset_ = vdupq_n(ScalarType{1});
114 592 vmin_ = vmin_new_ = vdupq_n(std::numeric_limits<ScalarType>::max());
115 592 vmax_ = vmax_new_ = vdupq_n(std::numeric_limits<ScalarType>::lowest());
116 592 min_block_index_ = max_block_index_ = 0;
117 592 min_vector_ = min_scalar_ = std::numeric_limits<ScalarType>::max();
118 592 max_vector_ = max_scalar_ = std::numeric_limits<ScalarType>::lowest();
119 592 min_scalar_index_ = max_scalar_index_ = 0;
120 592 running_index_ = 0;
121 592 }
122 // NOLINTEND(cppcoreguidelines-prefer-member-initializer)
123
124 6192 void vector_path(VectorType src) {
125 6192 VectorType v_is_smaller = vcltq_u8(src, vmin_new_);
126 6192 vmin_offsets_new_ =
127 6192 vbslq_u8(v_is_smaller, vrunning_offset_, vmin_offsets_new_);
128 6192 vmin_new_ = vminq_u8(vmin_new_, src);
129
130 6192 VectorType v_is_bigger = vcgtq_u8(src, vmax_new_);
131 6192 vmax_offsets_new_ =
132 6192 vbslq_u8(v_is_bigger, vrunning_offset_, vmax_offsets_new_);
133 6192 vmax_new_ = vmaxq_u8(vmax_new_, src);
134
135 6192 vrunning_offset_ = vaddq_u8(vrunning_offset_, vdupq_n(ScalarType{1}));
136 6192 }
137
138 19120 void scalar_path(ScalarType src) {
139
2/2
✓ Branch 0 taken 18332 times.
✓ Branch 1 taken 788 times.
19120 if (src < min_scalar_) {
140 788 min_scalar_ = src;
141 788 min_scalar_index_ = running_index_;
142 788 }
143
144
2/2
✓ Branch 0 taken 18270 times.
✓ Branch 1 taken 850 times.
19120 if (src > max_scalar_) {
145 850 max_scalar_ = src;
146 850 max_scalar_index_ = running_index_;
147 850 }
148
149 19120 ++running_index_;
150 19120 }
151
152 732 void on_block_finished(size_t vectors_in_block) {
153 732 min_vector_ = vminvq_u8(vmin_new_);
154 // Does any lane have a new min? Then update the block number
155 // 32-bit horizontal max is the fastest way to do a logical OR
156
2/2
✓ Branch 0 taken 500 times.
✓ Branch 1 taken 232 times.
732 min_block_index_ = vmaxvq_u32(vreinterpretq_u32_u8(vmin_offsets_new_))
157 500 ? running_index_
158 232 : min_block_index_;
159 // Which lane is zero, that was not updated in last block
160 732 uint8x16_t vno_update = vceqzq_u8(vmin_offsets_new_);
161 // Copy the updated (new min!) offsets into the ultimate offset vector
162 732 vmin_offsets_ = vbslq_u8(vno_update, vmin_offsets_, vmin_offsets_new_);
163 // Copy the updated new min values into the ultimate min vector
164 732 vmin_ = vbslq_u8(vno_update, vmin_, vmin_new_);
165 // Copy new min to all lanes, to prevent overwriting it with inferior values
166 732 vmin_new_ = vdupq_n(min_vector_);
167
168 // Just like min, but for max
169 732 max_vector_ = vmaxvq_u8(vmax_new_);
170
2/2
✓ Branch 0 taken 516 times.
✓ Branch 1 taken 216 times.
732 max_block_index_ = vmaxvq_u32(vreinterpretq_u32_u8(vmax_offsets_new_))
171 516 ? running_index_
172 216 : max_block_index_;
173 732 vno_update = vceqzq_u8(vmax_offsets_new_);
174 732 vmax_offsets_ = vbslq_u8(vno_update, vmax_offsets_, vmax_offsets_new_);
175 732 vmax_ = vbslq_u8(vno_update, vmax_, vmax_new_);
176 732 vmax_new_ = vdupq_n(max_vector_);
177
178 732 running_index_ += vectors_in_block * VecTraits::num_lanes();
179 // Starts from 1, so if an offset is updated, we can tell from it is nonzero
180 732 vrunning_offset_ = vdupq_n(ScalarType{1});
181 // Offsets are zeroed so we'll see when they updated
182 732 vmax_offsets_new_ = vmin_offsets_new_ = vdupq_n(ScalarType{0});
183 732 }
184
185 832 size_t max_vectors_per_block() const {
186 // -1: see description of vrunning_offset_
187 832 return std::numeric_limits<ScalarType>::max() - 1;
188 }
189
190 344 size_t min_index() const {
191 // Which minimum is the smaller one, scalar path or vector path?
192
2/2
✓ Branch 0 taken 116 times.
✓ Branch 1 taken 228 times.
344 if (min_scalar_ < min_vector_) {
193 116 return min_scalar_index_;
194 }
195
196 // To find the minimum location:
197 // 1. Get the lanes that have the same min value as the global min.
198 // 2. From the offsets of these lanes, find the smallest one.
199 // 3. From the smallest ones (it can happen if they were in the same vector)
200 // take the first one (LSB, because little endian)
201 // Example offsets: [ 0x01, 0x00, 0x01, 0x02, 0x01, 0x03 ]
202 // vmin_: [ 0x12, 0xFF, 0x20, 0x23, 0x12, 0x12 ]
203 // Fill not selected offsets with maximum value, they won't be the minimum
204 456 const uint8x16_t v_ignore = vdupq_n(static_cast<ScalarType>(
205 228 std::numeric_limits<std::make_unsigned_t<ScalarType>>::max()));
206 // [1] Test which lanes contain the global minimum value
207 // is_min: [ 0xFF, 0x00, 0x00, 0x00, 0xFF, 0xFF ]
208 228 VectorType is_min = vcleq(vmin_, vmin_new_);
209 // Select those offsets that contain the minimum
210 // selected_offsets: [ 0x01, 0xFF, 0xFF, 0xFF, 0x01, 0x03 ]
211 456 UnsignedVectorType v_selected_offsets =
212 228 vbslq_u8(is_min, vmin_offsets_, v_ignore);
213 // [2] Take the smallest offset, that's the earliest, that is needed
214 // earliest_offset: 0x01
215 228 ScalarType earliest_offset = vminvq(v_selected_offsets);
216 // In the rare case the vector_path did not run, no selected lane
217 // so the scalar_min is the global min (e.g. when src[0] is the min)
218
2/2
✓ Branch 0 taken 212 times.
✓ Branch 1 taken 16 times.
228 if (earliest_offset == 0) {
219 16 return min_scalar_index_;
220 }
221
222 // [3] Calculate the lane for the first one
223 // lane: 0 (smaller than 4)
224 212 size_t lane = lane_of_first_smallest(v_selected_offsets, earliest_offset);
225 636 size_t min_vector_index = min_block_index_ +
226 424 (earliest_offset - 1) * VecTraits::num_lanes() +
227 212 lane;
228 // If the minimum value is not less, then they are equal, so the earlier one
229 // is the winner
230
4/4
✓ Branch 0 taken 56 times.
✓ Branch 1 taken 156 times.
✓ Branch 2 taken 52 times.
✓ Branch 3 taken 4 times.
212 return (min_vector_ < min_scalar_ || min_vector_index < min_scalar_index_
231 208 ? min_vector_index
232 4 : min_scalar_index_);
233 344 }
234
235 // Just like min, but for max
236 344 size_t max_index() const {
237
2/2
✓ Branch 0 taken 131 times.
✓ Branch 1 taken 213 times.
344 if (max_scalar_ > max_vector_) {
238 131 return max_scalar_index_;
239 }
240
241 426 const uint8x16_t v_ignore = vdupq_n(static_cast<ScalarType>(
242 213 std::numeric_limits<std::make_unsigned_t<ScalarType>>::max()));
243 213 VectorType is_max = vcgeq(vmax_, vmax_new_);
244 426 UnsignedVectorType v_selected_offsets =
245 213 vbslq_u8(is_max, vmax_offsets_, v_ignore);
246 213 ScalarType earliest_offset = vminvq(v_selected_offsets);
247
2/2
✓ Branch 0 taken 181 times.
✓ Branch 1 taken 32 times.
213 if (earliest_offset == 0) {
248 32 return max_scalar_index_;
249 }
250
251 181 size_t lane = lane_of_first_smallest(v_selected_offsets, earliest_offset);
252 543 size_t max_vector_index = max_block_index_ +
253 362 (earliest_offset - 1) * VecTraits::num_lanes() +
254 181 lane;
255
4/4
✓ Branch 0 taken 47 times.
✓ Branch 1 taken 134 times.
✓ Branch 2 taken 43 times.
✓ Branch 3 taken 4 times.
181 return (max_vector_ > max_scalar_ || max_vector_index < max_scalar_index_
256 177 ? max_vector_index
257 4 : max_scalar_index_);
258 344 }
259
260 private:
261 393 inline static size_t lane_of_first_smallest(UnsignedVectorType v_values,
262 ScalarType smallest) {
263 // Flag all values that equal to the smallest one
264 // example values: [ 0x01, 0xFF, 0xFF, 0xFF, 0x01, 0x03 ]
265 // v_smallest: [ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 ]
266 393 UnsignedVectorType v_smallest = vdupq_n(smallest);
267 // v_flags: [ 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00 ]
268 393 UnsignedVectorType v_flags = vcleq(v_values, v_smallest);
269 // Find the leftmost of them all
270 393 uint64_t halfvector = vgetq_lane_u64(vreinterpretq_u64(v_flags), 0);
271 393 size_t lane = !halfvector * (sizeof(uint64_t) / sizeof(ScalarType));
272 // If the left half is 0, it must be on the right side, take that 64bit
273
2/2
✓ Branch 0 taken 354 times.
✓ Branch 1 taken 39 times.
393 if (lane) {
274 39 halfvector = vgetq_lane_u64(vreinterpretq_u64(v_flags), 1);
275 39 }
276
277 // Counting the trailing zeroes will tell us where is the leftmost element
278 // (Little Endian) ---> example returns 0, this is the leftmost byte
279 393 lane += __builtin_ctzll(halfvector) / (CHAR_BIT * sizeof(ScalarType));
280 786 return lane;
281 393 }
282
283 // Either the actual min/max in the current block or so far global min/max,
284 // per lane.
285 VectorType vmin_new_, vmax_new_;
286 // Offsets of the vectors where the last min/max values were seen, per lane.
287 // It contains values from 0 (no new) to max_vectors_per_block().
288 UnsignedVectorType vmin_offsets_new_, vmax_offsets_new_;
289 // Actual offset of the vector on the vector path, copied to each lane.
290 // It contains values from 1 to max_vectors_per_block()
291 UnsignedVectorType vrunning_offset_;
292 // Global min/max per lane.
293 VectorType vmin_, vmax_;
294 // Offsets per lane where the first global min/max value was seen.
295 UnsignedVectorType vmin_offsets_, vmax_offsets_;
296 // Index of the current vector block or scalar element.
297 uint64_t running_index_;
298 // Index of the current vector block where the first global min/max value was
299 // seen.
300 uint64_t min_block_index_, max_block_index_;
301 // Index of global scalar min and max elements.
302 uint64_t min_scalar_index_, max_scalar_index_;
303 // Global min/max on vector path.
304 ScalarType min_vector_, max_vector_;
305 // Global min/max on scalar path.
306 ScalarType min_scalar_, max_scalar_;
307 }; // end of class MinMaxLoc<T>
308
309 template <typename ScalarType>
310 728 kleidicv_error_t min_max_loc(const ScalarType *src, size_t src_stride,
311 size_t width, size_t height, size_t *min_offset,
312 size_t *max_offset) {
313
4/4
✓ Branch 0 taken 120 times.
✓ Branch 1 taken 608 times.
✓ Branch 2 taken 120 times.
✓ Branch 3 taken 608 times.
728 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
314
6/6
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 604 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 600 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 600 times.
608 CHECK_IMAGE_SIZE(width, height);
315
316
4/4
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 596 times.
✓ Branch 2 taken 8 times.
✓ Branch 3 taken 592 times.
600 if (KLEIDICV_UNLIKELY(width == 0 || height == 0)) {
317 8 return KLEIDICV_ERROR_RANGE;
318 }
319
320 592 Rectangle rect{width, height};
321 592 Rows<const ScalarType> src_rows{src, src_stride};
322 592 MinMaxLoc<ScalarType> operation;
323 592 apply_block_operation_by_rows(operation, rect, src_rows);
324
2/2
✓ Branch 0 taken 248 times.
✓ Branch 1 taken 344 times.
592 if (min_offset) {
325 344 *min_offset = src_rows.offset_for_index(operation.min_index(), width);
326 344 }
327
328
2/2
✓ Branch 0 taken 248 times.
✓ Branch 1 taken 344 times.
592 if (max_offset) {
329 344 *max_offset = src_rows.offset_for_index(operation.max_index(), width);
330 344 }
331 592 return KLEIDICV_OK;
332 728 }
333
334 #define KLEIDICV_INSTANTIATE_TEMPLATE(type) \
335 template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t min_max_loc<type>( \
336 const type *src, size_t src_stride, size_t width, size_t height, \
337 size_t *min_offset, size_t *max_offset)
338
339 KLEIDICV_INSTANTIATE_TEMPLATE(uint8_t);
340
341 } // namespace kleidicv::neon
342