KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/analysis/min_max_loc_neon.cpp
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 123 123 100.0%
Functions: 9 9 100.0%
Branches: 44 44 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2023 - 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <climits>
6 #include <limits>
7
8 #include "kleidicv/kleidicv.h"
9 #include "kleidicv/neon.h"
10
11 // This algorithm calculates the index of element in src array of the global
12 // minimum and maximum values.
13 //
14 // Theory of operation
15 //
16 // At most max_vectors_per_block() number of consecutive vector registers are
17 // processed with the vector path before performing some additional
18 // calculations. In every vector path iteration the algorithm remembers
19 // the following data per lane for the currently processed block:
20 // - [1] offsets of the vector where the last min/max value was seen, and
21 // - [2] overall min/max values.
22 //
23 // Block index
24 //
25 // Since our offsets only contain an element-size value, we need to
26 // extend this offset with a base value, to form a full index.
27 // This base value is the block's starting index:
28 //
29 // final index = block_index + offset * num_lanes_in_vector + lane_number
30 //
31 // Once a block is completed, min/max_block_indices are updated, and the
32 // per-lane offsets and min/max values are also copied to the global vectors,
33 // whichever lane has been updated.
34 //
35 // Which lane has been updated?
36 //
37 // Offsets are relative to block index, and are numbered from 1 to
38 // max_vectors_per_block(). At start of a block, offsets are zeroed,
39 // so after the block a non-zero check is enough to flag if an offset
40 // (and the min/max value) is updated.
41 //
42 // How is the update done?
43 //
44 // From the offset lanes, a single flag is made: are there any updates.
45 // If yes, then the min/max_block_index is copied from running_index
46 // (this is done effectively by a CSEL instruction).
47 // Min/max values and relative lane offsets are copied by counting a
48 // no_update vector lane-by-lane (comparing each lane to zero), so only the
49 // changed lanes are copied to the global min/max and offset vector. This is
50 // important to copy only the values that belong to the actual block index
51 // and are smaller/bigger than last global min/max.
52 // Otherwise we could get false results, see this example:
53 // vec: [ 4, 4, 4, 4] --> base_index: 0 offsets: 1 1 1 1
54 // vmax: 4, 4, 4, 4
55 // vec: [ 1,10, 2, 3] --> base_index: 1 new_offsets: 0 3 0 0
56 // vmax: 4,10, 4, 4 max_base_index: 1 --> offsets: 1 3 1 1
57 // vec: [10, 1, 1, 1] -->
58 // vmax:10,10, 4, 4 --- but we did not update the base index + offset!
59 // we would return base_index 1 offset 1, which is completely wrong
60 //
61 // Min/max values updated
62 //
63 // For all the above to work, we have to make sure we don't overwrite the
64 // global min/max values and offsets with an updated value of a non global
65 // min/max. To ensure this, after each block global min/max are calculated
66 // horizontally across from min/max per lane values, and these new global
67 // min/max values copied to all lanes and are used to compare values with
68 // in the next block.
69 //
70 // Example of 'min' for hypothetical 4 lanes and uint8_t type and
71 // max_block_size() = 2.
72 //
73 // Initial state:
74 // vrunning_offset_: [ 0x01, 0x01, 0x01, 0x01 ]
75 // vmin_offsets_: [ 0x00, 0x00, 0x00, 0x00 ]
76 // vmin_offsets_new_: [ 0x00, 0x00, 0x00, 0x00 ]
77 // vmin_new_: [ 0xFF, 0xFF, 0xFF, 0xFF ]
78 // vmin_: [ 0xFF, 0xFF, 0xFF, 0xFF ]
79 //
80 // 1st block:
81 // -> vector_path()
82 // vec: [ 0x12, 0xFF, 0x20, 0x42 ]
83 // is_smaller: [ 0xFF, 0x00, 0xFF, 0xFF ]
84 // vmin_offsets_new_: [ 0x01, 0x00, 0x01, 0x01 ]
85 // vmin_new_: [ 0x12, 0xFF, 0x20, 0x42 ]
86 // vrunning_offset_: [ 0x02, 0x02, 0x02, 0x02 ]
87 // -> vector_path()
88 // vec: [ 0x22, 0xFF, 0x55, 0x23 ]
89 // is_smaller: [ 0x00, 0x00, 0x00, 0xFF ]
90 // vmin_offsets_new_: [ 0x01, 0x00, 0x01, 0x02 ]
91 // vmin_new_: [ 0x12, 0xFF, 0x00, 0x23 ]
92 // vrunning_offset_: [ 0x03, 0x03, 0x03, 0x03 ]
93 // -> on_block_finished(1)
94 // vno_update: [ 0x00, 0xFF, 0x00, 0x00 ]
95 // vmin_offsets_: [ 0x01, 0x00, 0x01, 0x02 ]
96 // vmin_: [ 0x12, 0xFF, 0x20, 0x23 ]
97 // vmin_new_: [ 0x12, 0x12, 0x12, 0x12 ]
98
99 namespace kleidicv::neon {
100
101 template <typename ScalarType>
102 class MinMaxLoc final : public UnrollTwice {
103 public:
104 using VecTraits = neon::VecTraits<ScalarType>;
105 using VectorType = typename VecTraits::VectorType;
106 using UnsignedVectorType =
107 typename neon::VecTraits<std::make_unsigned_t<ScalarType>>::VectorType;
108
109 // NOLINTBEGIN(cppcoreguidelines-prefer-member-initializer)
110 444 MinMaxLoc() {
111 444 vmin_offsets_ = vmax_offsets_ = vmin_offsets_new_ = vmax_offsets_new_ =
112 444 vdupq_n(ScalarType{0});
113 444 vrunning_offset_ = vdupq_n(ScalarType{1});
114 444 vmin_ = vmin_new_ = vdupq_n(std::numeric_limits<ScalarType>::max());
115 444 vmax_ = vmax_new_ = vdupq_n(std::numeric_limits<ScalarType>::lowest());
116 444 min_block_index_ = max_block_index_ = 0;
117 444 min_vector_ = min_scalar_ = std::numeric_limits<ScalarType>::max();
118 444 max_vector_ = max_scalar_ = std::numeric_limits<ScalarType>::lowest();
119 444 min_scalar_index_ = max_scalar_index_ = 0;
120 444 running_index_ = 0;
121 444 }
122 // NOLINTEND(cppcoreguidelines-prefer-member-initializer)
123
124 3624 void vector_path(VectorType src) {
125 3624 VectorType v_is_smaller = vcltq_u8(src, vmin_new_);
126 3624 vmin_offsets_new_ =
127 3624 vbslq_u8(v_is_smaller, vrunning_offset_, vmin_offsets_new_);
128 3624 vmin_new_ = vminq_u8(vmin_new_, src);
129
130 3624 VectorType v_is_bigger = vcgtq_u8(src, vmax_new_);
131 3624 vmax_offsets_new_ =
132 3624 vbslq_u8(v_is_bigger, vrunning_offset_, vmax_offsets_new_);
133 3624 vmax_new_ = vmaxq_u8(vmax_new_, src);
134
135 3624 vrunning_offset_ = vaddq_u8(vrunning_offset_, vdupq_n(ScalarType{1}));
136 3624 }
137
138 13380 void scalar_path(ScalarType src) {
139
2/2
✓ Branch 0 taken 12783 times.
✓ Branch 1 taken 597 times.
13380 if (src < min_scalar_) {
140 597 min_scalar_ = src;
141 597 min_scalar_index_ = running_index_;
142 597 }
143
144
2/2
✓ Branch 0 taken 12692 times.
✓ Branch 1 taken 688 times.
13380 if (src > max_scalar_) {
145 688 max_scalar_ = src;
146 688 max_scalar_index_ = running_index_;
147 688 }
148
149 13380 ++running_index_;
150 13380 }
151
152 549 void on_block_finished(size_t vectors_in_block) {
153 549 min_vector_ = vminvq_u8(vmin_new_);
154 // Does any lane have a new min? Then update the block number
155 // 32-bit horizontal max is the fastest way to do a logical OR
156
2/2
✓ Branch 0 taken 373 times.
✓ Branch 1 taken 176 times.
549 min_block_index_ = vmaxvq_u32(vreinterpretq_u32_u8(vmin_offsets_new_))
157 373 ? running_index_
158 176 : min_block_index_;
159 // Which lane is zero, that was not updated in last block
160 549 uint8x16_t vno_update = vceqzq_u8(vmin_offsets_new_);
161 // Copy the updated (new min!) offsets into the ultimate offset vector
162 549 vmin_offsets_ = vbslq_u8(vno_update, vmin_offsets_, vmin_offsets_new_);
163 // Copy the updated new min values into the ultimate min vector
164 549 vmin_ = vbslq_u8(vno_update, vmin_, vmin_new_);
165 // Copy new min to all lanes, to prevent overwriting it with inferior values
166 549 vmin_new_ = vdupq_n(min_vector_);
167
168 // Just like min, but for max
169 549 max_vector_ = vmaxvq_u8(vmax_new_);
170
2/2
✓ Branch 0 taken 385 times.
✓ Branch 1 taken 164 times.
549 max_block_index_ = vmaxvq_u32(vreinterpretq_u32_u8(vmax_offsets_new_))
171 385 ? running_index_
172 164 : max_block_index_;
173 549 vno_update = vceqzq_u8(vmax_offsets_new_);
174 549 vmax_offsets_ = vbslq_u8(vno_update, vmax_offsets_, vmax_offsets_new_);
175 549 vmax_ = vbslq_u8(vno_update, vmax_, vmax_new_);
176 549 vmax_new_ = vdupq_n(max_vector_);
177
178 549 running_index_ += vectors_in_block * VecTraits::num_lanes();
179 // Starts from 1, so if an offset is updated, we can tell from it is nonzero
180 549 vrunning_offset_ = vdupq_n(ScalarType{1});
181 // Offsets are zeroed so we'll see when they updated
182 549 vmax_offsets_new_ = vmin_offsets_new_ = vdupq_n(ScalarType{0});
183 549 }
184
185 624 size_t max_vectors_per_block() const {
186 // -1: see description of vrunning_offset_
187 624 return std::numeric_limits<ScalarType>::max() - 1;
188 }
189
190 258 size_t min_index() const {
191 // Which minimum is the smaller one, scalar path or vector path?
192
2/2
✓ Branch 0 taken 91 times.
✓ Branch 1 taken 167 times.
258 if (min_scalar_ < min_vector_) {
193 91 return min_scalar_index_;
194 }
195
196 // To find the minimum location:
197 // 1. Get the lanes that have the same min value as the global min.
198 // 2. From the offsets of these lanes, find the smallest one.
199 // 3. From the smallest ones (it can happen if they were in the same vector)
200 // take the first one (LSB, because little endian)
201 // Example offsets: [ 0x01, 0x00, 0x01, 0x02, 0x01, 0x03 ]
202 // vmin_: [ 0x12, 0xFF, 0x20, 0x23, 0x12, 0x12 ]
203 // Fill not selected offsets with maximum value, they won't be the minimum
204 334 const uint8x16_t v_ignore = vdupq_n(static_cast<ScalarType>(
205 167 std::numeric_limits<std::make_unsigned_t<ScalarType>>::max()));
206 // [1] Test which lanes contain the global minimum value
207 // is_min: [ 0xFF, 0x00, 0x00, 0x00, 0xFF, 0xFF ]
208 167 VectorType is_min = vcleq(vmin_, vmin_new_);
209 // Select those offsets that contain the minimum
210 // selected_offsets: [ 0x01, 0xFF, 0xFF, 0xFF, 0x01, 0x03 ]
211 334 UnsignedVectorType v_selected_offsets =
212 167 vbslq_u8(is_min, vmin_offsets_, v_ignore);
213 // [2] Take the smallest offset, that's the earliest, that is needed
214 // earliest_offset: 0x01
215 167 ScalarType earliest_offset = vminvq(v_selected_offsets);
216 // In the rare case the vector_path did not run, no selected lane
217 // so the scalar_min is the global min (e.g. when src[0] is the min)
218
2/2
✓ Branch 0 taken 153 times.
✓ Branch 1 taken 14 times.
167 if (earliest_offset == 0) {
219 14 return min_scalar_index_;
220 }
221
222 // [3] Calculate the lane for the first one
223 // lane: 0 (smaller than 4)
224 153 size_t lane = lane_of_first_smallest(v_selected_offsets, earliest_offset);
225 459 size_t min_vector_index = min_block_index_ +
226 306 (earliest_offset - 1) * VecTraits::num_lanes() +
227 153 lane;
228 // If the minimum value is not less, then they are equal, so the earlier one
229 // is the winner
230
4/4
✓ Branch 0 taken 44 times.
✓ Branch 1 taken 109 times.
✓ Branch 2 taken 40 times.
✓ Branch 3 taken 4 times.
153 return (min_vector_ < min_scalar_ || min_vector_index < min_scalar_index_
231 149 ? min_vector_index
232 4 : min_scalar_index_);
233 258 }
234
235 // Just like min, but for max
236 258 size_t max_index() const {
237
2/2
✓ Branch 0 taken 105 times.
✓ Branch 1 taken 153 times.
258 if (max_scalar_ > max_vector_) {
238 105 return max_scalar_index_;
239 }
240
241 306 const uint8x16_t v_ignore = vdupq_n(static_cast<ScalarType>(
242 153 std::numeric_limits<std::make_unsigned_t<ScalarType>>::max()));
243 153 VectorType is_max = vcgeq(vmax_, vmax_new_);
244 306 UnsignedVectorType v_selected_offsets =
245 153 vbslq_u8(is_max, vmax_offsets_, v_ignore);
246 153 ScalarType earliest_offset = vminvq(v_selected_offsets);
247
2/2
✓ Branch 0 taken 129 times.
✓ Branch 1 taken 24 times.
153 if (earliest_offset == 0) {
248 24 return max_scalar_index_;
249 }
250
251 129 size_t lane = lane_of_first_smallest(v_selected_offsets, earliest_offset);
252 387 size_t max_vector_index = max_block_index_ +
253 258 (earliest_offset - 1) * VecTraits::num_lanes() +
254 129 lane;
255
4/4
✓ Branch 0 taken 38 times.
✓ Branch 1 taken 91 times.
✓ Branch 2 taken 34 times.
✓ Branch 3 taken 4 times.
129 return (max_vector_ > max_scalar_ || max_vector_index < max_scalar_index_
256 125 ? max_vector_index
257 4 : max_scalar_index_);
258 258 }
259
260 private:
261 282 inline static size_t lane_of_first_smallest(UnsignedVectorType v_values,
262 ScalarType smallest) {
263 // Flag all values that equal to the smallest one
264 // example values: [ 0x01, 0xFF, 0xFF, 0xFF, 0x01, 0x03 ]
265 // v_smallest: [ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 ]
266 282 UnsignedVectorType v_smallest = vdupq_n(smallest);
267 // v_flags: [ 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00 ]
268 282 UnsignedVectorType v_flags = vcleq(v_values, v_smallest);
269 // Find the leftmost of them all
270 282 uint64_t halfvector = vgetq_lane_u64(vreinterpretq_u64(v_flags), 0);
271 282 size_t lane = !halfvector * (sizeof(uint64_t) / sizeof(ScalarType));
272 // If the left half is 0, it must be on the right side, take that 64bit
273
2/2
✓ Branch 0 taken 256 times.
✓ Branch 1 taken 26 times.
282 if (lane) {
274 26 halfvector = vgetq_lane_u64(vreinterpretq_u64(v_flags), 1);
275 26 }
276
277 // Counting the trailing zeroes will tell us where is the leftmost element
278 // (Little Endian) ---> example returns 0, this is the leftmost byte
279 282 lane += __builtin_ctzll(halfvector) / (CHAR_BIT * sizeof(ScalarType));
280 564 return lane;
281 282 }
282
283 // Either the actual min/max in the current block or so far global min/max,
284 // per lane.
285 VectorType vmin_new_, vmax_new_;
286 // Offsets of the vectors where the last min/max values were seen, per lane.
287 // It contains values from 0 (no new) to max_vectors_per_block().
288 UnsignedVectorType vmin_offsets_new_, vmax_offsets_new_;
289 // Actual offset of the vector on the vector path, copied to each lane.
290 // It contains values from 1 to max_vectors_per_block()
291 UnsignedVectorType vrunning_offset_;
292 // Global min/max per lane.
293 VectorType vmin_, vmax_;
294 // Offsets per lane where the first global min/max value was seen.
295 UnsignedVectorType vmin_offsets_, vmax_offsets_;
296 // Index of the current vector block or scalar element.
297 uint64_t running_index_;
298 // Index of the current vector block where the first global min/max value was
299 // seen.
300 uint64_t min_block_index_, max_block_index_;
301 // Index of global scalar min and max elements.
302 uint64_t min_scalar_index_, max_scalar_index_;
303 // Global min/max on vector path.
304 ScalarType min_vector_, max_vector_;
305 // Global min/max on scalar path.
306 ScalarType min_scalar_, max_scalar_;
307 }; // end of class MinMaxLoc<T>
308
309 template <typename ScalarType>
310 546 kleidicv_error_t min_max_loc(const ScalarType *src, size_t src_stride,
311 size_t width, size_t height, size_t *min_offset,
312 size_t *max_offset) {
313
4/4
✓ Branch 0 taken 90 times.
✓ Branch 1 taken 456 times.
✓ Branch 2 taken 90 times.
✓ Branch 3 taken 456 times.
546 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
314
6/6
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 453 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 450 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 450 times.
456 CHECK_IMAGE_SIZE(width, height);
315
316
4/4
✓ Branch 0 taken 3 times.
✓ Branch 1 taken 447 times.
✓ Branch 2 taken 6 times.
✓ Branch 3 taken 444 times.
450 if (KLEIDICV_UNLIKELY(width == 0 || height == 0)) {
317 6 return KLEIDICV_ERROR_RANGE;
318 }
319
320 444 Rectangle rect{width, height};
321 444 Rows<const ScalarType> src_rows{src, src_stride};
322 444 MinMaxLoc<ScalarType> operation;
323 444 apply_block_operation_by_rows(operation, rect, src_rows);
324
2/2
✓ Branch 0 taken 186 times.
✓ Branch 1 taken 258 times.
444 if (min_offset) {
325 258 *min_offset = src_rows.offset_for_index(operation.min_index(), width);
326 258 }
327
328
2/2
✓ Branch 0 taken 186 times.
✓ Branch 1 taken 258 times.
444 if (max_offset) {
329 258 *max_offset = src_rows.offset_for_index(operation.max_index(), width);
330 258 }
331 444 return KLEIDICV_OK;
332 546 }
333
334 #define KLEIDICV_INSTANTIATE_TEMPLATE(type) \
335 template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t min_max_loc<type>( \
336 const type *src, size_t src_stride, size_t width, size_t height, \
337 size_t *min_offset, size_t *max_offset)
338
339 KLEIDICV_INSTANTIATE_TEMPLATE(uint8_t);
340
341 } // namespace kleidicv::neon
342