KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/float_conv_neon.cpp
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 123 123 100.0%
Functions: 12 12 100.0%
Branches: 80 80 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cmath>
6
7 #include "kleidicv/conversions/float_conversion.h"
8 #include "kleidicv/neon.h"
9
10 namespace kleidicv::neon {
11
12 template <typename InputType, typename OutputType>
13 class float_conversion_operation;
14
15 template <typename OutputType>
16 class float_conversion_operation<float, OutputType> {
17 public:
18 using SrcVecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<float>;
19 using SrcVectorType = typename SrcVecTraits::VectorType;
20 using IntermediateVecTraits = KLEIDICV_TARGET_NAMESPACE::VecTraits<
21 std::conditional_t<std::is_signed_v<OutputType>, int32_t, uint32_t>>;
22 using IntermediateVectorType = typename IntermediateVecTraits::VectorType;
23
24 256 void process_row(size_t width, Columns<const float> src,
25 Columns<OutputType> dst) {
26 512 LoopUnroll{width, SrcVecTraits::num_lanes()}
27 2158 .unroll_twice([&](size_t step) {
28 1902 SrcVectorType src_vector1 = vld1q_f32(&src[0]);
29 3804 SrcVectorType src_vector2 =
30 1902 vld1q_f32(&src[SrcVecTraits::num_lanes()]);
31 3804 IntermediateVectorType result_vector1 =
32 1902 vector_path<OutputType>(src_vector1);
33 3804 IntermediateVectorType result_vector2 =
34 1902 vector_path<OutputType>(src_vector2);
35 3804 vst1(&dst[0], vqmovn(vcombine(vqmovn(result_vector1),
36 1902 vqmovn(result_vector2))));
37 1902 src += ptrdiff_t(step);
38 1902 dst += ptrdiff_t(step);
39 1902 })
40 408 .remaining([&](size_t length, size_t) {
41
4/4
✓ Branch 0 taken 76 times.
✓ Branch 1 taken 315 times.
✓ Branch 2 taken 76 times.
✓ Branch 3 taken 315 times.
782 for (size_t index = 0; index < length; ++index) {
42 630 disable_loop_vectorization();
43 630 float f = std::nearbyint(src[ptrdiff_t(index)]);
44
4/4
✓ Branch 0 taken 31 times.
✓ Branch 1 taken 284 times.
✓ Branch 2 taken 31 times.
✓ Branch 3 taken 284 times.
630 if (f > std::numeric_limits<OutputType>::max()) {
45 62 f = std::numeric_limits<OutputType>::max();
46
4/4
✓ Branch 0 taken 153 times.
✓ Branch 1 taken 131 times.
✓ Branch 2 taken 151 times.
✓ Branch 3 taken 133 times.
630 } else if (f < std::numeric_limits<OutputType>::lowest()) {
47 264 f = std::numeric_limits<OutputType>::lowest();
48 264 }
49 630 dst[index] = static_cast<OutputType>(f);
50 630 }
51 152 });
52 256 }
53
54 private:
55 template <
56 typename O,
57 std::enable_if_t<std::is_integral_v<O> && std::is_signed_v<O>, int> = 0>
58 1902 IntermediateVectorType vector_path(SrcVectorType src) {
59 1902 IntermediateVectorType result = vcvtnq_s32_f32(src);
60 3804 return result;
61 1902 }
62
63 template <
64 typename O,
65 std::enable_if_t<std::is_integral_v<O> && !std::is_signed_v<O>, int> = 0>
66 1902 IntermediateVectorType vector_path(SrcVectorType src) {
67 1902 IntermediateVectorType result = vcvtnq_u32_f32(src);
68 3804 return result;
69 1902 }
70 }; // end of class float_conversion_operation<float, OutputType>
71
72 template <typename InputType, typename OutputType>
73 160 kleidicv_error_t float_conversion(const InputType* src, size_t src_stride,
74 OutputType* dst, size_t dst_stride,
75 size_t width, size_t height) {
76
8/8
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 79 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 79 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 79 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 79 times.
160 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
77
8/8
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 78 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 78 times.
✓ Branch 4 taken 1 times.
✓ Branch 5 taken 78 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 78 times.
158 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
78
12/12
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 77 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 75 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 75 times.
✓ Branch 6 taken 1 times.
✓ Branch 7 taken 77 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 75 times.
✓ Branch 10 taken 3 times.
✓ Branch 11 taken 75 times.
156 CHECK_IMAGE_SIZE(width, height);
79
80 150 float_conversion_operation<InputType, OutputType> operation;
81 150 Rectangle rect{width, height};
82 150 Rows<const InputType> src_rows{src, src_stride};
83 150 Rows<OutputType> dst_rows{dst, dst_stride};
84 150 zip_rows(operation, rect, src_rows, dst_rows);
85
86 150 return KLEIDICV_OK;
87 160 }
88
89 80 kleidicv_error_t f32_to_s8(const float* src, size_t src_stride, int8_t* dst,
90 size_t dst_stride, size_t width, size_t height) {
91 80 return float_conversion(src, src_stride, dst, dst_stride, width, height);
92 }
93
94 80 kleidicv_error_t f32_to_u8(const float* src, size_t src_stride, uint8_t* dst,
95 size_t dst_stride, size_t width, size_t height) {
96 80 return float_conversion(src, src_stride, dst, dst_stride, width, height);
97 }
98
99 KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t
100 80 s8_to_f32(const int8_t* src, size_t src_stride, float* dst, size_t dst_stride,
101 size_t width, size_t height) {
102
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 79 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 79 times.
80 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
103
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 78 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 78 times.
79 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
104
6/6
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 77 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 75 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 75 times.
78 CHECK_IMAGE_SIZE(width, height);
105
106 // Indices used with the TBL instruction to widen from 8-bit to 32-bit in a
107 // single instruction.
108 150 const uint8x16_t index0 = vcombine_u8(vcreate_u8(0x01ffffff00ffffffULL),
109 75 vcreate_u8(0x03ffffff02ffffffULL));
110 150 const uint8x16_t index1 = vcombine_u8(vcreate_u8(0x05ffffff04ffffffULL),
111 75 vcreate_u8(0x07ffffff06ffffffULL));
112 150 const uint8x16_t index2 = vcombine_u8(vcreate_u8(0x09ffffff08ffffffULL),
113 75 vcreate_u8(0x0bffffff0affffffULL));
114 150 const uint8x16_t index3 = vcombine_u8(vcreate_u8(0x0dffffff0cffffffULL),
115 75 vcreate_u8(0x0fffffff0effffffULL));
116
2/2
✓ Branch 0 taken 1495 times.
✓ Branch 1 taken 75 times.
1570 for (size_t y = 0; y != height; ++y) {
117 1495 size_t x = 0;
118
2/2
✓ Branch 0 taken 156 times.
✓ Branch 1 taken 1495 times.
1651 for (; x + 16 <= width; x += 16) {
119 156 int8x16_t input = vld1q(src + x);
120 // Widen from 8-bit to 32-bit and shift right 24 bits instead of
121 // sign-extending.
122 156 int32x4_t a = vreinterpretq_s32_s8(vqtbl1q_s8(input, index0));
123 156 int32x4_t b = vreinterpretq_s32_s8(vqtbl1q_s8(input, index1));
124 156 int32x4_t c = vreinterpretq_s32_s8(vqtbl1q_s8(input, index2));
125 156 int32x4_t d = vreinterpretq_s32_s8(vqtbl1q_s8(input, index3));
126 // Convert to float and divide by 2^24.
127
128 156 float32x4x4_t output = {
129 624 vcvtq_n_f32_s32(a, 24),
130 156 vcvtq_n_f32_s32(b, 24),
131 156 vcvtq_n_f32_s32(c, 24),
132 156 vcvtq_n_f32_s32(d, 24),
133 };
134 156 neon::VecTraits<float>::store(output, dst + x);
135 156 }
136
2/2
✓ Branch 0 taken 5400 times.
✓ Branch 1 taken 1495 times.
6895 for (; x != width; ++x) {
137 5400 disable_loop_vectorization();
138 5400 dst[x] = src[x];
139 5400 }
140
141 1495 src += static_cast<ptrdiff_t>(src_stride / sizeof(*src));
142 1495 dst += static_cast<ptrdiff_t>(dst_stride / sizeof(*dst));
143 1495 }
144 75 return KLEIDICV_OK;
145 80 }
146
147 KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t
148 80 u8_to_f32(const uint8_t* src, size_t src_stride, float* dst, size_t dst_stride,
149 size_t width, size_t height) {
150
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 79 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 79 times.
80 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
151
4/4
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 78 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 78 times.
79 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
152
6/6
✓ Branch 0 taken 1 times.
✓ Branch 1 taken 77 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 75 times.
✓ Branch 4 taken 3 times.
✓ Branch 5 taken 75 times.
78 CHECK_IMAGE_SIZE(width, height);
153
154 // Indices used with the TBL instruction to widen from 8-bit to 32-bit in a
155 // single instruction.
156 150 const uint8x16_t index0 = vcombine_u8(vcreate_u8(0xffffff01ffffff00ULL),
157 75 vcreate_u8(0xffffff03ffffff02ULL));
158 150 const uint8x16_t index1 = vcombine_u8(vcreate_u8(0xffffff05ffffff04ULL),
159 75 vcreate_u8(0xffffff07ffffff06ULL));
160 150 const uint8x16_t index2 = vcombine_u8(vcreate_u8(0xffffff09ffffff08ULL),
161 75 vcreate_u8(0xffffff0bffffff0aULL));
162 150 const uint8x16_t index3 = vcombine_u8(vcreate_u8(0xffffff0dffffff0cULL),
163 75 vcreate_u8(0xffffff0fffffff0eULL));
164
2/2
✓ Branch 0 taken 1495 times.
✓ Branch 1 taken 75 times.
1570 for (size_t y = 0; y != height; ++y) {
165 1495 size_t x = 0;
166
2/2
✓ Branch 0 taken 156 times.
✓ Branch 1 taken 1495 times.
1651 for (; x + 16 <= width; x += 16) {
167 156 uint8x16_t input = vld1q(src + x);
168 // Widen from 8-bit to 32-bit
169 156 uint32x4_t a = vreinterpretq_u32_u8(vqtbl1q_u8(input, index0));
170 156 uint32x4_t b = vreinterpretq_u32_u8(vqtbl1q_u8(input, index1));
171 156 uint32x4_t c = vreinterpretq_u32_u8(vqtbl1q_u8(input, index2));
172 156 uint32x4_t d = vreinterpretq_u32_u8(vqtbl1q_u8(input, index3));
173
174 156 float32x4x4_t output = {
175 624 vcvtq_f32_u32(a),
176 156 vcvtq_f32_u32(b),
177 156 vcvtq_f32_u32(c),
178 156 vcvtq_f32_u32(d),
179 };
180 156 neon::VecTraits<float>::store(output, dst + x);
181 156 }
182
2/2
✓ Branch 0 taken 5400 times.
✓ Branch 1 taken 1495 times.
6895 for (; x != width; ++x) {
183 5400 disable_loop_vectorization();
184 5400 dst[x] = src[x];
185 5400 }
186
187 1495 src += static_cast<ptrdiff_t>(src_stride / sizeof(*src));
188 1495 dst += static_cast<ptrdiff_t>(dst_stride / sizeof(*dst));
189 1495 }
190 75 return KLEIDICV_OK;
191 80 }
192
193 } // namespace kleidicv::neon
194