KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/yuv444_to_rgb_neon.cpp
Date: 2026-01-20 20:58:59
Exec Total Coverage
Lines: 112 112 100.0%
Functions: 21 21 100.0%
Branches: 61 61 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <utility>
6
7 #include "kleidicv/conversions/yuv_to_rgb.h"
8 #include "kleidicv/ctypes.h"
9 #include "kleidicv/kleidicv.h"
10 #include "kleidicv/neon.h"
11 #include "yuv444_coefficients.h"
12
13 namespace kleidicv::neon {
14
15 template <bool BGR, bool kAlpha>
16 class YUVToRGBAll final : public UnrollOnce, public TryToAvoidTailLoop {
17 public:
18 using VecTraits = neon::VecTraits<uint8_t>;
19 using ScalarType = VecTraits::ScalarType;
20 using VectorType = VecTraits::VectorType;
21 using Vector3Type = VecTraits::Vector3Type;
22 using RawDestinationVectorType =
23 typename std::conditional<kAlpha, uint8x16x4_t, uint8x16x3_t>::type;
24
25 336 explicit YUVToRGBAll()
26 336 : b_delta4_(vdupq_n_u32(kBDelta4)),
27 336 g_delta4_(vdupq_n_u32(kGDelta4)),
28 336 r_delta4_(vdupq_n_u32(kRDelta4)) {}
29
30 // Returns the number of channels in the output image.
31 260 static constexpr size_t output_channels() {
32 260 return kAlpha ? /* RGBA */ 4 : /* RGB */ 3;
33 }
34
35 KLEIDICV_FORCE_INLINE
36 1864 void vector_path(const ScalarType *src, ScalarType *dst) {
37 // Load deinterleaved
38 1864 Vector3Type vsrc = vld3q_u8(src);
39 1864 int16x8_t y_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[0], vdupq_n_u8(0)));
40 1864 int16x8_t y_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[0], vdupq_n_u8(0)));
41 3728 int16x8_t u4_l =
42 1864 vreinterpretq_s16_u16(vshll_n_u8(vget_low_u8(vsrc.val[1]), kPreShift));
43 3728 int16x8_t u4_h =
44 1864 vreinterpretq_s16_u16(vshll_high_n_u8(vsrc.val[1], kPreShift));
45 3728 int16x8_t v4_l =
46 1864 vreinterpretq_s16_u16(vshll_n_u8(vget_low_u8(vsrc.val[2]), kPreShift));
47 3728 int16x8_t v4_h =
48 1864 vreinterpretq_s16_u16(vshll_high_n_u8(vsrc.val[2], kPreShift));
49 1864 uint8x16_t r, g, b;
50
51 // Compute B value in 32-bit precision
52 {
53 // Multiplication is done with uint16_t because UBWeight only fits in
54 // unsigned 16-bit
55 3728 int32x4_t b_ll = vreinterpretq_s32_u32(vmull_n_u16(
56 1864 vget_low_u16(vreinterpretq_u16_s16(u4_l)), kUnsignedUBWeight));
57 3728 int32x4_t b_hl = vreinterpretq_s32_u32(vmull_n_u16(
58 1864 vget_low_u16(vreinterpretq_u16_s16(u4_h)), kUnsignedUBWeight));
59 3728 int32x4_t b_lh = vreinterpretq_s32_u32(
60 1864 vmull_high_n_u16(vreinterpretq_u16_s16(u4_l), kUnsignedUBWeight));
61 3728 int32x4_t b_hh = vreinterpretq_s32_u32(
62 1864 vmull_high_n_u16(vreinterpretq_u16_s16(u4_h), kUnsignedUBWeight));
63
64 1864 b_ll = vaddq(b_ll, b_delta4_);
65 1864 b_hl = vaddq(b_hl, b_delta4_);
66 1864 b_lh = vaddq(b_lh, b_delta4_);
67 1864 b_hh = vaddq(b_hh, b_delta4_);
68
69 3728 int16x8_t b_l = vaddq(y_l, vuzp2q_s16(vreinterpretq_s16_s32(b_ll),
70 1864 vreinterpretq_s16_s32(b_lh)));
71 3728 int16x8_t b_h = vaddq(y_h, vuzp2q_s16(vreinterpretq_s16_s32(b_hl),
72 1864 vreinterpretq_s16_s32(b_hh)));
73
74 1864 b = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
75 1864 }
76
77 // Compute G value in 32-bit precision
78 {
79 1864 int32x4_t g_ll = vmlal_n_s16(g_delta4_, vget_low_s16(u4_l), kUGWeight);
80 1864 int32x4_t g_hl = vmlal_n_s16(g_delta4_, vget_low_s16(u4_h), kUGWeight);
81 1864 int32x4_t g_lh = vmlal_high_n_s16(g_delta4_, u4_l, kUGWeight);
82 1864 int32x4_t g_hh = vmlal_high_n_s16(g_delta4_, u4_h, kUGWeight);
83
84 1864 g_ll = vmlal_n_s16(g_ll, vget_low_s16(v4_l), kVGWeight);
85 1864 g_hl = vmlal_n_s16(g_hl, vget_low_s16(v4_h), kVGWeight);
86 1864 g_lh = vmlal_high_n_s16(g_lh, v4_l, kVGWeight);
87 1864 g_hh = vmlal_high_n_s16(g_hh, v4_h, kVGWeight);
88
89 3728 int16x8_t g_l = vaddq(y_l, vuzp2q_s16(vreinterpretq_s16_s32(g_ll),
90 1864 vreinterpretq_s16_s32(g_lh)));
91 3728 int16x8_t g_h = vaddq(y_h, vuzp2q_s16(vreinterpretq_s16_s32(g_hl),
92 1864 vreinterpretq_s16_s32(g_hh)));
93
94 1864 g = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
95 1864 }
96
97 // Compute R value in 32-bit precision
98 {
99 1864 int32x4_t r_ll = vmlal_n_s16(r_delta4_, vget_low_s16(v4_l), kVRWeight);
100 1864 int32x4_t r_hl = vmlal_n_s16(r_delta4_, vget_low_s16(v4_h), kVRWeight);
101 1864 int32x4_t r_lh = vmlal_high_n_s16(r_delta4_, v4_l, kVRWeight);
102 1864 int32x4_t r_hh = vmlal_high_n_s16(r_delta4_, v4_h, kVRWeight);
103
104 3728 int16x8_t r_l = vaddq(y_l, vuzp2q_s16(vreinterpretq_s16_s32(r_ll),
105 1864 vreinterpretq_s16_s32(r_lh)));
106 3728 int16x8_t r_h = vaddq(y_h, vuzp2q_s16(vreinterpretq_s16_s32(r_hl),
107 1864 vreinterpretq_s16_s32(r_hh)));
108
109 1864 r = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
110 1864 }
111
112 1864 RawDestinationVectorType rgb;
113 1864 rgb.val[r_index_] = r;
114 1864 rgb.val[g_index_] = g;
115 1864 rgb.val[b_index_] = b;
116 if constexpr (kAlpha) {
117 932 rgb.val[alpha_index_] = vdupq_n_u8(alpha_value);
118 // Store interleaved RGBA pixels to memory.
119 932 vst4q_u8(dst, rgb);
120 } else {
121 // Store interleaved RGB pixels to memory.
122 932 vst3q_u8(dst, rgb);
123 }
124 1864 }
125
126 KLEIDICV_FORCE_INLINE
127 492 void scalar_path(const ScalarType *src, ScalarType *dst) {
128 492 int32_t y = static_cast<int32_t>(src[0]);
129 492 int32_t u = static_cast<int32_t>(src[1]);
130 492 int32_t v = static_cast<int32_t>(src[2]);
131 492 int32_t b = y + rounding_shift_right((u - 128) * kUBWeight, kWeightScale);
132 984 int32_t g =
133 492 y + rounding_shift_right((u - 128) * kUGWeight + (v - 128) * kVGWeight,
134 kWeightScale);
135 492 int32_t r = y + rounding_shift_right((v - 128) * kVRWeight, kWeightScale);
136 492 dst[r_index_] = saturating_cast<int32_t, uint8_t>(r);
137 492 dst[g_index_] = saturating_cast<int32_t, uint8_t>(g);
138 492 dst[b_index_] = saturating_cast<int32_t, uint8_t>(b);
139 if constexpr (kAlpha) {
140 246 dst[alpha_index_] = alpha_value;
141 }
142 492 }
143
144 private:
145 static constexpr size_t r_index_ = BGR ? 2 : 0;
146 static constexpr size_t g_index_ = 1;
147 static constexpr size_t b_index_ = BGR ? 0 : 2;
148 static constexpr size_t alpha_index_ = 3;
149 static constexpr uint8_t alpha_value = std::numeric_limits<uint8_t>::max();
150 int32x4_t b_delta4_, g_delta4_, r_delta4_;
151 }; // end of class YUVToRGBAll<bool BGR, bool kAlpha>
152
153 template <typename OperationType, typename ScalarType>
154 336 KLEIDICV_FORCE_INLINE kleidicv_error_t yuv2rgb_operation(
155 OperationType &operation, const ScalarType *src, size_t src_stride,
156 ScalarType *dst, size_t dst_stride, size_t width, size_t height) {
157
16/16
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 80 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 80 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 80 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 80 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 80 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 80 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 80 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 80 times.
336 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
158
16/16
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 76 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 76 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 76 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 76 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 76 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 76 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 76 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 76 times.
320 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
159
24/24
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 70 times.
✓ Branch 2 taken 5 times.
✓ Branch 3 taken 65 times.
✓ Branch 4 taken 11 times.
✓ Branch 5 taken 65 times.
✓ Branch 6 taken 6 times.
✓ Branch 7 taken 70 times.
✓ Branch 8 taken 5 times.
✓ Branch 9 taken 65 times.
✓ Branch 10 taken 11 times.
✓ Branch 11 taken 65 times.
✓ Branch 12 taken 6 times.
✓ Branch 13 taken 70 times.
✓ Branch 14 taken 5 times.
✓ Branch 15 taken 65 times.
✓ Branch 16 taken 11 times.
✓ Branch 17 taken 65 times.
✓ Branch 18 taken 6 times.
✓ Branch 19 taken 70 times.
✓ Branch 20 taken 5 times.
✓ Branch 21 taken 65 times.
✓ Branch 22 taken 11 times.
✓ Branch 23 taken 65 times.
304 CHECK_IMAGE_SIZE(width, height);
160
161 260 Rectangle rect{width, height};
162 260 Rows src_rows{src, src_stride, 3};
163 260 Rows dst_rows{dst, dst_stride, operation.output_channels()};
164
165 260 apply_operation_by_rows(operation, rect, src_rows, dst_rows);
166 260 return KLEIDICV_OK;
167 336 }
168
169 using YUVToRGB = YUVToRGBAll<false, false>;
170 using YUVToRGBA = YUVToRGBAll<false, true>;
171 using YUVToBGR = YUVToRGBAll<true, false>;
172 using YUVToBGRA = YUVToRGBAll<true, true>;
173
174 KLEIDICV_TARGET_FN_ATTRS
175 360 kleidicv_error_t yuv444_to_rgb_u8(const uint8_t *src, size_t src_stride,
176 uint8_t *dst, size_t dst_stride, size_t width,
177 size_t height,
178 kleidicv_color_conversion_t color_format) {
179
5/5
✓ Branch 0 taken 84 times.
✓ Branch 1 taken 24 times.
✓ Branch 2 taken 84 times.
✓ Branch 3 taken 84 times.
✓ Branch 4 taken 84 times.
360 switch (color_format) {
180 case KLEIDICV_YUV444_TO_RGB: {
181 84 YUVToRGB operation;
182 168 return yuv2rgb_operation(operation, src, src_stride, dst, dst_stride,
183 84 width, height);
184 84 }
185
186 case KLEIDICV_YUV444_TO_BGR: {
187 84 YUVToBGR operation;
188 168 return yuv2rgb_operation(operation, src, src_stride, dst, dst_stride,
189 84 width, height);
190 84 }
191
192 case KLEIDICV_YUV444_TO_RGBA: {
193 84 YUVToRGBA operation;
194 168 return yuv2rgb_operation(operation, src, src_stride, dst, dst_stride,
195 84 width, height);
196 84 }
197
198 case KLEIDICV_YUV444_TO_BGRA: {
199 84 YUVToBGRA operation;
200 168 return yuv2rgb_operation(operation, src, src_stride, dst, dst_stride,
201 84 width, height);
202 84 }
203
204 default:
205 24 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
206 }
207
208 return KLEIDICV_ERROR_NOT_IMPLEMENTED;
209 360 }
210
211 } // namespace kleidicv::neon
212