KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/yuv_to_rgb_neon.cpp
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 112 112 100.0%
Functions: 24 24 100.0%
Branches: 56 56 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <utility>
6
7 #include "kleidicv/conversions/yuv_to_rgb.h"
8 #include "kleidicv/ctypes.h"
9 #include "kleidicv/kleidicv.h"
10 #include "kleidicv/neon.h"
11
12 namespace kleidicv::neon {
13
14 template <bool BGR, bool kAlpha>
15 class YUVToRGBAll final : public UnrollOnce, public TryToAvoidTailLoop {
16 public:
17 using VecTraits = neon::VecTraits<uint8_t>;
18 using ScalarType = VecTraits::ScalarType;
19 using VectorType = VecTraits::VectorType;
20 using Vector3Type = VecTraits::Vector3Type;
21 using RawDestinationVectorType =
22 typename std::conditional<kAlpha, uint8x16x4_t, uint8x16x3_t>::type;
23
24 356 explicit YUVToRGBAll()
25 356 : b_delta4_(vdupq_n_u32(kBDelta4)),
26 356 g_delta4_(vdupq_n_u32(kGDelta4)),
27 356 r_delta4_(vdupq_n_u32(kRDelta4)) {}
28
29 // Returns the number of channels in the output image.
30 292 static constexpr size_t output_channels() {
31 292 return kAlpha ? /* RGBA */ 4 : /* RGB */ 3;
32 }
33
34 1880 void vector_path(const ScalarType *src, ScalarType *dst) {
35 // Load deinterleaved
36 1880 Vector3Type vsrc = vld3q_u8(src);
37 1880 int16x8_t y_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[0], vdupq_n_u8(0)));
38 1880 int16x8_t y_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[0], vdupq_n_u8(0)));
39 3760 int16x8_t u4_l =
40 1880 vreinterpretq_s16_u16(vshll_n_u8(vget_low_u8(vsrc.val[1]), kPreShift));
41 3760 int16x8_t u4_h =
42 1880 vreinterpretq_s16_u16(vshll_high_n_u8(vsrc.val[1], kPreShift));
43 3760 int16x8_t v4_l =
44 1880 vreinterpretq_s16_u16(vshll_n_u8(vget_low_u8(vsrc.val[2]), kPreShift));
45 3760 int16x8_t v4_h =
46 1880 vreinterpretq_s16_u16(vshll_high_n_u8(vsrc.val[2], kPreShift));
47 1880 uint8x16_t r, g, b;
48
49 // Compute B value in 32-bit precision
50 {
51 // Multiplication is done with uint16_t because UBWeight only fits in
52 // unsigned 16-bit
53 3760 int32x4_t b_ll = vreinterpretq_s32_u32(vmull_n_u16(
54 1880 vget_low_u16(vreinterpretq_u16_s16(u4_l)), kUnsignedUBWeight));
55 3760 int32x4_t b_hl = vreinterpretq_s32_u32(vmull_n_u16(
56 1880 vget_low_u16(vreinterpretq_u16_s16(u4_h)), kUnsignedUBWeight));
57 3760 int32x4_t b_lh = vreinterpretq_s32_u32(
58 1880 vmull_high_n_u16(vreinterpretq_u16_s16(u4_l), kUnsignedUBWeight));
59 3760 int32x4_t b_hh = vreinterpretq_s32_u32(
60 1880 vmull_high_n_u16(vreinterpretq_u16_s16(u4_h), kUnsignedUBWeight));
61
62 1880 b_ll = vaddq(b_ll, b_delta4_);
63 1880 b_hl = vaddq(b_hl, b_delta4_);
64 1880 b_lh = vaddq(b_lh, b_delta4_);
65 1880 b_hh = vaddq(b_hh, b_delta4_);
66
67 3760 int16x8_t b_l = vaddq(y_l, vuzp2q_s16(vreinterpretq_s16_s32(b_ll),
68 1880 vreinterpretq_s16_s32(b_lh)));
69 3760 int16x8_t b_h = vaddq(y_h, vuzp2q_s16(vreinterpretq_s16_s32(b_hl),
70 1880 vreinterpretq_s16_s32(b_hh)));
71
72 1880 b = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
73 1880 }
74
75 // Compute G value in 32-bit precision
76 {
77 1880 int32x4_t g_ll = vmlal_n_s16(g_delta4_, vget_low_s16(u4_l), kUGWeight);
78 1880 int32x4_t g_hl = vmlal_n_s16(g_delta4_, vget_low_s16(u4_h), kUGWeight);
79 1880 int32x4_t g_lh = vmlal_high_n_s16(g_delta4_, u4_l, kUGWeight);
80 1880 int32x4_t g_hh = vmlal_high_n_s16(g_delta4_, u4_h, kUGWeight);
81
82 1880 g_ll = vmlal_n_s16(g_ll, vget_low_s16(v4_l), kVGWeight);
83 1880 g_hl = vmlal_n_s16(g_hl, vget_low_s16(v4_h), kVGWeight);
84 1880 g_lh = vmlal_high_n_s16(g_lh, v4_l, kVGWeight);
85 1880 g_hh = vmlal_high_n_s16(g_hh, v4_h, kVGWeight);
86
87 3760 int16x8_t g_l = vaddq(y_l, vuzp2q_s16(vreinterpretq_s16_s32(g_ll),
88 1880 vreinterpretq_s16_s32(g_lh)));
89 3760 int16x8_t g_h = vaddq(y_h, vuzp2q_s16(vreinterpretq_s16_s32(g_hl),
90 1880 vreinterpretq_s16_s32(g_hh)));
91
92 1880 g = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
93 1880 }
94
95 // Compute R value in 32-bit precision
96 {
97 1880 int32x4_t r_ll = vmlal_n_s16(r_delta4_, vget_low_s16(v4_l), kVRWeight);
98 1880 int32x4_t r_hl = vmlal_n_s16(r_delta4_, vget_low_s16(v4_h), kVRWeight);
99 1880 int32x4_t r_lh = vmlal_high_n_s16(r_delta4_, v4_l, kVRWeight);
100 1880 int32x4_t r_hh = vmlal_high_n_s16(r_delta4_, v4_h, kVRWeight);
101
102 3760 int16x8_t r_l = vaddq(y_l, vuzp2q_s16(vreinterpretq_s16_s32(r_ll),
103 1880 vreinterpretq_s16_s32(r_lh)));
104 3760 int16x8_t r_h = vaddq(y_h, vuzp2q_s16(vreinterpretq_s16_s32(r_hl),
105 1880 vreinterpretq_s16_s32(r_hh)));
106
107 1880 r = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
108 1880 }
109
110 1880 RawDestinationVectorType rgb;
111 1880 rgb.val[r_index_] = r;
112 1880 rgb.val[g_index_] = g;
113 1880 rgb.val[b_index_] = b;
114 if constexpr (kAlpha) {
115 940 rgb.val[alpha_index_] = vdupq_n_u8(alpha_value);
116 // Store interleaved RGBA pixels to memory.
117 940 vst4q_u8(dst, rgb);
118 } else {
119 // Store interleaved RGB pixels to memory.
120 940 vst3q_u8(dst, rgb);
121 }
122 1880 }
123
124 412 void scalar_path(const ScalarType *src, ScalarType *dst) {
125 412 int32_t y = static_cast<int32_t>(src[0]);
126 412 int32_t u = static_cast<int32_t>(src[1]);
127 412 int32_t v = static_cast<int32_t>(src[2]);
128 412 int32_t b = y + rounding_shift_right((u - 128) * kUBWeight, kWeightScale);
129 824 int32_t g =
130 412 y + rounding_shift_right((u - 128) * kUGWeight + (v - 128) * kVGWeight,
131 kWeightScale);
132 412 int32_t r = y + rounding_shift_right((v - 128) * kVRWeight, kWeightScale);
133 412 dst[r_index_] = saturating_cast<int32_t, uint8_t>(r);
134 412 dst[g_index_] = saturating_cast<int32_t, uint8_t>(g);
135 412 dst[b_index_] = saturating_cast<int32_t, uint8_t>(b);
136 if constexpr (kAlpha) {
137 206 dst[alpha_index_] = alpha_value;
138 }
139 412 }
140
141 private:
142 static constexpr size_t r_index_ = BGR ? 2 : 0;
143 static constexpr size_t g_index_ = 1;
144 static constexpr size_t b_index_ = BGR ? 0 : 2;
145 static constexpr size_t alpha_index_ = 3;
146 static constexpr uint8_t alpha_value = std::numeric_limits<uint8_t>::max();
147 int32x4_t b_delta4_, g_delta4_, r_delta4_;
148 }; // end of class YUVToRGBAll<bool BGR>
149
150 template <typename OperationType, typename ScalarType>
151 356 kleidicv_error_t yuv2rgb_operation(OperationType &operation,
152 const ScalarType *src, size_t src_stride,
153 ScalarType *dst, size_t dst_stride,
154 size_t width, size_t height) {
155
16/16
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 85 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 85 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 85 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 85 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 85 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 85 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 85 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 85 times.
356 CHECK_POINTER_AND_STRIDE(src, src_stride, height);
156
16/16
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 81 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 81 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 81 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 81 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 81 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 81 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 81 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 81 times.
340 CHECK_POINTER_AND_STRIDE(dst, dst_stride, height);
157
24/24
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 77 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 73 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 73 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 77 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 73 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 73 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 77 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 73 times.
✓ Branch 16 taken 8 times.
✓ Branch 17 taken 73 times.
✓ Branch 18 taken 4 times.
✓ Branch 19 taken 77 times.
✓ Branch 20 taken 4 times.
✓ Branch 21 taken 73 times.
✓ Branch 22 taken 8 times.
✓ Branch 23 taken 73 times.
324 CHECK_IMAGE_SIZE(width, height);
158
159 292 Rectangle rect{width, height};
160 292 Rows src_rows{src, src_stride, 3};
161 292 Rows dst_rows{dst, dst_stride, operation.output_channels()};
162
163 292 apply_operation_by_rows(operation, rect, src_rows, dst_rows);
164 292 return KLEIDICV_OK;
165 356 }
166
167 using YUVToRGB = YUVToRGBAll<false, false>;
168 using YUVToRGBA = YUVToRGBAll<false, true>;
169 using YUVToBGR = YUVToRGBAll<true, false>;
170 using YUVToBGRA = YUVToRGBAll<true, true>;
171
172 KLEIDICV_TARGET_FN_ATTRS
173 89 kleidicv_error_t yuv_to_rgb_u8(const uint8_t *src, size_t src_stride,
174 uint8_t *dst, size_t dst_stride, size_t width,
175 size_t height) {
176 89 YUVToRGB operation;
177 267 return yuv2rgb_operation(operation, src, src_stride, dst, dst_stride, width,
178 89 height);
179 89 }
180
181 KLEIDICV_TARGET_FN_ATTRS
182 89 kleidicv_error_t yuv_to_rgba_u8(const uint8_t *src, size_t src_stride,
183 uint8_t *dst, size_t dst_stride, size_t width,
184 size_t height) {
185 89 YUVToRGBA operation;
186 267 return yuv2rgb_operation(operation, src, src_stride, dst, dst_stride, width,
187 89 height);
188 89 }
189
190 KLEIDICV_TARGET_FN_ATTRS
191 89 kleidicv_error_t yuv_to_bgr_u8(const uint8_t *src, size_t src_stride,
192 uint8_t *dst, size_t dst_stride, size_t width,
193 size_t height) {
194 89 YUVToBGR operation;
195 267 return yuv2rgb_operation(operation, src, src_stride, dst, dst_stride, width,
196 89 height);
197 89 }
198
199 KLEIDICV_TARGET_FN_ATTRS
200 89 kleidicv_error_t yuv_to_bgra_u8(const uint8_t *src, size_t src_stride,
201 uint8_t *dst, size_t dst_stride, size_t width,
202 size_t height) {
203 89 YUVToBGRA operation;
204 267 return yuv2rgb_operation(operation, src, src_stride, dst, dst_stride, width,
205 89 height);
206 89 }
207
208 } // namespace kleidicv::neon
209