KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/yuv420_to_rgb_sc.h
Date: 2026-01-20 20:58:59
Exec Total Coverage
Lines: 100 100 100.0%
Functions: 24 24 100.0%
Branches: 8 8 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_YUV420_TO_RGB_SC_H
6 #define KLEIDICV_YUV420_TO_RGB_SC_H
7
8 #include "kleidicv/kleidicv.h"
9 #include "kleidicv/sve2.h"
10 #include "yuv42x_coefficients.h"
11
12 namespace KLEIDICV_TARGET_NAMESPACE {
13
14 template <bool BGR, bool kAlpha>
15 class YUV420XToRGBxOrBGRx {
16 public:
17 const bool v_first_;
18 4624 explicit YUV420XToRGBxOrBGRx(bool v_first) KLEIDICV_STREAMING
19 4624 : v_first_{v_first} {}
20
21 KLEIDICV_FORCE_INLINE
22 43040 void yuv420x_to_rgb(svbool_t &pg, svuint8_t &y0, svuint8_t &y1, svint16_t &u,
23 svint16_t &v, uint8_t *rgbx_row_0,
24 uint8_t *rgbx_row_1) const KLEIDICV_STREAMING {
25 // Both the rounding shift right constant and the -128 value are included.
26 43040 constexpr int32_t kOffset = 1 << (kWeightScale - 1);
27 43040 svint32_t r_base = svdup_s32(kOffset - 128 * kUVWeights[kRVWeightIndex]);
28 86080 svint32_t g_base =
29 43040 svdup_s32(kOffset - 128 * (kUVWeights[1] + kUVWeights[2]));
30 43040 svint32_t b_base = svdup_s32(kOffset - 128 * kUVWeights[3]);
31
32 // Y' = saturating(Ya - 16) and widen to signed 32-bits.
33 43040 svuint8_t y0_m16 = svqsub(y0, static_cast<uint8_t>(16));
34 43040 svuint16_t y0_m16_b = svmovlb(y0_m16); // 'b' means bottom
35 43040 svuint16_t y0_m16_t = svmovlt(y0_m16); // 't' means top
36 43040 svint32_t y0_m16_bb = svreinterpret_s32(svmovlb(y0_m16_b));
37 43040 svint32_t y0_m16_bt = svreinterpret_s32(svmovlt(y0_m16_b));
38 43040 svint32_t y0_m16_tb = svreinterpret_s32(svmovlb(y0_m16_t));
39 43040 svint32_t y0_m16_tt = svreinterpret_s32(svmovlt(y0_m16_t));
40
41 43040 svuint8_t y1_m16 = svqsub(y1, static_cast<uint8_t>(16));
42 43040 svuint16_t y1_m16_b = svmovlb(y1_m16);
43 43040 svuint16_t y1_m16_t = svmovlt(y1_m16);
44 43040 svint32_t y1_m16_bb = svreinterpret_s32(svmovlb(y1_m16_b));
45 43040 svint32_t y1_m16_bt = svreinterpret_s32(svmovlt(y1_m16_b));
46 43040 svint32_t y1_m16_tb = svreinterpret_s32(svmovlb(y1_m16_t));
47 43040 svint32_t y1_m16_tt = svreinterpret_s32(svmovlt(y1_m16_t));
48
49 // Y = Weight(Y) * Y'
50 43040 y0_m16_bb = svmul_x(pg, y0_m16_bb, kYWeight);
51 43040 y0_m16_bt = svmul_x(pg, y0_m16_bt, kYWeight);
52 43040 y0_m16_tb = svmul_x(pg, y0_m16_tb, kYWeight);
53 43040 y0_m16_tt = svmul_x(pg, y0_m16_tt, kYWeight);
54
55 43040 y1_m16_bb = svmul_x(pg, y1_m16_bb, kYWeight);
56 43040 y1_m16_bt = svmul_x(pg, y1_m16_bt, kYWeight);
57 43040 y1_m16_tb = svmul_x(pg, y1_m16_tb, kYWeight);
58 43040 y1_m16_tt = svmul_x(pg, y1_m16_tt, kYWeight);
59
60 // Swap U and V planes for YV12 layout.
61 8/8 43040 if (v_first_) {
✓ Branch 0 taken 5380 times.
✓ Branch 1 taken 5380 times.
✓ Branch 2 taken 5380 times.
✓ Branch 3 taken 5380 times.
✓ Branch 4 taken 5380 times.
✓ Branch 5 taken 5380 times.
✓ Branch 6 taken 5380 times.
✓ Branch 7 taken 5380 times.
62 21520 swap_scalable(u, v);
63 21520 }
64
65 43040 svint32_t u_b = svmovlb(u);
66 43040 svint32_t u_t = svmovlt(u);
67 43040 svint32_t v_b = svmovlb(v);
68 43040 svint32_t v_t = svmovlt(v);
69
70 // R - Y = Rbase + Weight(RV) * V =
71 // Weight(RV) * ((1 << (SCALE - 1)) - 128) + Weight(RV) * V
72 43040 svint32_t r_sub_y_b = svmla_x(pg, r_base, v_b, kUVWeights[kRVWeightIndex]);
73 43040 svint32_t r_sub_y_t = svmla_x(pg, r_base, v_t, kUVWeights[kRVWeightIndex]);
74
75 // G - Y = Gbase + Weight(GU) * U + Weight(GV) * V =
76 // Weight(GU) * ((1 << (SCALE - 1)) - 128) +
77 // Weight(GV) * ((1 << (SCALE - 1)) - 128) +
78 // Weight(GU) * U + Weight(GV) * V
79 43040 svint32_t g_sub_y_b = svmla_x(pg, g_base, u_b, kUVWeights[kGUWeightIndex]);
80 43040 svint32_t g_sub_y_t = svmla_x(pg, g_base, u_t, kUVWeights[kGUWeightIndex]);
81 43040 g_sub_y_b = svmla_x(pg, g_sub_y_b, v_b, kUVWeights[kGVWeightIndex]);
82 43040 g_sub_y_t = svmla_x(pg, g_sub_y_t, v_t, kUVWeights[kGVWeightIndex]);
83
84 // B - Y = Bbase + Weight(BU) * U =
85 // Weight(BU) * ((1 << (SCALE - 1)) - 128) + Weight(BU) * U
86 43040 svint32_t b_sub_y_b = svmla_x(pg, b_base, u_b, kUVWeights[kBUWeightIndex]);
87 43040 svint32_t b_sub_y_t = svmla_x(pg, b_base, u_t, kUVWeights[kBUWeightIndex]);
88
89 // R = (R - Y) + Y
90 // FIXME: There are too many instructions here.
91 // Is there a better way to do this?
92 43040 svint16_t r0_b = svaddhnb(r_sub_y_b, y0_m16_bb);
93 43040 r0_b = svaddhnt(r0_b, r_sub_y_t, y0_m16_bt);
94 43040 r0_b = svsra(svdup_n_s16(0), r0_b, kWeightScale - 16);
95 43040 svint16_t r0_t = svaddhnb(r_sub_y_b, y0_m16_tb);
96 43040 r0_t = svaddhnt(r0_t, r_sub_y_t, y0_m16_tt);
97 43040 r0_t = svsra(svdup_n_s16(0), r0_t, kWeightScale - 16);
98 43040 svuint8_t r0 = svqxtunt(svqxtunb(r0_b), r0_t);
99
100 43040 svint16_t r1_b = svaddhnb(r_sub_y_b, y1_m16_bb);
101 43040 r1_b = svaddhnt(r1_b, r_sub_y_t, y1_m16_bt);
102 43040 r1_b = svsra(svdup_n_s16(0), r1_b, kWeightScale - 16);
103 43040 svint16_t r1_t = svaddhnb(r_sub_y_b, y1_m16_tb);
104 43040 r1_t = svaddhnt(r1_t, r_sub_y_t, y1_m16_tt);
105 43040 r1_t = svsra(svdup_n_s16(0), r1_t, kWeightScale - 16);
106 43040 svuint8_t r1 = svqxtunt(svqxtunb(r1_b), r1_t);
107
108 // G = (G - Y) + Y
109 43040 svint16_t g0_b = svaddhnb(g_sub_y_b, y0_m16_bb);
110 43040 g0_b = svaddhnt(g0_b, g_sub_y_t, y0_m16_bt);
111 43040 g0_b = svsra(svdup_n_s16(0), g0_b, kWeightScale - 16);
112 43040 svint16_t g0_t = svaddhnb(g_sub_y_b, y0_m16_tb);
113 43040 g0_t = svaddhnt(g0_t, g_sub_y_t, y0_m16_tt);
114 43040 g0_t = svsra(svdup_n_s16(0), g0_t, kWeightScale - 16);
115 43040 svuint8_t g0 = svqxtunt(svqxtunb(g0_b), g0_t);
116
117 43040 svint16_t g1_b = svaddhnb(g_sub_y_b, y1_m16_bb);
118 43040 g1_b = svaddhnt(g1_b, g_sub_y_t, y1_m16_bt);
119 43040 g1_b = svsra(svdup_n_s16(0), g1_b, kWeightScale - 16);
120 43040 svint16_t g1_t = svaddhnb(g_sub_y_b, y1_m16_tb);
121 43040 g1_t = svaddhnt(g1_t, g_sub_y_t, y1_m16_tt);
122 43040 g1_t = svsra(svdup_n_s16(0), g1_t, kWeightScale - 16);
123 43040 svuint8_t g1 = svqxtunt(svqxtunb(g1_b), g1_t);
124
125 // B = (B - Y) + Y
126 43040 svint16_t b0_b = svaddhnb(b_sub_y_b, y0_m16_bb);
127 43040 b0_b = svaddhnt(b0_b, b_sub_y_t, y0_m16_bt);
128 43040 b0_b = svsra(svdup_n_s16(0), b0_b, kWeightScale - 16);
129 43040 svint16_t b0_t = svaddhnb(b_sub_y_b, y0_m16_tb);
130 43040 b0_t = svaddhnt(b0_t, b_sub_y_t, y0_m16_tt);
131 43040 b0_t = svsra(svdup_n_s16(0), b0_t, kWeightScale - 16);
132 43040 svuint8_t b0 = svqxtunt(svqxtunb(b0_b), b0_t);
133
134 43040 svint16_t b1_b = svaddhnb(b_sub_y_b, y1_m16_bb);
135 43040 b1_b = svaddhnt(b1_b, b_sub_y_t, y1_m16_bt);
136 43040 b1_b = svsra(svdup_n_s16(0), b1_b, kWeightScale - 16);
137 43040 svint16_t b1_t = svaddhnb(b_sub_y_b, y1_m16_tb);
138 43040 b1_t = svaddhnt(b1_t, b_sub_y_t, y1_m16_tt);
139 43040 b1_t = svsra(svdup_n_s16(0), b1_t, kWeightScale - 16);
140 43040 svuint8_t b1 = svqxtunt(svqxtunb(b1_b), b1_t);
141
142 if constexpr (kAlpha) {
143 43040 svuint8x4_t rgba0 =
144 21520 svcreate4(BGR ? b0 : r0, g0, BGR ? r0 : b0, svdup_n_u8(0xFF));
145 43040 svuint8x4_t rgba1 =
146 21520 svcreate4(BGR ? b1 : r1, g1, BGR ? r1 : b1, svdup_n_u8(0xFF));
147 // Store RGBA pixels to memory.
148 21520 svst4_u8(pg, rgbx_row_0, rgba0);
149 21520 svst4_u8(pg, rgbx_row_1, rgba1);
150 21520 } else {
151 21520 svuint8x3_t rgb0 = svcreate3(BGR ? b0 : r0, g0, BGR ? r0 : b0);
152 21520 svuint8x3_t rgb1 = svcreate3(BGR ? b1 : r1, g1, BGR ? r1 : b1);
153 // Store RGB pixels to memory.
154 21520 svst3(pg, rgbx_row_0, rgb0);
155 21520 svst3(pg, rgbx_row_1, rgb1);
156 21520 }
157 43040 }
158 };
159 } // namespace KLEIDICV_TARGET_NAMESPACE
160
161 #endif // KLEIDICV_YUV420_TO_RGB_SC_H
162
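For reference, the per-pixel arithmetic that the covered kernel vectorises can be sketched in scalar C++ as below. This is a minimal sketch following the comments in the listing (Y' = saturating(Ya - 16), kOffset = 1 << (kWeightScale - 1), and the -128 chroma bias folded into the R/G/B base terms). The coefficient values shown are illustrative ITU-R BT.601 fixed-point weights; the actual kWeightScale, kYWeight and kUVWeights constants are defined in yuv42x_coefficients.h and are not part of this report.

```cpp
#include <algorithm>
#include <cstdint>

// Illustrative 20-bit fixed-point BT.601 weights (assumed values, not the
// constants from yuv42x_coefficients.h).
constexpr int32_t kScale = 20;
constexpr int32_t kCY  = 1220542;   // luma weight
constexpr int32_t kCVR = 1673527;   // V contribution to R
constexpr int32_t kCUG = -409993;   // U contribution to G
constexpr int32_t kCVG = -852492;   // V contribution to G
constexpr int32_t kCUB = 2116026;   // U contribution to B

static uint8_t saturate_u8(int32_t v) {
  return static_cast<uint8_t>(std::clamp(v, 0, 255));
}

// One luma sample plus its shared chroma pair -> one RGB pixel.
// The SVE2 kernel computes the same expressions, but pre-folds the rounding
// offset and the -128 chroma bias into r_base/g_base/b_base vectors.
void yuv_to_rgb_scalar(uint8_t ya, uint8_t u8, uint8_t v8,
                       uint8_t &r, uint8_t &g, uint8_t &b) {
  const int32_t kOffset = 1 << (kScale - 1);     // rounding term
  const int32_t y = kCY * std::max(0, ya - 16);  // Y = Weight(Y) * sat(Ya - 16)
  const int32_t u = u8 - 128;
  const int32_t v = v8 - 128;
  r = saturate_u8((y + kOffset + kCVR * v) >> kScale);
  g = saturate_u8((y + kOffset + kCUG * u + kCVG * v) >> kScale);
  b = saturate_u8((y + kOffset + kCUB * u) >> kScale);
}
```

In the vector code, the two shifts of the 32-bit accumulators are split between the narrowing adds (svaddhnb/svaddhnt discard the low 16 bits) and the following svsra by kWeightScale - 16, before svqxtunb/svqxtunt apply the unsigned saturation.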