KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/conversions/yuv420_to_rgb_sc.h
Date: 2025-09-25 14:13:34
Exec Total Coverage
Lines: 100 100 100.0%
Functions: 16 16 100.0%
Branches: 8 8 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #ifndef KLEIDICV_YUV420_TO_RGB_SC_H
6 #define KLEIDICV_YUV420_TO_RGB_SC_H
7
8 #include "kleidicv/kleidicv.h"
9 #include "kleidicv/sve2.h"
10 #include "yuv420_coefficients.h"
11
12 namespace KLEIDICV_TARGET_NAMESPACE {
13
14 template <bool BGR, bool kAlpha>
15 class YUV420XToRGBxOrBGRx {
16 public:
17 const bool v_first_;
18 2444 explicit YUV420XToRGBxOrBGRx(bool v_first) KLEIDICV_STREAMING
19 2444 : v_first_{v_first} {}
20 18096 void yuv420x_to_rgb(svbool_t &pg, svuint8_t &y0, svuint8_t &y1, svint16_t &u,
21 svint16_t &v, uint8_t *rgbx_row_0,
22 uint8_t *rgbx_row_1) KLEIDICV_STREAMING {
23 // Each base folds in both the rounding constant for the final right shift and the -128 offset applied to U/V.
24 18096 constexpr int32_t kOffset = 1 << (kWeightScale - 1);
25 18096 svint32_t r_base = svdup_s32(kOffset - 128 * kUVWeights[kRVWeightIndex]);
26 36192 svint32_t g_base =
27 18096 svdup_s32(kOffset - 128 * (kUVWeights[1] + kUVWeights[2]));
28 18096 svint32_t b_base = svdup_s32(kOffset - 128 * kUVWeights[3]);
29
30 // Y' = saturating(Ya - 16) and widen to signed 32-bits.
31 18096 svuint8_t y0_m16 = svqsub(y0, static_cast<uint8_t>(16));
32 18096 svuint16_t y0_m16_b = svmovlb(y0_m16); // 'b' means bottom
33 18096 svuint16_t y0_m16_t = svmovlt(y0_m16); // 't' means top
34 18096 svint32_t y0_m16_bb = svreinterpret_s32(svmovlb(y0_m16_b));
35 18096 svint32_t y0_m16_bt = svreinterpret_s32(svmovlt(y0_m16_b));
36 18096 svint32_t y0_m16_tb = svreinterpret_s32(svmovlb(y0_m16_t));
37 18096 svint32_t y0_m16_tt = svreinterpret_s32(svmovlt(y0_m16_t));
38
39 18096 svuint8_t y1_m16 = svqsub(y1, static_cast<uint8_t>(16));
40 18096 svuint16_t y1_m16_b = svmovlb(y1_m16);
41 18096 svuint16_t y1_m16_t = svmovlt(y1_m16);
42 18096 svint32_t y1_m16_bb = svreinterpret_s32(svmovlb(y1_m16_b));
43 18096 svint32_t y1_m16_bt = svreinterpret_s32(svmovlt(y1_m16_b));
44 18096 svint32_t y1_m16_tb = svreinterpret_s32(svmovlb(y1_m16_t));
45 18096 svint32_t y1_m16_tt = svreinterpret_s32(svmovlt(y1_m16_t));
46
47 // Y = Weight(Y) * Y'
48 18096 y0_m16_bb = svmul_x(pg, y0_m16_bb, kYWeight);
49 18096 y0_m16_bt = svmul_x(pg, y0_m16_bt, kYWeight);
50 18096 y0_m16_tb = svmul_x(pg, y0_m16_tb, kYWeight);
51 18096 y0_m16_tt = svmul_x(pg, y0_m16_tt, kYWeight);
52
53 18096 y1_m16_bb = svmul_x(pg, y1_m16_bb, kYWeight);
54 18096 y1_m16_bt = svmul_x(pg, y1_m16_bt, kYWeight);
55 18096 y1_m16_tb = svmul_x(pg, y1_m16_tb, kYWeight);
56 18096 y1_m16_tt = svmul_x(pg, y1_m16_tt, kYWeight);
57
58 // Swap U and V planes for YV12 layout.
59
8/8
✓ Branch 0 taken 3990 times.
✓ Branch 1 taken 534 times.
✓ Branch 2 taken 3990 times.
✓ Branch 3 taken 534 times.
✓ Branch 4 taken 3990 times.
✓ Branch 5 taken 534 times.
✓ Branch 6 taken 3990 times.
✓ Branch 7 taken 534 times.
18096 if (v_first_) {
60 2136 swap_scalable(u, v);
61 2136 }
62
63 18096 svint32_t u_b = svmovlb(u);
64 18096 svint32_t u_t = svmovlt(u);
65 18096 svint32_t v_b = svmovlb(v);
66 18096 svint32_t v_t = svmovlt(v);
67
68 // R - Y = Rbase + Weight(RV) * V =
69 // (1 << (SCALE - 1)) - 128 * Weight(RV) + Weight(RV) * V
70 18096 svint32_t r_sub_y_b = svmla_x(pg, r_base, v_b, kUVWeights[kRVWeightIndex]);
71 18096 svint32_t r_sub_y_t = svmla_x(pg, r_base, v_t, kUVWeights[kRVWeightIndex]);
72
73 // G - Y = Gbase + Weight(GU) * U + Weight(GV) * V =
74 // (1 << (SCALE - 1)) -
75 // 128 * (Weight(GU) + Weight(GV)) +
76 // Weight(GU) * U + Weight(GV) * V
77 18096 svint32_t g_sub_y_b = svmla_x(pg, g_base, u_b, kUVWeights[kGUWeightIndex]);
78 18096 svint32_t g_sub_y_t = svmla_x(pg, g_base, u_t, kUVWeights[kGUWeightIndex]);
79 18096 g_sub_y_b = svmla_x(pg, g_sub_y_b, v_b, kUVWeights[kGVWeightIndex]);
80 18096 g_sub_y_t = svmla_x(pg, g_sub_y_t, v_t, kUVWeights[kGVWeightIndex]);
81
82 // B - Y = Bbase + Weight(BU) * U =
83 // (1 << (SCALE - 1)) - 128 * Weight(BU) + Weight(BU) * U
84 18096 svint32_t b_sub_y_b = svmla_x(pg, b_base, u_b, kUVWeights[kBUWeightIndex]);
85 18096 svint32_t b_sub_y_t = svmla_x(pg, b_base, u_t, kUVWeights[kBUWeightIndex]);
86
87 // R = (R - Y) + Y
88 // FIXME: There are too many instructions here.
89 // Is there a better way to do this?
90 18096 svint16_t r0_b = svaddhnb(r_sub_y_b, y0_m16_bb);
91 18096 r0_b = svaddhnt(r0_b, r_sub_y_t, y0_m16_bt);
92 18096 r0_b = svsra(svdup_n_s16(0), r0_b, kWeightScale - 16);
93 18096 svint16_t r0_t = svaddhnb(r_sub_y_b, y0_m16_tb);
94 18096 r0_t = svaddhnt(r0_t, r_sub_y_t, y0_m16_tt);
95 18096 r0_t = svsra(svdup_n_s16(0), r0_t, kWeightScale - 16);
96 18096 svuint8_t r0 = svqxtunt(svqxtunb(r0_b), r0_t);
97
98 18096 svint16_t r1_b = svaddhnb(r_sub_y_b, y1_m16_bb);
99 18096 r1_b = svaddhnt(r1_b, r_sub_y_t, y1_m16_bt);
100 18096 r1_b = svsra(svdup_n_s16(0), r1_b, kWeightScale - 16);
101 18096 svint16_t r1_t = svaddhnb(r_sub_y_b, y1_m16_tb);
102 18096 r1_t = svaddhnt(r1_t, r_sub_y_t, y1_m16_tt);
103 18096 r1_t = svsra(svdup_n_s16(0), r1_t, kWeightScale - 16);
104 18096 svuint8_t r1 = svqxtunt(svqxtunb(r1_b), r1_t);
105
106 // G = (G - Y) + Y
107 18096 svint16_t g0_b = svaddhnb(g_sub_y_b, y0_m16_bb);
108 18096 g0_b = svaddhnt(g0_b, g_sub_y_t, y0_m16_bt);
109 18096 g0_b = svsra(svdup_n_s16(0), g0_b, kWeightScale - 16);
110 18096 svint16_t g0_t = svaddhnb(g_sub_y_b, y0_m16_tb);
111 18096 g0_t = svaddhnt(g0_t, g_sub_y_t, y0_m16_tt);
112 18096 g0_t = svsra(svdup_n_s16(0), g0_t, kWeightScale - 16);
113 18096 svuint8_t g0 = svqxtunt(svqxtunb(g0_b), g0_t);
114
115 18096 svint16_t g1_b = svaddhnb(g_sub_y_b, y1_m16_bb);
116 18096 g1_b = svaddhnt(g1_b, g_sub_y_t, y1_m16_bt);
117 18096 g1_b = svsra(svdup_n_s16(0), g1_b, kWeightScale - 16);
118 18096 svint16_t g1_t = svaddhnb(g_sub_y_b, y1_m16_tb);
119 18096 g1_t = svaddhnt(g1_t, g_sub_y_t, y1_m16_tt);
120 18096 g1_t = svsra(svdup_n_s16(0), g1_t, kWeightScale - 16);
121 18096 svuint8_t g1 = svqxtunt(svqxtunb(g1_b), g1_t);
122
123 // B = (B - Y) + Y
124 18096 svint16_t b0_b = svaddhnb(b_sub_y_b, y0_m16_bb);
125 18096 b0_b = svaddhnt(b0_b, b_sub_y_t, y0_m16_bt);
126 18096 b0_b = svsra(svdup_n_s16(0), b0_b, kWeightScale - 16);
127 18096 svint16_t b0_t = svaddhnb(b_sub_y_b, y0_m16_tb);
128 18096 b0_t = svaddhnt(b0_t, b_sub_y_t, y0_m16_tt);
129 18096 b0_t = svsra(svdup_n_s16(0), b0_t, kWeightScale - 16);
130 18096 svuint8_t b0 = svqxtunt(svqxtunb(b0_b), b0_t);
131
132 18096 svint16_t b1_b = svaddhnb(b_sub_y_b, y1_m16_bb);
133 18096 b1_b = svaddhnt(b1_b, b_sub_y_t, y1_m16_bt);
134 18096 b1_b = svsra(svdup_n_s16(0), b1_b, kWeightScale - 16);
135 18096 svint16_t b1_t = svaddhnb(b_sub_y_b, y1_m16_tb);
136 18096 b1_t = svaddhnt(b1_t, b_sub_y_t, y1_m16_tt);
137 18096 b1_t = svsra(svdup_n_s16(0), b1_t, kWeightScale - 16);
138 18096 svuint8_t b1 = svqxtunt(svqxtunb(b1_b), b1_t);
139
140 if constexpr (kAlpha) {
141 18096 svuint8x4_t rgba0 =
142 9048 svcreate4(BGR ? b0 : r0, g0, BGR ? r0 : b0, svdup_n_u8(0xFF));
143 18096 svuint8x4_t rgba1 =
144 9048 svcreate4(BGR ? b1 : r1, g1, BGR ? r1 : b1, svdup_n_u8(0xFF));
145 // Store RGBA pixels to memory.
146 9048 svst4_u8(pg, rgbx_row_0, rgba0);
147 9048 svst4_u8(pg, rgbx_row_1, rgba1);
148 9048 } else {
149 9048 svuint8x3_t rgb0 = svcreate3(BGR ? b0 : r0, g0, BGR ? r0 : b0);
150 9048 svuint8x3_t rgb1 = svcreate3(BGR ? b1 : r1, g1, BGR ? r1 : b1);
151 // Store RGB pixels to memory.
152 9048 svst3(pg, rgbx_row_0, rgb0);
153 9048 svst3(pg, rgbx_row_1, rgb1);
154 9048 }
155 18096 }
156 };
157 } // namespace KLEIDICV_TARGET_NAMESPACE
158
159 #endif // KLEIDICV_YUV420_TO_RGB_SC_H
160