Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #ifndef KLEIDICV_YUV420_TO_RGB_SC_H | ||
6 | #define KLEIDICV_YUV420_TO_RGB_SC_H | ||
7 | |||
8 | #include "kleidicv/kleidicv.h" | ||
9 | #include "kleidicv/sve2.h" | ||
10 | #include "yuv420_coefficients.h" | ||
11 | |||
12 | namespace KLEIDICV_TARGET_NAMESPACE { | ||
13 | |||
14 | template <bool BGR, bool kAlpha> | ||
15 | class YUV420XToRGBxOrBGRx { | ||
16 | public: | ||
17 | const bool v_first_; | ||
18 | 2444 | explicit YUV420XToRGBxOrBGRx(bool v_first) KLEIDICV_STREAMING | |
19 | 2444 | : v_first_{v_first} {} | |
20 | 18096 | void yuv420x_to_rgb(svbool_t &pg, svuint8_t &y0, svuint8_t &y1, svint16_t &u, | |
21 | svint16_t &v, uint8_t *rgbx_row_0, | ||
22 | uint8_t *rgbx_row_1) KLEIDICV_STREAMING { | ||
23 | // Each base holds the rounding constant of the final right shift plus the -128 chroma offset multiplied by the matching weight(s). | ||
24 | 18096 | constexpr int32_t kOffset = 1 << (kWeightScale - 1); | |
25 | 18096 | svint32_t r_base = svdup_s32(kOffset - 128 * kUVWeights[kRVWeightIndex]); | |
26 | 36192 | svint32_t g_base = | |
27 | 18096 | svdup_s32(kOffset - 128 * (kUVWeights[1] + kUVWeights[2])); | |
28 | 18096 | svint32_t b_base = svdup_s32(kOffset - 128 * kUVWeights[3]); | |
29 | |||
30 | // Y' = saturating(Y - 16), widened to 32 bits (zero-extended, then reinterpreted as signed). | ||
31 | 18096 | svuint8_t y0_m16 = svqsub(y0, static_cast<uint8_t>(16)); | |
32 | 18096 | svuint16_t y0_m16_b = svmovlb(y0_m16); // 'b' means bottom (even-indexed lanes) | |
33 | 18096 | svuint16_t y0_m16_t = svmovlt(y0_m16); // 't' means top (odd-indexed lanes) | |
34 | 18096 | svint32_t y0_m16_bb = svreinterpret_s32(svmovlb(y0_m16_b)); | |
35 | 18096 | svint32_t y0_m16_bt = svreinterpret_s32(svmovlt(y0_m16_b)); | |
36 | 18096 | svint32_t y0_m16_tb = svreinterpret_s32(svmovlb(y0_m16_t)); | |
37 | 18096 | svint32_t y0_m16_tt = svreinterpret_s32(svmovlt(y0_m16_t)); | |
38 | |||
39 | 18096 | svuint8_t y1_m16 = svqsub(y1, static_cast<uint8_t>(16)); | |
40 | 18096 | svuint16_t y1_m16_b = svmovlb(y1_m16); | |
41 | 18096 | svuint16_t y1_m16_t = svmovlt(y1_m16); | |
42 | 18096 | svint32_t y1_m16_bb = svreinterpret_s32(svmovlb(y1_m16_b)); | |
43 | 18096 | svint32_t y1_m16_bt = svreinterpret_s32(svmovlt(y1_m16_b)); | |
44 | 18096 | svint32_t y1_m16_tb = svreinterpret_s32(svmovlb(y1_m16_t)); | |
45 | 18096 | svint32_t y1_m16_tt = svreinterpret_s32(svmovlt(y1_m16_t)); | |
46 | |||
47 | // Y = Weight(Y) * Y', computed for all four 32-bit quarters of both rows. | ||
48 | 18096 | y0_m16_bb = svmul_x(pg, y0_m16_bb, kYWeight); | |
49 | 18096 | y0_m16_bt = svmul_x(pg, y0_m16_bt, kYWeight); | |
50 | 18096 | y0_m16_tb = svmul_x(pg, y0_m16_tb, kYWeight); | |
51 | 18096 | y0_m16_tt = svmul_x(pg, y0_m16_tt, kYWeight); | |
52 | |||
53 | 18096 | y1_m16_bb = svmul_x(pg, y1_m16_bb, kYWeight); | |
54 | 18096 | y1_m16_bt = svmul_x(pg, y1_m16_bt, kYWeight); | |
55 | 18096 | y1_m16_tb = svmul_x(pg, y1_m16_tb, kYWeight); | |
56 | 18096 | y1_m16_tt = svmul_x(pg, y1_m16_tt, kYWeight); | |
57 | |||
58 | // Swap the U and V vectors when v_first_ is set (YV12 layout). | ||
59 |
8/8✓ Branch 0 taken 3990 times.
✓ Branch 1 taken 534 times.
✓ Branch 2 taken 3990 times.
✓ Branch 3 taken 534 times.
✓ Branch 4 taken 3990 times.
✓ Branch 5 taken 534 times.
✓ Branch 6 taken 3990 times.
✓ Branch 7 taken 534 times.
|
18096 | if (v_first_) { |
60 | 2136 | swap_scalable(u, v); | |
61 | 2136 | } | |
62 | |||
63 | 18096 | svint32_t u_b = svmovlb(u); | |
64 | 18096 | svint32_t u_t = svmovlt(u); | |
65 | 18096 | svint32_t v_b = svmovlb(v); | |
66 | 18096 | svint32_t v_t = svmovlt(v); | |
67 | |||
68 | // R - Y = Rbase + Weight(RV) * V = | ||
69 | // (1 << (SCALE - 1)) - 128 * Weight(RV) + Weight(RV) * V | ||
70 | 18096 | svint32_t r_sub_y_b = svmla_x(pg, r_base, v_b, kUVWeights[kRVWeightIndex]); | |
71 | 18096 | svint32_t r_sub_y_t = svmla_x(pg, r_base, v_t, kUVWeights[kRVWeightIndex]); | |
72 | |||
73 | // G - Y = Gbase + Weight(GU) * U + Weight(GV) * V, where | ||
74 | // Gbase = (1 << (SCALE - 1)) - 128 * (Weight(GU) + Weight(GV)). | ||
75 | // NOTE(review): g_base and b_base index kUVWeights with the literals 1, 2 and 3; | ||
76 | // presumably these equal kGUWeightIndex, kGVWeightIndex and kBUWeightIndex - confirm. | ||
77 | 18096 | svint32_t g_sub_y_b = svmla_x(pg, g_base, u_b, kUVWeights[kGUWeightIndex]); | |
78 | 18096 | svint32_t g_sub_y_t = svmla_x(pg, g_base, u_t, kUVWeights[kGUWeightIndex]); | |
79 | 18096 | g_sub_y_b = svmla_x(pg, g_sub_y_b, v_b, kUVWeights[kGVWeightIndex]); | |
80 | 18096 | g_sub_y_t = svmla_x(pg, g_sub_y_t, v_t, kUVWeights[kGVWeightIndex]); | |
81 | |||
82 | // B - Y = Bbase + Weight(BU) * U = | ||
83 | // (1 << (SCALE - 1)) - 128 * Weight(BU) + Weight(BU) * U | ||
84 | 18096 | svint32_t b_sub_y_b = svmla_x(pg, b_base, u_b, kUVWeights[kBUWeightIndex]); | |
85 | 18096 | svint32_t b_sub_y_t = svmla_x(pg, b_base, u_t, kUVWeights[kBUWeightIndex]); | |
86 | |||
87 | // R = ((R - Y) + Y) >> SCALE, saturated to 8 bits: addhn keeps the high 16 bits of each 32-bit sum, | ||
88 | // svsra onto zero shifts right by the remaining SCALE - 16, and svqxtun narrows with unsigned saturation. | ||
89 | // FIXME: There are too many instructions here. Is there a better way to do this? | ||
90 | 18096 | svint16_t r0_b = svaddhnb(r_sub_y_b, y0_m16_bb); | |
91 | 18096 | r0_b = svaddhnt(r0_b, r_sub_y_t, y0_m16_bt); | |
92 | 18096 | r0_b = svsra(svdup_n_s16(0), r0_b, kWeightScale - 16); | |
93 | 18096 | svint16_t r0_t = svaddhnb(r_sub_y_b, y0_m16_tb); | |
94 | 18096 | r0_t = svaddhnt(r0_t, r_sub_y_t, y0_m16_tt); | |
95 | 18096 | r0_t = svsra(svdup_n_s16(0), r0_t, kWeightScale - 16); | |
96 | 18096 | svuint8_t r0 = svqxtunt(svqxtunb(r0_b), r0_t); | |
97 | |||
98 | 18096 | svint16_t r1_b = svaddhnb(r_sub_y_b, y1_m16_bb); | |
99 | 18096 | r1_b = svaddhnt(r1_b, r_sub_y_t, y1_m16_bt); | |
100 | 18096 | r1_b = svsra(svdup_n_s16(0), r1_b, kWeightScale - 16); | |
101 | 18096 | svint16_t r1_t = svaddhnb(r_sub_y_b, y1_m16_tb); | |
102 | 18096 | r1_t = svaddhnt(r1_t, r_sub_y_t, y1_m16_tt); | |
103 | 18096 | r1_t = svsra(svdup_n_s16(0), r1_t, kWeightScale - 16); | |
104 | 18096 | svuint8_t r1 = svqxtunt(svqxtunb(r1_b), r1_t); | |
105 | |||
106 | // G = ((G - Y) + Y) >> SCALE, narrowed with the same sequence as R. | ||
107 | 18096 | svint16_t g0_b = svaddhnb(g_sub_y_b, y0_m16_bb); | |
108 | 18096 | g0_b = svaddhnt(g0_b, g_sub_y_t, y0_m16_bt); | |
109 | 18096 | g0_b = svsra(svdup_n_s16(0), g0_b, kWeightScale - 16); | |
110 | 18096 | svint16_t g0_t = svaddhnb(g_sub_y_b, y0_m16_tb); | |
111 | 18096 | g0_t = svaddhnt(g0_t, g_sub_y_t, y0_m16_tt); | |
112 | 18096 | g0_t = svsra(svdup_n_s16(0), g0_t, kWeightScale - 16); | |
113 | 18096 | svuint8_t g0 = svqxtunt(svqxtunb(g0_b), g0_t); | |
114 | |||
115 | 18096 | svint16_t g1_b = svaddhnb(g_sub_y_b, y1_m16_bb); | |
116 | 18096 | g1_b = svaddhnt(g1_b, g_sub_y_t, y1_m16_bt); | |
117 | 18096 | g1_b = svsra(svdup_n_s16(0), g1_b, kWeightScale - 16); | |
118 | 18096 | svint16_t g1_t = svaddhnb(g_sub_y_b, y1_m16_tb); | |
119 | 18096 | g1_t = svaddhnt(g1_t, g_sub_y_t, y1_m16_tt); | |
120 | 18096 | g1_t = svsra(svdup_n_s16(0), g1_t, kWeightScale - 16); | |
121 | 18096 | svuint8_t g1 = svqxtunt(svqxtunb(g1_b), g1_t); | |
122 | |||
123 | // B = ((B - Y) + Y) >> SCALE, narrowed with the same sequence as R. | ||
124 | 18096 | svint16_t b0_b = svaddhnb(b_sub_y_b, y0_m16_bb); | |
125 | 18096 | b0_b = svaddhnt(b0_b, b_sub_y_t, y0_m16_bt); | |
126 | 18096 | b0_b = svsra(svdup_n_s16(0), b0_b, kWeightScale - 16); | |
127 | 18096 | svint16_t b0_t = svaddhnb(b_sub_y_b, y0_m16_tb); | |
128 | 18096 | b0_t = svaddhnt(b0_t, b_sub_y_t, y0_m16_tt); | |
129 | 18096 | b0_t = svsra(svdup_n_s16(0), b0_t, kWeightScale - 16); | |
130 | 18096 | svuint8_t b0 = svqxtunt(svqxtunb(b0_b), b0_t); | |
131 | |||
132 | 18096 | svint16_t b1_b = svaddhnb(b_sub_y_b, y1_m16_bb); | |
133 | 18096 | b1_b = svaddhnt(b1_b, b_sub_y_t, y1_m16_bt); | |
134 | 18096 | b1_b = svsra(svdup_n_s16(0), b1_b, kWeightScale - 16); | |
135 | 18096 | svint16_t b1_t = svaddhnb(b_sub_y_b, y1_m16_tb); | |
136 | 18096 | b1_t = svaddhnt(b1_t, b_sub_y_t, y1_m16_tt); | |
137 | 18096 | b1_t = svsra(svdup_n_s16(0), b1_t, kWeightScale - 16); | |
138 | 18096 | svuint8_t b1 = svqxtunt(svqxtunb(b1_b), b1_t); | |
139 | |||
140 | if constexpr (kAlpha) { | ||
141 | 18096 | svuint8x4_t rgba0 = | |
142 | 9048 | svcreate4(BGR ? b0 : r0, g0, BGR ? r0 : b0, svdup_n_u8(0xFF)); | |
143 | 18096 | svuint8x4_t rgba1 = | |
144 | 9048 | svcreate4(BGR ? b1 : r1, g1, BGR ? r1 : b1, svdup_n_u8(0xFF)); | |
145 | // Store the four-channel pixels (RGBA, or BGRA when BGR is set; alpha is 0xFF) of both rows. | ||
146 | 9048 | svst4_u8(pg, rgbx_row_0, rgba0); | |
147 | 9048 | svst4_u8(pg, rgbx_row_1, rgba1); | |
148 | 9048 | } else { | |
149 | 9048 | svuint8x3_t rgb0 = svcreate3(BGR ? b0 : r0, g0, BGR ? r0 : b0); | |
150 | 9048 | svuint8x3_t rgb1 = svcreate3(BGR ? b1 : r1, g1, BGR ? r1 : b1); | |
151 | // Store the three-channel pixels (RGB, or BGR when BGR is set) of both rows. | ||
152 | 9048 | svst3(pg, rgbx_row_0, rgb0); | |
153 | 9048 | svst3(pg, rgbx_row_1, rgb1); | |
154 | 9048 | } | |
155 | 18096 | } | |
156 | }; | ||
157 | } // namespace KLEIDICV_TARGET_NAMESPACE | ||
158 | |||
159 | #endif // KLEIDICV_YUV420_TO_RGB_SC_H | ||
160 |