Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #include <utility> | ||
6 | |||
7 | #include "kleidicv/conversions/yuv_to_rgb.h" | ||
8 | #include "kleidicv/ctypes.h" | ||
9 | #include "kleidicv/kleidicv.h" | ||
10 | #include "kleidicv/neon.h" | ||
11 | |||
12 | namespace kleidicv::neon { | ||
13 | |||
14 | template <bool BGR, bool kAlpha> | ||
15 | class YUVToRGBAll final : public UnrollOnce, public TryToAvoidTailLoop { | ||
16 | public: | ||
17 | using VecTraits = neon::VecTraits<uint8_t>; | ||
18 | using ScalarType = VecTraits::ScalarType; | ||
19 | using VectorType = VecTraits::VectorType; | ||
20 | using Vector3Type = VecTraits::Vector3Type; | ||
21 | using RawDestinationVectorType = | ||
22 | typename std::conditional<kAlpha, uint8x16x4_t, uint8x16x3_t>::type; | ||
23 | |||
24 | 356 | explicit YUVToRGBAll() | |
25 | 356 | : b_delta4_(vdupq_n_u32(kBDelta4)), | |
26 | 356 | g_delta4_(vdupq_n_u32(kGDelta4)), | |
27 | 356 | r_delta4_(vdupq_n_u32(kRDelta4)) {} | |
28 | |||
29 | // Returns the number of channels in the output image. | ||
30 | 292 | static constexpr size_t output_channels() { | |
31 | 292 | return kAlpha ? /* RGBA */ 4 : /* RGB */ 3; | |
32 | } | ||
33 | |||
34 | 1880 | void vector_path(const ScalarType *src, ScalarType *dst) { | |
35 | // Load deinterleaved | ||
36 | 1880 | Vector3Type vsrc = vld3q_u8(src); | |
37 | 1880 | int16x8_t y_l = vreinterpretq_s16_u8(vzip1q_u8(vsrc.val[0], vdupq_n_u8(0))); | |
38 | 1880 | int16x8_t y_h = vreinterpretq_s16_u8(vzip2q_u8(vsrc.val[0], vdupq_n_u8(0))); | |
39 | 3760 | int16x8_t u4_l = | |
40 | 1880 | vreinterpretq_s16_u16(vshll_n_u8(vget_low_u8(vsrc.val[1]), kPreShift)); | |
41 | 3760 | int16x8_t u4_h = | |
42 | 1880 | vreinterpretq_s16_u16(vshll_high_n_u8(vsrc.val[1], kPreShift)); | |
43 | 3760 | int16x8_t v4_l = | |
44 | 1880 | vreinterpretq_s16_u16(vshll_n_u8(vget_low_u8(vsrc.val[2]), kPreShift)); | |
45 | 3760 | int16x8_t v4_h = | |
46 | 1880 | vreinterpretq_s16_u16(vshll_high_n_u8(vsrc.val[2], kPreShift)); | |
47 | 1880 | uint8x16_t r, g, b; | |
48 | |||
49 | // Compute B value in 32-bit precision | ||
50 | { | ||
51 | // Multiplication is done with uint16_t because UBWeight only fits in | ||
52 | // unsigned 16-bit | ||
53 | 3760 | int32x4_t b_ll = vreinterpretq_s32_u32(vmull_n_u16( | |
54 | 1880 | vget_low_u16(vreinterpretq_u16_s16(u4_l)), kUnsignedUBWeight)); | |
55 | 3760 | int32x4_t b_hl = vreinterpretq_s32_u32(vmull_n_u16( | |
56 | 1880 | vget_low_u16(vreinterpretq_u16_s16(u4_h)), kUnsignedUBWeight)); | |
57 | 3760 | int32x4_t b_lh = vreinterpretq_s32_u32( | |
58 | 1880 | vmull_high_n_u16(vreinterpretq_u16_s16(u4_l), kUnsignedUBWeight)); | |
59 | 3760 | int32x4_t b_hh = vreinterpretq_s32_u32( | |
60 | 1880 | vmull_high_n_u16(vreinterpretq_u16_s16(u4_h), kUnsignedUBWeight)); | |
61 | |||
62 | 1880 | b_ll = vaddq(b_ll, b_delta4_); | |
63 | 1880 | b_hl = vaddq(b_hl, b_delta4_); | |
64 | 1880 | b_lh = vaddq(b_lh, b_delta4_); | |
65 | 1880 | b_hh = vaddq(b_hh, b_delta4_); | |
66 | |||
67 | 3760 | int16x8_t b_l = vaddq(y_l, vuzp2q_s16(vreinterpretq_s16_s32(b_ll), | |
68 | 1880 | vreinterpretq_s16_s32(b_lh))); | |
69 | 3760 | int16x8_t b_h = vaddq(y_h, vuzp2q_s16(vreinterpretq_s16_s32(b_hl), | |
70 | 1880 | vreinterpretq_s16_s32(b_hh))); | |
71 | |||
72 | 1880 | b = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h)); | |
73 | 1880 | } | |
74 | |||
75 | // Compute G value in 32-bit precision | ||
76 | { | ||
77 | 1880 | int32x4_t g_ll = vmlal_n_s16(g_delta4_, vget_low_s16(u4_l), kUGWeight); | |
78 | 1880 | int32x4_t g_hl = vmlal_n_s16(g_delta4_, vget_low_s16(u4_h), kUGWeight); | |
79 | 1880 | int32x4_t g_lh = vmlal_high_n_s16(g_delta4_, u4_l, kUGWeight); | |
80 | 1880 | int32x4_t g_hh = vmlal_high_n_s16(g_delta4_, u4_h, kUGWeight); | |
81 | |||
82 | 1880 | g_ll = vmlal_n_s16(g_ll, vget_low_s16(v4_l), kVGWeight); | |
83 | 1880 | g_hl = vmlal_n_s16(g_hl, vget_low_s16(v4_h), kVGWeight); | |
84 | 1880 | g_lh = vmlal_high_n_s16(g_lh, v4_l, kVGWeight); | |
85 | 1880 | g_hh = vmlal_high_n_s16(g_hh, v4_h, kVGWeight); | |
86 | |||
87 | 3760 | int16x8_t g_l = vaddq(y_l, vuzp2q_s16(vreinterpretq_s16_s32(g_ll), | |
88 | 1880 | vreinterpretq_s16_s32(g_lh))); | |
89 | 3760 | int16x8_t g_h = vaddq(y_h, vuzp2q_s16(vreinterpretq_s16_s32(g_hl), | |
90 | 1880 | vreinterpretq_s16_s32(g_hh))); | |
91 | |||
92 | 1880 | g = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h)); | |
93 | 1880 | } | |
94 | |||
95 | // Compute R value in 32-bit precision | ||
96 | { | ||
97 | 1880 | int32x4_t r_ll = vmlal_n_s16(r_delta4_, vget_low_s16(v4_l), kVRWeight); | |
98 | 1880 | int32x4_t r_hl = vmlal_n_s16(r_delta4_, vget_low_s16(v4_h), kVRWeight); | |
99 | 1880 | int32x4_t r_lh = vmlal_high_n_s16(r_delta4_, v4_l, kVRWeight); | |
100 | 1880 | int32x4_t r_hh = vmlal_high_n_s16(r_delta4_, v4_h, kVRWeight); | |
101 | |||
102 | 3760 | int16x8_t r_l = vaddq(y_l, vuzp2q_s16(vreinterpretq_s16_s32(r_ll), | |
103 | 1880 | vreinterpretq_s16_s32(r_lh))); | |
104 | 3760 | int16x8_t r_h = vaddq(y_h, vuzp2q_s16(vreinterpretq_s16_s32(r_hl), | |
105 | 1880 | vreinterpretq_s16_s32(r_hh))); | |
106 | |||
107 | 1880 | r = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h)); | |
108 | 1880 | } | |
109 | |||
110 | 1880 | RawDestinationVectorType rgb; | |
111 | 1880 | rgb.val[r_index_] = r; | |
112 | 1880 | rgb.val[g_index_] = g; | |
113 | 1880 | rgb.val[b_index_] = b; | |
114 | if constexpr (kAlpha) { | ||
115 | 940 | rgb.val[alpha_index_] = vdupq_n_u8(alpha_value); | |
116 | // Store interleaved RGBA pixels to memory. | ||
117 | 940 | vst4q_u8(dst, rgb); | |
118 | } else { | ||
119 | // Store interleaved RGB pixels to memory. | ||
120 | 940 | vst3q_u8(dst, rgb); | |
121 | } | ||
122 | 1880 | } | |
123 | |||
124 | 412 | void scalar_path(const ScalarType *src, ScalarType *dst) { | |
125 | 412 | int32_t y = static_cast<int32_t>(src[0]); | |
126 | 412 | int32_t u = static_cast<int32_t>(src[1]); | |
127 | 412 | int32_t v = static_cast<int32_t>(src[2]); | |
128 | 412 | int32_t b = y + rounding_shift_right((u - 128) * kUBWeight, kWeightScale); | |
129 | 824 | int32_t g = | |
130 | 412 | y + rounding_shift_right((u - 128) * kUGWeight + (v - 128) * kVGWeight, | |
131 | kWeightScale); | ||
132 | 412 | int32_t r = y + rounding_shift_right((v - 128) * kVRWeight, kWeightScale); | |
133 | 412 | dst[r_index_] = saturating_cast<int32_t, uint8_t>(r); | |
134 | 412 | dst[g_index_] = saturating_cast<int32_t, uint8_t>(g); | |
135 | 412 | dst[b_index_] = saturating_cast<int32_t, uint8_t>(b); | |
136 | if constexpr (kAlpha) { | ||
137 | 206 | dst[alpha_index_] = alpha_value; | |
138 | } | ||
139 | 412 | } | |
140 | |||
141 | private: | ||
142 | static constexpr size_t r_index_ = BGR ? 2 : 0; | ||
143 | static constexpr size_t g_index_ = 1; | ||
144 | static constexpr size_t b_index_ = BGR ? 0 : 2; | ||
145 | static constexpr size_t alpha_index_ = 3; | ||
146 | static constexpr uint8_t alpha_value = std::numeric_limits<uint8_t>::max(); | ||
147 | int32x4_t b_delta4_, g_delta4_, r_delta4_; | ||
148 | }; // end of class YUVToRGBAll<bool BGR> | ||
149 | |||
150 | template <typename OperationType, typename ScalarType> | ||
151 | 356 | kleidicv_error_t yuv2rgb_operation(OperationType &operation, | |
152 | const ScalarType *src, size_t src_stride, | ||
153 | ScalarType *dst, size_t dst_stride, | ||
154 | size_t width, size_t height) { | ||
155 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 85 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 85 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 85 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 85 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 85 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 85 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 85 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 85 times.
|
356 | CHECK_POINTER_AND_STRIDE(src, src_stride, height); |
156 |
16/16✓ Branch 0 taken 4 times.
✓ Branch 1 taken 81 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 81 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 81 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 81 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 81 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 81 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 81 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 81 times.
|
340 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, height); |
157 |
24/24✓ Branch 0 taken 4 times.
✓ Branch 1 taken 77 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 73 times.
✓ Branch 4 taken 8 times.
✓ Branch 5 taken 73 times.
✓ Branch 6 taken 4 times.
✓ Branch 7 taken 77 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 73 times.
✓ Branch 10 taken 8 times.
✓ Branch 11 taken 73 times.
✓ Branch 12 taken 4 times.
✓ Branch 13 taken 77 times.
✓ Branch 14 taken 4 times.
✓ Branch 15 taken 73 times.
✓ Branch 16 taken 8 times.
✓ Branch 17 taken 73 times.
✓ Branch 18 taken 4 times.
✓ Branch 19 taken 77 times.
✓ Branch 20 taken 4 times.
✓ Branch 21 taken 73 times.
✓ Branch 22 taken 8 times.
✓ Branch 23 taken 73 times.
|
324 | CHECK_IMAGE_SIZE(width, height); |
158 | |||
159 | 292 | Rectangle rect{width, height}; | |
160 | 292 | Rows src_rows{src, src_stride, 3}; | |
161 | 292 | Rows dst_rows{dst, dst_stride, operation.output_channels()}; | |
162 | |||
163 | 292 | apply_operation_by_rows(operation, rect, src_rows, dst_rows); | |
164 | 292 | return KLEIDICV_OK; | |
165 | 356 | } | |
166 | |||
167 | using YUVToRGB = YUVToRGBAll<false, false>; | ||
168 | using YUVToRGBA = YUVToRGBAll<false, true>; | ||
169 | using YUVToBGR = YUVToRGBAll<true, false>; | ||
170 | using YUVToBGRA = YUVToRGBAll<true, true>; | ||
171 | |||
172 | KLEIDICV_TARGET_FN_ATTRS | ||
173 | 89 | kleidicv_error_t yuv_to_rgb_u8(const uint8_t *src, size_t src_stride, | |
174 | uint8_t *dst, size_t dst_stride, size_t width, | ||
175 | size_t height) { | ||
176 | 89 | YUVToRGB operation; | |
177 | 267 | return yuv2rgb_operation(operation, src, src_stride, dst, dst_stride, width, | |
178 | 89 | height); | |
179 | 89 | } | |
180 | |||
181 | KLEIDICV_TARGET_FN_ATTRS | ||
182 | 89 | kleidicv_error_t yuv_to_rgba_u8(const uint8_t *src, size_t src_stride, | |
183 | uint8_t *dst, size_t dst_stride, size_t width, | ||
184 | size_t height) { | ||
185 | 89 | YUVToRGBA operation; | |
186 | 267 | return yuv2rgb_operation(operation, src, src_stride, dst, dst_stride, width, | |
187 | 89 | height); | |
188 | 89 | } | |
189 | |||
190 | KLEIDICV_TARGET_FN_ATTRS | ||
191 | 89 | kleidicv_error_t yuv_to_bgr_u8(const uint8_t *src, size_t src_stride, | |
192 | uint8_t *dst, size_t dst_stride, size_t width, | ||
193 | size_t height) { | ||
194 | 89 | YUVToBGR operation; | |
195 | 267 | return yuv2rgb_operation(operation, src, src_stride, dst, dst_stride, width, | |
196 | 89 | height); | |
197 | 89 | } | |
198 | |||
199 | KLEIDICV_TARGET_FN_ATTRS | ||
200 | 89 | kleidicv_error_t yuv_to_bgra_u8(const uint8_t *src, size_t src_stride, | |
201 | uint8_t *dst, size_t dst_stride, size_t width, | ||
202 | size_t height) { | ||
203 | 89 | YUVToBGRA operation; | |
204 | 267 | return yuv2rgb_operation(operation, src, src_stride, dst, dst_stride, width, | |
205 | 89 | height); | |
206 | 89 | } | |
207 | |||
208 | } // namespace kleidicv::neon | ||
209 |