| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include <cassert> | ||
| 6 | #include <cmath> | ||
| 7 | #include <cstddef> | ||
| 8 | #include <cstdint> | ||
| 9 | |||
| 10 | #include "kleidicv/sve2.h" | ||
| 11 | #include "kleidicv/types.h" | ||
| 12 | #include "transform_common.h" | ||
| 13 | |||
| 14 | namespace kleidicv::sve2 { | ||
| 15 | |||
| 16 | template <typename ScalarType, bool IsLarge> | ||
| 17 | 1686734 | svuint32_t inline load_xy(svbool_t pg, svuint32_t x, svuint32_t y, | |
| 18 | svuint32_t sv_src_stride, | ||
| 19 | Rows<const ScalarType> &src_rows) { | ||
| 20 | if constexpr (IsLarge) { | ||
| 21 | 7380 | svbool_t pg_b = pg; | |
| 22 | 7380 | svbool_t pg_t = svtrn2_b32(pg, svpfalse()); | |
| 23 | |||
| 24 | // Calculate offsets from coordinates (y * stride + x) | ||
| 25 | // To avoid losing precision, the final offsets should be in 64 bits | ||
| 26 | 7380 | svuint64_t result_b, result_t; | |
| 27 | if constexpr (std::is_same<ScalarType, uint8_t>::value) { | ||
| 28 | 6180 | svuint64_t offsets_b = svmlalb(svmovlb(x), y, sv_src_stride); | |
| 29 | 6180 | svuint64_t offsets_t = svmlalt(svmovlt(x), y, sv_src_stride); | |
| 30 | // Copy pixels from source | ||
| 31 | 6180 | result_b = svld1ub_gather_offset_u64(pg_b, &src_rows[0], offsets_b); | |
| 32 | 6180 | result_t = svld1ub_gather_offset_u64(pg_t, &src_rows[0], offsets_t); | |
| 33 | 6180 | } | |
| 34 | if constexpr (std::is_same<ScalarType, uint16_t>::value) { | ||
| 35 | // Multiply x with sizeof(uint16_t) | ||
| 36 | 1200 | svuint64_t offsets_b = svmlalb(svshllb(x, 1), y, sv_src_stride); | |
| 37 | 1200 | svuint64_t offsets_t = svmlalt(svshllt(x, 1), y, sv_src_stride); | |
| 38 | // Copy pixels from source | ||
| 39 | 1200 | result_b = svld1uh_gather_offset_u64(pg_b, &src_rows[0], offsets_b); | |
| 40 | 1200 | result_t = svld1uh_gather_offset_u64(pg_t, &src_rows[0], offsets_t); | |
| 41 | 1200 | } | |
| 42 | 22140 | return svtrn1_u32(svreinterpret_u32_u64(result_b), | |
| 43 | 7380 | svreinterpret_u32_u64(result_t)); | |
| 44 | 7380 | } else { | |
| 45 | if constexpr (std::is_same<ScalarType, uint8_t>::value) { | ||
| 46 | 1613158 | svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride); | |
| 47 | 3226316 | return svld1ub_gather_offset_u32(pg, &src_rows[0], offsets); | |
| 48 | 1613158 | } else { | |
| 49 | // Multiply by sizeof(uint16_t) | ||
| 50 | 66196 | x = svlsl_x(pg, x, 1); | |
| 51 | 66196 | svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride); | |
| 52 | 132392 | return svld1uh_gather_offset_u32(pg, &src_rows[0], offsets); | |
| 53 | 66196 | } | |
| 54 | } | ||
| 55 | } | ||
| 56 | |||
| 57 | template <typename ScalarType, bool IsLarge> | ||
| 58 | 143792 | svuint32_t inline load_xy_2ch(svbool_t pg, svuint32_t x, svuint32_t y, | |
| 59 | svuint32_t sv_src_stride, | ||
| 60 | Rows<const ScalarType> &src_rows, | ||
| 61 | [[maybe_unused]] svuint8_t load_table) { | ||
| 62 | if constexpr (IsLarge) { | ||
| 63 | 3600 | svbool_t pg_b = pg; | |
| 64 | 3600 | svbool_t pg_t = svtrn2_b32(pg, svpfalse()); | |
| 65 | |||
| 66 | // Calculate offsets from coordinates (y * stride + x) | ||
| 67 | // To avoid losing precision, the final offsets should be in 64 bits | ||
| 68 | if constexpr (std::is_same<ScalarType, uint8_t>::value) { | ||
| 69 | // Multiply x with the number of channels | ||
| 70 | 2400 | svuint64_t offsets_b = svmlalb(svshllb(x, 1), y, sv_src_stride); | |
| 71 | 2400 | svuint64_t offsets_t = svmlalt(svshllt(x, 1), y, sv_src_stride); | |
| 72 | // Copy pixels from source | ||
| 73 | 4800 | svuint64_t b = svld1uh_gather_offset_u64( | |
| 74 | 2400 | pg_b, reinterpret_cast<const uint16_t *>(&src_rows[0]), offsets_b); | |
| 75 | 4800 | svuint64_t t = svld1uh_gather_offset_u64( | |
| 76 | 2400 | pg_t, reinterpret_cast<const uint16_t *>(&src_rows[0]), offsets_t); | |
| 77 | 4800 | svuint32_t r32 = | |
| 78 | 2400 | svtrn1_u32(svreinterpret_u32_u64(b), svreinterpret_u32_u64(t)); | |
| 79 | 4800 | return svreinterpret_u32_u8( | |
| 80 | 2400 | svtbl_u8(svreinterpret_u8_u32(r32), load_table)); | |
| 81 | 2400 | } | |
| 82 | if constexpr (std::is_same<ScalarType, uint16_t>::value) { | ||
| 83 | // Multiply x with the number of channels and sizeof(uint16_t) | ||
| 84 | 1200 | svuint64_t offsets_b = svmlalb(svshllb(x, 2), y, sv_src_stride); | |
| 85 | 1200 | svuint64_t offsets_t = svmlalt(svshllt(x, 2), y, sv_src_stride); | |
| 86 | // Copy pixels from source | ||
| 87 | 2400 | svuint64_t result_b = svld1uw_gather_offset_u64( | |
| 88 | 1200 | pg_b, reinterpret_cast<const uint32_t *>(&src_rows[0]), offsets_b); | |
| 89 | 2400 | svuint64_t result_t = svld1uw_gather_offset_u64( | |
| 90 | 1200 | pg_t, reinterpret_cast<const uint32_t *>(&src_rows[0]), offsets_t); | |
| 91 | 3600 | return svtrn1_u32(svreinterpret_u32_u64(result_b), | |
| 92 | 1200 | svreinterpret_u32_u64(result_t)); | |
| 93 | 1200 | } | |
| 94 | 3600 | } else { | |
| 95 | // Multiply x with the number of channels and sizeof(ScalarType) | ||
| 96 | // This shifting formula is only correct for 8 and 16 bits | ||
| 97 | 140192 | x = svlsl_n_u32_x(pg, x, sizeof(ScalarType)); | |
| 98 | 140192 | svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride); | |
| 99 | if constexpr (std::is_same<ScalarType, uint8_t>::value) { | ||
| 100 | 145592 | svuint32_t r32 = svld1uh_gather_offset_u32( | |
| 101 | 72796 | pg, reinterpret_cast<const uint16_t *>(&src_rows[0]), offsets); | |
| 102 | 145592 | return svreinterpret_u32_u8( | |
| 103 | 72796 | svtbl_u8(svreinterpret_u8_u32(r32), load_table)); | |
| 104 | 72796 | } | |
| 105 | if constexpr (std::is_same<ScalarType, uint16_t>::value) { | ||
| 106 | 134792 | return svld1_gather_u32offset_u32( | |
| 107 | 67396 | pg, reinterpret_cast<const uint32_t *>(&src_rows[0]), offsets); | |
| 108 | } | ||
| 109 | 140192 | } | |
| 110 | } | ||
| 111 | |||
| 112 | template <typename ScalarType, bool IsLarge, size_t Channels> | ||
| 113 | 247686 | svuint32_t inline calculate_linear_replicated_border( | |
| 114 | svbool_t pg, svfloat32x2_t coords, svfloat32_t xmaxf, svfloat32_t ymaxf, | ||
| 115 | svuint32_t sv_src_stride, Rows<const ScalarType> &src_rows, | ||
| 116 | svuint8_t load_table_2ch) { | ||
| 117 | 247686 | svbool_t pg_all32 = svptrue_b32(); | |
| 118 | |||
| 119 | 1238430 | auto load_source = [&](svuint32_t x, svuint32_t y) { | |
| 120 | if constexpr (Channels == 1) { | ||
| 121 | 920824 | return load_xy<ScalarType, IsLarge>(pg, x, y, sv_src_stride, src_rows); | |
| 122 | } | ||
| 123 | if constexpr (Channels == 2) { | ||
| 124 | 139840 | return load_xy_2ch<ScalarType, IsLarge>(pg, x, y, sv_src_stride, src_rows, | |
| 125 | 69920 | load_table_2ch); | |
| 126 | } | ||
| 127 | }; | ||
| 128 | |||
| 129 | 247686 | svfloat32_t xf = svget2(coords, 0); | |
| 130 | 247686 | svfloat32_t yf = svget2(coords, 1); | |
| 131 | // Take the integer part, clamp it to within the dimensions of the | ||
| 132 | // source image (negative values are already saturated to 0) | ||
| 133 | 247686 | svuint32_t x0 = svcvt_u32_f32_x(pg_all32, svmin_x(pg_all32, xf, xmaxf)); | |
| 134 | 247686 | svuint32_t y0 = svcvt_u32_f32_x(pg_all32, svmin_x(pg_all32, yf, ymaxf)); | |
| 135 | |||
| 136 | // Get fractional part, or 0 if out of range | ||
| 137 | 495372 | svbool_t x_in_range = svand_z(pg_all32, svcmpge_n_f32(pg_all32, xf, 0.F), | |
| 138 | 247686 | svcmplt_f32(pg_all32, xf, xmaxf)); | |
| 139 | 495372 | svbool_t y_in_range = svand_z(pg_all32, svcmpge_n_f32(pg_all32, yf, 0.F), | |
| 140 | 247686 | svcmplt_f32(pg_all32, yf, ymaxf)); | |
| 141 | 495372 | svfloat32_t xfrac = | |
| 142 | 495372 | svsel_f32(x_in_range, svsub_f32_x(pg_all32, xf, svrintm_x(pg_all32, xf)), | |
| 143 | 247686 | svdup_n_f32(0.F)); | |
| 144 | 495372 | svfloat32_t yfrac = | |
| 145 | 495372 | svsel_f32(y_in_range, svsub_f32_x(pg_all32, yf, svrintm_x(pg_all32, yf)), | |
| 146 | 247686 | svdup_n_f32(0.F)); | |
| 147 | |||
| 148 | // x1 = x0 + 1, except if it's already xmax or out of range | ||
| 149 | 247686 | svuint32_t x1 = svsel_u32(x_in_range, svadd_n_u32_x(pg_all32, x0, 1), x0); | |
| 150 | 247686 | svuint32_t y1 = svsel_u32(y_in_range, svadd_n_u32_x(pg_all32, y0, 1), y0); | |
| 151 | |||
| 152 | 512852 | auto lerp_2d = [&](svuint32_t ai, svuint32_t bi, svuint32_t ci, | |
| 153 | svuint32_t di) { | ||
| 154 | 265166 | svfloat32_t a = svcvt_f32_u32_x(pg_all32, ai); | |
| 155 | 265166 | svfloat32_t b = svcvt_f32_u32_x(pg_all32, bi); | |
| 156 | 530332 | svfloat32_t line0 = | |
| 157 | 265166 | svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac); | |
| 158 | 265166 | svfloat32_t c = svcvt_f32_u32_x(pg_all32, ci); | |
| 159 | 265166 | svfloat32_t d = svcvt_f32_u32_x(pg_all32, di); | |
| 160 | 530332 | svfloat32_t line1 = | |
| 161 | 265166 | svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac); | |
| 162 | 530332 | svfloat32_t result = svmla_f32_x( | |
| 163 | 265166 | pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac); | |
| 164 | 530332 | return svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F)); | |
| 165 | 265166 | }; | |
| 166 | |||
| 167 | // Calculate offsets from coordinates (y * stride + x) | ||
| 168 | // a: top left, b: top right, c: bottom left, d: bottom right | ||
| 169 | 247686 | svuint32_t a = load_source(x0, y0); | |
| 170 | 247686 | svuint32_t b = load_source(x1, y0); | |
| 171 | 247686 | svuint32_t c = load_source(x0, y1); | |
| 172 | 247686 | svuint32_t d = load_source(x1, y1); | |
| 173 | if constexpr (Channels == 1) { | ||
| 174 | 460412 | return lerp_2d(a, b, c, d); | |
| 175 | } | ||
| 176 | if constexpr (Channels == 2) { | ||
| 177 | // Channel 0 | ||
| 178 | 34960 | svuint32_t res32_0 = lerp_2d( | |
| 179 | 17480 | svmovlb(svreinterpret_u16_u32(a)), svmovlb(svreinterpret_u16_u32(b)), | |
| 180 | 17480 | svmovlb(svreinterpret_u16_u32(c)), svmovlb(svreinterpret_u16_u32(d))); | |
| 181 | // Channel 1 | ||
| 182 | 34960 | svuint32_t res32_1 = lerp_2d( | |
| 183 | 17480 | svmovlt(svreinterpret_u16_u32(a)), svmovlt(svreinterpret_u16_u32(b)), | |
| 184 | 17480 | svmovlt(svreinterpret_u16_u32(c)), svmovlt(svreinterpret_u16_u32(d))); | |
| 185 | |||
| 186 | 52440 | return svreinterpret_u32_u16(svtrn1_u16(svreinterpret_u16_u32(res32_0), | |
| 187 | 17480 | svreinterpret_u16_u32(res32_1))); | |
| 188 | 17480 | } | |
| 189 | 247686 | } | |
| 190 | |||
| 191 | template <typename ScalarType, bool IsLarge> | ||
| 192 | 473152 | svuint32_t get_pixels_or_border(svbool_t pg, svuint32_t x, svuint32_t y, | |
| 193 | svuint32_t sv_border, svuint32_t sv_xmax, | ||
| 194 | svuint32_t sv_ymax, svuint32_t sv_src_stride, | ||
| 195 | Rows<const ScalarType> &src_rows) { | ||
| 196 | 946304 | svbool_t in_range = | |
| 197 | 473152 | svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax)); | |
| 198 | 946304 | svuint32_t result = | |
| 199 | 473152 | load_xy<ScalarType, IsLarge>(in_range, x, y, sv_src_stride, src_rows); | |
| 200 | // Select between source pixels and border colour | ||
| 201 | 946304 | return svsel_u32(in_range, result, sv_border); | |
| 202 | 473152 | } | |
| 203 | |||
| 204 | template <typename ScalarType, bool IsLarge> | ||
| 205 | 69920 | svuint32_t get_pixels_or_border_2ch(svbool_t pg, svuint32_t x, svuint32_t y, | |
| 206 | svuint32_t sv_border, svuint32_t sv_xmax, | ||
| 207 | svuint32_t sv_ymax, | ||
| 208 | svuint32_t sv_src_stride, | ||
| 209 | Rows<const ScalarType> &src_rows, | ||
| 210 | svuint8_t load_table) { | ||
| 211 | 139840 | svbool_t in_range = | |
| 212 | 69920 | svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax)); | |
| 213 | 139840 | svuint32_t result = load_xy_2ch<ScalarType, IsLarge>( | |
| 214 | 69920 | in_range, x, y, sv_src_stride, src_rows, load_table); | |
| 215 | // Select between source pixels and border colour | ||
| 216 | 139840 | return svsel_u32(in_range, result, sv_border); | |
| 217 | 69920 | } | |
| 218 | |||
| 219 | template <typename ScalarType, bool IsLarge, size_t Channels> | ||
| 220 | 135768 | svuint32_t inline calculate_linear_constant_border( | |
| 221 | svbool_t pg, svfloat32x2_t coords, svuint32_t sv_border, svuint32_t sv_xmax, | ||
| 222 | svuint32_t sv_ymax, svuint32_t sv_src_stride, | ||
| 223 | Rows<const ScalarType> &src_rows, svuint8_t load_table_2ch) { | ||
| 224 | 135768 | svbool_t pg_all32 = svptrue_b32(); | |
| 225 | |||
| 226 | 678840 | auto load_source = [&](svuint32_t x, svuint32_t y) { | |
| 227 | if constexpr (Channels == 1) { | ||
| 228 | 473152 | return get_pixels_or_border<ScalarType, IsLarge>( | |
| 229 | 473152 | pg, x, y, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows); | |
| 230 | } | ||
| 231 | if constexpr (Channels == 2) { | ||
| 232 | 69920 | return get_pixels_or_border_2ch<ScalarType, IsLarge>( | |
| 233 | 69920 | pg, x, y, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows, | |
| 234 | 69920 | load_table_2ch); | |
| 235 | } | ||
| 236 | }; | ||
| 237 | |||
| 238 | // Convert coordinates to integers, truncating towards minus infinity. | ||
| 239 | // Negative numbers will become large positive numbers. | ||
| 240 | // Since the source width and height is known to be <=2^24 these large | ||
| 241 | // positive numbers will always be treated as outside the source image | ||
| 242 | // bounds. | ||
| 243 | 135768 | svuint32_t x0, y0, x1, y1; | |
| 244 | 135768 | svfloat32_t xfrac, yfrac; | |
| 245 | { | ||
| 246 | 135768 | svfloat32_t xf = svget2(coords, 0); | |
| 247 | 135768 | svfloat32_t yf = svget2(coords, 1); | |
| 248 | 135768 | svfloat32_t xf0 = svrintm_f32_x(pg, xf); | |
| 249 | 135768 | svfloat32_t yf0 = svrintm_f32_x(pg, yf); | |
| 250 | 135768 | x0 = svreinterpret_u32_s32(svcvt_s32_f32_x(pg, xf0)); | |
| 251 | 135768 | y0 = svreinterpret_u32_s32(svcvt_s32_f32_x(pg, yf0)); | |
| 252 | 135768 | x1 = svadd_u32_x(pg, x0, svdup_n_u32(1)); | |
| 253 | 135768 | y1 = svadd_u32_x(pg, y0, svdup_n_u32(1)); | |
| 254 | |||
| 255 | 135768 | xfrac = svsub_f32_x(pg, xf, xf0); | |
| 256 | 135768 | yfrac = svsub_f32_x(pg, yf, yf0); | |
| 257 | 135768 | } | |
| 258 | |||
| 259 | 289016 | auto lerp_2d = [&](svuint32_t ai, svuint32_t bi, svuint32_t ci, | |
| 260 | svuint32_t di) { | ||
| 261 | 153248 | svfloat32_t a = svcvt_f32_u32_x(pg_all32, ai); | |
| 262 | 153248 | svfloat32_t b = svcvt_f32_u32_x(pg_all32, bi); | |
| 263 | 306496 | svfloat32_t line0 = | |
| 264 | 153248 | svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac); | |
| 265 | 153248 | svfloat32_t c = svcvt_f32_u32_x(pg_all32, ci); | |
| 266 | 153248 | svfloat32_t d = svcvt_f32_u32_x(pg_all32, di); | |
| 267 | 306496 | svfloat32_t line1 = | |
| 268 | 153248 | svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac); | |
| 269 | 306496 | svfloat32_t result = svmla_f32_x( | |
| 270 | 153248 | pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac); | |
| 271 | 306496 | return svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F)); | |
| 272 | 153248 | }; | |
| 273 | |||
| 274 | // Calculate offsets from coordinates (y * stride + x) | ||
| 275 | // a: top left, b: top right, c: bottom left, d: bottom right | ||
| 276 | 135768 | svuint32_t a = load_source(x0, y0); | |
| 277 | 135768 | svuint32_t b = load_source(x1, y0); | |
| 278 | 135768 | svuint32_t c = load_source(x0, y1); | |
| 279 | 135768 | svuint32_t d = load_source(x1, y1); | |
| 280 | if constexpr (Channels == 1) { | ||
| 281 | 236576 | return lerp_2d(a, b, c, d); | |
| 282 | } | ||
| 283 | if constexpr (Channels == 2) { | ||
| 284 | // Channel 0 | ||
| 285 | 34960 | svuint32_t res32_0 = lerp_2d( | |
| 286 | 17480 | svmovlb(svreinterpret_u16_u32(a)), svmovlb(svreinterpret_u16_u32(b)), | |
| 287 | 17480 | svmovlb(svreinterpret_u16_u32(c)), svmovlb(svreinterpret_u16_u32(d))); | |
| 288 | // Channel 1 | ||
| 289 | 34960 | svuint32_t res32_1 = lerp_2d( | |
| 290 | 17480 | svmovlt(svreinterpret_u16_u32(a)), svmovlt(svreinterpret_u16_u32(b)), | |
| 291 | 17480 | svmovlt(svreinterpret_u16_u32(c)), svmovlt(svreinterpret_u16_u32(d))); | |
| 292 | |||
| 293 | 52440 | return svreinterpret_u32_u16(svtrn1_u16(svreinterpret_u16_u32(res32_0), | |
| 294 | 17480 | svreinterpret_u16_u32(res32_1))); | |
| 295 | 17480 | } | |
| 296 | 135768 | } | |
| 297 | |||
| 298 | } // namespace kleidicv::sve2 | ||
| 299 |