| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include "kleidicv/ctypes.h" | ||
| 6 | #include "kleidicv/neon.h" | ||
| 7 | #include "kleidicv/types.h" | ||
| 8 | #include "transform_common.h" | ||
| 9 | |||
| 10 | namespace kleidicv::neon { | ||
| 11 | |||
| 12 | typedef struct { | ||
| 13 | float32x4_t x, y; | ||
| 14 | } FloatVectorPair; | ||
| 15 | |||
| 16 | template <typename ScalarType, bool IsLarge> | ||
| 17 | 920824 | float32x4_t inline load_xy(uint32x4_t x, uint32x4_t y, uint32x4_t v_src_stride, | |
| 18 | Rows<const ScalarType>& src_rows) { | ||
| 19 | if constexpr (IsLarge) { | ||
| 20 | 6912 | uint64x2_t indices_low = | |
| 21 | 6912 | vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), | |
| 22 | 3456 | vget_low_u32(v_src_stride)); | |
| 23 | 6912 | uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), | |
| 24 | 3456 | vget_low_u32(v_src_stride)); | |
| 25 | 6912 | uint64_t acc = | |
| 26 | 6912 | static_cast<uint64_t>(src_rows[vgetq_lane_u64(indices_low, 0)]) | | |
| 27 | 3456 | (static_cast<uint64_t>(src_rows[vgetq_lane_u64(indices_low, 1)]) << 32); | |
| 28 | 3456 | uint64x2_t rawsrc = vdupq_n_u64(acc); | |
| 29 | 6912 | acc = static_cast<uint64_t>(src_rows[vgetq_lane_u64(indices_high, 0)]) | | |
| 30 | 3456 | (static_cast<uint64_t>(src_rows[vgetq_lane_u64(indices_high, 1)]) | |
| 31 | 3456 | << 32); | |
| 32 | 3456 | rawsrc = vsetq_lane_u64(acc, rawsrc, 1); | |
| 33 | 6912 | return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); | |
| 34 | 3456 | } else { | |
| 35 | 917368 | uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); | |
| 36 | 1834736 | uint64_t acc = | |
| 37 | 1834736 | static_cast<uint64_t>(src_rows[vgetq_lane_u32(indices, 0)]) | | |
| 38 | 917368 | (static_cast<uint64_t>(src_rows[vgetq_lane_u32(indices, 1)]) << 32); | |
| 39 | 917368 | uint64x2_t rawsrc = vdupq_n_u64(acc); | |
| 40 | 1834736 | acc = static_cast<uint64_t>(src_rows[vgetq_lane_u32(indices, 2)]) | | |
| 41 | 917368 | (static_cast<uint64_t>(src_rows[vgetq_lane_u32(indices, 3)]) << 32); | |
| 42 | 917368 | rawsrc = vsetq_lane_u64(acc, rawsrc, 1); | |
| 43 | 1834736 | return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); | |
| 44 | 917368 | } | |
| 45 | } | ||
| 46 | |||
| 47 | template <typename ScalarType, bool IsLarge> | ||
| 48 | 69920 | float32x4x2_t inline load_xy_2ch(uint32x4_t x, uint32x4_t y, | |
| 49 | uint32x4_t v_src_stride, | ||
| 50 | Rows<const ScalarType>& src_rows) { | ||
| 51 | 69920 | const size_t kBytes = 2 * sizeof(ScalarType); | |
| 52 | 69920 | ScalarType elements[4 * 2]; // 4 pixels, 2 channels | |
| 53 | // Multiply x with the number of channels (2) | ||
| 54 | 69920 | x = vshlq_n_u32(x, 1); | |
| 55 | if constexpr (IsLarge) { | ||
| 56 | 2880 | uint64x2_t indices_low = | |
| 57 | 2880 | vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), | |
| 58 | 1440 | vget_low_u32(v_src_stride)); | |
| 59 | 2880 | uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), | |
| 60 | 1440 | vget_low_u32(v_src_stride)); | |
| 61 | 1440 | memcpy(&elements[0], &src_rows[vgetq_lane_u64(indices_low, 0)], kBytes); | |
| 62 | 1440 | memcpy(&elements[2], &src_rows[vgetq_lane_u64(indices_low, 1)], kBytes); | |
| 63 | 1440 | memcpy(&elements[4], &src_rows[vgetq_lane_u64(indices_high, 0)], kBytes); | |
| 64 | 1440 | memcpy(&elements[6], &src_rows[vgetq_lane_u64(indices_high, 1)], kBytes); | |
| 65 | 1440 | } else { | |
| 66 | 68480 | uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); | |
| 67 | 68480 | memcpy(&elements[0], &src_rows[vgetq_lane_u32(indices, 0)], kBytes); | |
| 68 | 68480 | memcpy(&elements[2], &src_rows[vgetq_lane_u32(indices, 1)], kBytes); | |
| 69 | 68480 | memcpy(&elements[4], &src_rows[vgetq_lane_u32(indices, 2)], kBytes); | |
| 70 | 68480 | memcpy(&elements[6], &src_rows[vgetq_lane_u32(indices, 3)], kBytes); | |
| 71 | 68480 | } | |
| 72 | 69920 | uint16x8_t pixels16{}; | |
| 73 | if constexpr (std::is_same<ScalarType, uint8_t>::value) { | ||
| 74 | 36280 | pixels16 = vmovl_u8(vld1_u8(elements)); | |
| 75 | } else if constexpr (std::is_same<ScalarType, uint16_t>::value) { | ||
| 76 | 33640 | pixels16 = vld1q_u16(elements); | |
| 77 | } | ||
| 78 | float32x4x2_t result; | ||
| 79 | 69920 | result.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(pixels16))); | |
| 80 | 69920 | result.val[1] = vcvtq_f32_u32(vmovl_high_u16(pixels16)); | |
| 81 | return result; | ||
| 82 | 69920 | } | |
| 83 | |||
| 84 | template <typename ScalarType, bool IsLarge> | ||
| 85 | 473152 | float32x4_t inline load_xy_or_border(uint32x4_t x, uint32x4_t y, | |
| 86 | uint32x4_t in_range, | ||
| 87 | ScalarType border_value, | ||
| 88 | uint32x4_t v_src_stride, | ||
| 89 | Rows<const ScalarType> src_rows) { | ||
| 90 | if constexpr (IsLarge) { | ||
| 91 | 4896 | uint64x2_t indices_low = | |
| 92 | 4896 | vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), | |
| 93 | 2448 | vget_low_u32(v_src_stride)); | |
| 94 | 4896 | uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), | |
| 95 | 2448 | vget_low_u32(v_src_stride)); | |
| 96 |
4/4✓ Branch 0 taken 1220 times.
✓ Branch 1 taken 748 times.
✓ Branch 2 taken 117 times.
✓ Branch 3 taken 363 times.
|
2448 | uint64_t pixel0 = vgetq_lane_u32(in_range, 0) |
| 97 | 1337 | ? src_rows[vgetq_lane_u64(indices_low, 0)] | |
| 98 | 1111 | : border_value; | |
| 99 |
4/4✓ Branch 0 taken 1191 times.
✓ Branch 1 taken 777 times.
✓ Branch 2 taken 105 times.
✓ Branch 3 taken 375 times.
|
2448 | uint64_t pixel1 = vgetq_lane_u32(in_range, 1) |
| 100 | 1296 | ? src_rows[vgetq_lane_u64(indices_low, 1)] | |
| 101 | 1152 | : border_value; | |
| 102 |
4/4✓ Branch 0 taken 1189 times.
✓ Branch 1 taken 779 times.
✓ Branch 2 taken 95 times.
✓ Branch 3 taken 385 times.
|
2448 | uint64_t pixel2 = vgetq_lane_u32(in_range, 2) |
| 103 | 1284 | ? src_rows[vgetq_lane_u64(indices_high, 0)] | |
| 104 | 1164 | : border_value; | |
| 105 |
4/4✓ Branch 0 taken 1203 times.
✓ Branch 1 taken 765 times.
✓ Branch 2 taken 112 times.
✓ Branch 3 taken 368 times.
|
2448 | uint64_t pixel3 = vgetq_lane_u32(in_range, 3) |
| 106 | 1315 | ? src_rows[vgetq_lane_u64(indices_high, 1)] | |
| 107 | 1133 | : border_value; | |
| 108 | 2448 | uint64x2_t rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32)); | |
| 109 | 2448 | rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32)); | |
| 110 | 2448 | rawsrc = vsetq_lane_u64(pixel2 | (pixel3 << 32), rawsrc, 1); | |
| 111 | 4896 | return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); | |
| 112 | 2448 | } else { | |
| 113 | 470704 | uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); | |
| 114 |
4/4✓ Branch 0 taken 217042 times.
✓ Branch 1 taken 220982 times.
✓ Branch 2 taken 31510 times.
✓ Branch 3 taken 1170 times.
|
470704 | uint64_t pixel0 = vgetq_lane_u32(in_range, 0) |
| 115 | 248552 | ? src_rows[vgetq_lane_u32(indices, 0)] | |
| 116 | 222152 | : border_value; | |
| 117 |
4/4✓ Branch 0 taken 216282 times.
✓ Branch 1 taken 221742 times.
✓ Branch 2 taken 31516 times.
✓ Branch 3 taken 1164 times.
|
470704 | uint64_t pixel1 = vgetq_lane_u32(in_range, 1) |
| 118 | 247798 | ? src_rows[vgetq_lane_u32(indices, 1)] | |
| 119 | 222906 | : border_value; | |
| 120 |
4/4✓ Branch 0 taken 215275 times.
✓ Branch 1 taken 222749 times.
✓ Branch 2 taken 31536 times.
✓ Branch 3 taken 1144 times.
|
470704 | uint64_t pixel2 = vgetq_lane_u32(in_range, 2) |
| 121 | 246811 | ? src_rows[vgetq_lane_u32(indices, 2)] | |
| 122 | 223893 | : border_value; | |
| 123 |
4/4✓ Branch 0 taken 214211 times.
✓ Branch 1 taken 223813 times.
✓ Branch 2 taken 31440 times.
✓ Branch 3 taken 1240 times.
|
470704 | uint64_t pixel3 = vgetq_lane_u32(in_range, 3) |
| 124 | 245651 | ? src_rows[vgetq_lane_u32(indices, 3)] | |
| 125 | 225053 | : border_value; | |
| 126 | 470704 | uint64x2_t rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32)); | |
| 127 | 470704 | rawsrc = vdupq_n_u64(pixel0 | (pixel1 << 32)); | |
| 128 | 470704 | rawsrc = vsetq_lane_u64(pixel2 | (pixel3 << 32), rawsrc, 1); | |
| 129 | 941408 | return vcvtq_f32_u32(vreinterpretq_u32_u64(rawsrc)); | |
| 130 | 470704 | } | |
| 131 | } | ||
| 132 | |||
| 133 | template <typename ScalarType, bool IsLarge> | ||
| 134 | 69920 | float32x4x2_t inline load_xy_or_border_2ch(uint32x4_t x, uint32x4_t y, | |
| 135 | uint32x4_t in_range, | ||
| 136 | const ScalarType* border_values, | ||
| 137 | uint32x4_t v_src_stride, | ||
| 138 | Rows<const ScalarType> src_rows) { | ||
| 139 | 69920 | const size_t kBytes = 2 * sizeof(ScalarType); | |
| 140 | 69920 | const ScalarType *pixel0{}, *pixel1{}, *pixel2{}, *pixel3{}; | |
| 141 | 69920 | ScalarType elements[4 * 2]; // 4 pixels, 2 channels | |
| 142 | // Multiply x with the number of channels | ||
| 143 | 69920 | x = vshlq_n_u32(x, 1); | |
| 144 | if constexpr (IsLarge) { | ||
| 145 | 2880 | uint64x2_t indices_low = | |
| 146 | 2880 | vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), | |
| 147 | 1440 | vget_low_u32(v_src_stride)); | |
| 148 | 2880 | uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), | |
| 149 | 1440 | vget_low_u32(v_src_stride)); | |
| 150 |
4/4✓ Branch 0 taken 236 times.
✓ Branch 1 taken 724 times.
✓ Branch 2 taken 117 times.
✓ Branch 3 taken 363 times.
|
1440 | pixel0 = vgetq_lane_u32(in_range, 0) |
| 151 | 353 | ? &src_rows[vgetq_lane_u64(indices_low, 0)] | |
| 152 | 1087 | : border_values; | |
| 153 |
4/4✓ Branch 0 taken 207 times.
✓ Branch 1 taken 753 times.
✓ Branch 2 taken 105 times.
✓ Branch 3 taken 375 times.
|
1440 | pixel1 = vgetq_lane_u32(in_range, 1) |
| 154 | 312 | ? &src_rows[vgetq_lane_u64(indices_low, 1)] | |
| 155 | 1128 | : border_values; | |
| 156 |
4/4✓ Branch 0 taken 205 times.
✓ Branch 1 taken 755 times.
✓ Branch 2 taken 95 times.
✓ Branch 3 taken 385 times.
|
1440 | pixel2 = vgetq_lane_u32(in_range, 2) |
| 157 | 300 | ? &src_rows[vgetq_lane_u64(indices_high, 0)] | |
| 158 | 1140 | : border_values; | |
| 159 |
4/4✓ Branch 0 taken 219 times.
✓ Branch 1 taken 741 times.
✓ Branch 2 taken 112 times.
✓ Branch 3 taken 368 times.
|
1440 | pixel3 = vgetq_lane_u32(in_range, 3) |
| 160 | 331 | ? &src_rows[vgetq_lane_u64(indices_high, 1)] | |
| 161 | 1109 | : border_values; | |
| 162 | 1440 | } else { | |
| 163 | 68480 | uint32x4_t indices = vmlaq_u32(x, y, v_src_stride); | |
| 164 |
4/4✓ Branch 0 taken 32803 times.
✓ Branch 1 taken 2517 times.
✓ Branch 2 taken 31879 times.
✓ Branch 3 taken 1281 times.
|
68480 | pixel0 = vgetq_lane_u32(in_range, 0) ? &src_rows[vgetq_lane_u32(indices, 0)] |
| 165 | 3798 | : border_values; | |
| 166 |
4/4✓ Branch 0 taken 32776 times.
✓ Branch 1 taken 2544 times.
✓ Branch 2 taken 31885 times.
✓ Branch 3 taken 1275 times.
|
68480 | pixel1 = vgetq_lane_u32(in_range, 1) ? &src_rows[vgetq_lane_u32(indices, 1)] |
| 167 | 3819 | : border_values; | |
| 168 |
4/4✓ Branch 0 taken 32711 times.
✓ Branch 1 taken 2609 times.
✓ Branch 2 taken 31898 times.
✓ Branch 3 taken 1262 times.
|
68480 | pixel2 = vgetq_lane_u32(in_range, 2) ? &src_rows[vgetq_lane_u32(indices, 2)] |
| 169 | 3871 | : border_values; | |
| 170 |
4/4✓ Branch 0 taken 32689 times.
✓ Branch 1 taken 2631 times.
✓ Branch 2 taken 31795 times.
✓ Branch 3 taken 1365 times.
|
68480 | pixel3 = vgetq_lane_u32(in_range, 3) ? &src_rows[vgetq_lane_u32(indices, 3)] |
| 171 | 3996 | : border_values; | |
| 172 | 68480 | } | |
| 173 | 69920 | memcpy(&elements[0], pixel0, kBytes); | |
| 174 | 69920 | memcpy(&elements[2], pixel1, kBytes); | |
| 175 | 69920 | memcpy(&elements[4], pixel2, kBytes); | |
| 176 | 69920 | memcpy(&elements[6], pixel3, kBytes); | |
| 177 | 69920 | uint16x8_t pixels16{}; | |
| 178 | if constexpr (std::is_same<ScalarType, uint8_t>::value) { | ||
| 179 | 36280 | pixels16 = vmovl_u8(vld1_u8(elements)); | |
| 180 | } else if constexpr (std::is_same<ScalarType, uint16_t>::value) { | ||
| 181 | 33640 | pixels16 = vld1q_u16(elements); | |
| 182 | } | ||
| 183 | float32x4x2_t result; | ||
| 184 | 69920 | result.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(pixels16))); | |
| 185 | 69920 | result.val[1] = vcvtq_f32_u32(vmovl_high_u16(pixels16)); | |
| 186 | return result; | ||
| 187 | 69920 | } | |
| 188 | |||
| 189 | template <typename ScalarType, bool IsLarge> | ||
| 190 | 230206 | void load_quad_pixels_replicate(FloatVectorPair xy, uint32x4_t v_xmax, | |
| 191 | uint32x4_t v_ymax, uint32x4_t v_src_stride, | ||
| 192 | Rows<const ScalarType> src_rows, | ||
| 193 | float32x4_t& xfrac, float32x4_t& yfrac, | ||
| 194 | float32x4_t& a, float32x4_t& b, float32x4_t& c, | ||
| 195 | float32x4_t& d) { | ||
| 196 | 230206 | auto&& [xf, yf] = xy; | |
| 197 | // Truncating convert to int | ||
| 198 | 230206 | uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(xf), v_xmax); | |
| 199 | 230206 | uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(yf), v_ymax); | |
| 200 | |||
| 201 | // Get fractional part, or 0 if out of range | ||
| 202 | 230206 | float32x4_t zero = vdupq_n_f32(0.F); | |
| 203 | 230206 | uint32x4_t x_in_range = vandq_u32(vcgeq_f32(xf, zero), vcltq_u32(x0, v_xmax)); | |
| 204 | 230206 | uint32x4_t y_in_range = vandq_u32(vcgeq_f32(yf, zero), vcltq_u32(y0, v_ymax)); | |
| 205 | 230206 | xfrac = vsubq_f32(xf, vrndmq_f32(xf)); | |
| 206 | 230206 | yfrac = vsubq_f32(yf, vrndmq_f32(yf)); | |
| 207 | |||
| 208 | // x1 = x0 + 1, except if it's already xmax or out of range | ||
| 209 | 230206 | uint32x4_t x1 = vsubq_u32(x0, x_in_range); | |
| 210 | 230206 | uint32x4_t y1 = vsubq_u32(y0, y_in_range); | |
| 211 | |||
| 212 | // a: top left, b: top right, c: bottom left, d: bottom right | ||
| 213 | 230206 | a = load_xy<ScalarType, IsLarge>(x0, y0, v_src_stride, src_rows); | |
| 214 | 230206 | b = load_xy<ScalarType, IsLarge>(x1, y0, v_src_stride, src_rows); | |
| 215 | 230206 | c = load_xy<ScalarType, IsLarge>(x0, y1, v_src_stride, src_rows); | |
| 216 | 230206 | d = load_xy<ScalarType, IsLarge>(x1, y1, v_src_stride, src_rows); | |
| 217 | 230206 | } | |
| 218 | |||
| 219 | template <typename ScalarType, bool IsLarge> | ||
| 220 | 17480 | void load_quad_pixels_replicate_2ch(FloatVectorPair xy, uint32x4_t v_xmax, | |
| 221 | uint32x4_t v_ymax, uint32x4_t v_src_stride, | ||
| 222 | Rows<const ScalarType> src_rows, | ||
| 223 | float32x4_t& xfrac, float32x4_t& yfrac, | ||
| 224 | float32x4x2_t& a, float32x4x2_t& b, | ||
| 225 | float32x4x2_t& c, float32x4x2_t& d) { | ||
| 226 | 17480 | auto&& [xf, yf] = xy; | |
| 227 | // Truncating convert to int | ||
| 228 | 17480 | uint32x4_t x0 = vminq_u32(vcvtmq_u32_f32(xf), v_xmax); | |
| 229 | 17480 | uint32x4_t y0 = vminq_u32(vcvtmq_u32_f32(yf), v_ymax); | |
| 230 | |||
| 231 | // Get fractional part, or 0 if out of range | ||
| 232 | 17480 | float32x4_t zero = vdupq_n_f32(0.F); | |
| 233 | 17480 | uint32x4_t x_in_range = vandq_u32(vcgeq_f32(xf, zero), vcltq_u32(x0, v_xmax)); | |
| 234 | 17480 | uint32x4_t y_in_range = vandq_u32(vcgeq_f32(yf, zero), vcltq_u32(y0, v_ymax)); | |
| 235 | 17480 | xfrac = vsubq_f32(xf, vrndmq_f32(xf)); | |
| 236 | 17480 | yfrac = vsubq_f32(yf, vrndmq_f32(yf)); | |
| 237 | |||
| 238 | // x1 = x0 + 1, except if it's already xmax or out of range | ||
| 239 | 17480 | uint32x4_t x1 = vsubq_u32(x0, x_in_range); | |
| 240 | 17480 | uint32x4_t y1 = vsubq_u32(y0, y_in_range); | |
| 241 | |||
| 242 | // a: top left, b: top right, c: bottom left, d: bottom right | ||
| 243 | 17480 | a = load_xy_2ch<ScalarType, IsLarge>(x0, y0, v_src_stride, src_rows); | |
| 244 | 17480 | b = load_xy_2ch<ScalarType, IsLarge>(x1, y0, v_src_stride, src_rows); | |
| 245 | 17480 | c = load_xy_2ch<ScalarType, IsLarge>(x0, y1, v_src_stride, src_rows); | |
| 246 | 17480 | d = load_xy_2ch<ScalarType, IsLarge>(x1, y1, v_src_stride, src_rows); | |
| 247 | 17480 | } | |
| 248 | |||
| 249 | template <typename ScalarType, bool IsLarge> | ||
| 250 | 118288 | void load_quad_pixels_constant(FloatVectorPair xy, uint32x4_t v_xmax, | |
| 251 | uint32x4_t v_ymax, uint32x4_t v_src_stride, | ||
| 252 | const ScalarType* border_values, | ||
| 253 | Rows<const ScalarType> src_rows, | ||
| 254 | float32x4_t& xfrac, float32x4_t& yfrac, | ||
| 255 | float32x4_t& a, float32x4_t& b, float32x4_t& c, | ||
| 256 | float32x4_t& d) { | ||
| 257 | 118288 | auto&& [xf, yf] = xy; | |
| 258 | // Convert coordinates to integers, truncating towards minus infinity. | ||
| 259 | // Negative numbers will become large positive numbers. | ||
| 260 | // Since the source width and height is known to be <=2^24 these large | ||
| 261 | // positive numbers will always be treated as outside the source image | ||
| 262 | // bounds. | ||
| 263 | 118288 | uint32x4_t x0 = vreinterpretq_u32_s32(vcvtmq_s32_f32(xf)); | |
| 264 | 118288 | uint32x4_t y0 = vreinterpretq_u32_s32(vcvtmq_s32_f32(yf)); | |
| 265 | 118288 | uint32x4_t x1 = vaddq(x0, vdupq_n_u32(1)); | |
| 266 | 118288 | uint32x4_t y1 = vaddq(y0, vdupq_n_u32(1)); | |
| 267 | 118288 | xfrac = vsubq_f32(xf, vrndmq_f32(xf)); | |
| 268 | 118288 | yfrac = vsubq_f32(yf, vrndmq_f32(yf)); | |
| 269 | 118288 | uint32x4_t a_in_range, b_in_range, c_in_range, d_in_range; | |
| 270 | { | ||
| 271 | 118288 | uint32x4_t x0_in_range = vcleq_u32(x0, v_xmax); | |
| 272 | 118288 | uint32x4_t y0_in_range = vcleq_u32(y0, v_ymax); | |
| 273 | 118288 | uint32x4_t x1_in_range = vcleq_u32(x1, v_xmax); | |
| 274 | 118288 | uint32x4_t y1_in_range = vcleq_u32(y1, v_ymax); | |
| 275 | 118288 | a_in_range = vandq(x0_in_range, y0_in_range); | |
| 276 | 118288 | b_in_range = vandq(x1_in_range, y0_in_range); | |
| 277 | 118288 | c_in_range = vandq(x0_in_range, y1_in_range); | |
| 278 | 118288 | d_in_range = vandq(x1_in_range, y1_in_range); | |
| 279 | 118288 | } | |
| 280 | 118288 | a = load_xy_or_border<ScalarType, IsLarge>( | |
| 281 | 118288 | x0, y0, a_in_range, border_values[0], v_src_stride, src_rows); | |
| 282 | 118288 | b = load_xy_or_border<ScalarType, IsLarge>( | |
| 283 | 118288 | x1, y0, b_in_range, border_values[0], v_src_stride, src_rows); | |
| 284 | 118288 | c = load_xy_or_border<ScalarType, IsLarge>( | |
| 285 | 118288 | x0, y1, c_in_range, border_values[0], v_src_stride, src_rows); | |
| 286 | 118288 | d = load_xy_or_border<ScalarType, IsLarge>( | |
| 287 | 118288 | x1, y1, d_in_range, border_values[0], v_src_stride, src_rows); | |
| 288 | 118288 | } | |
| 289 | |||
| 290 | template <typename ScalarType, bool IsLarge> | ||
| 291 | 17480 | void load_quad_pixels_constant_2ch(FloatVectorPair xy, uint32x4_t v_xmax, | |
| 292 | uint32x4_t v_ymax, uint32x4_t v_src_stride, | ||
| 293 | const ScalarType* border_values, | ||
| 294 | Rows<const ScalarType> src_rows, | ||
| 295 | float32x4_t& xfrac, float32x4_t& yfrac, | ||
| 296 | float32x4x2_t& a, float32x4x2_t& b, | ||
| 297 | float32x4x2_t& c, float32x4x2_t& d) { | ||
| 298 | 17480 | auto&& [xf, yf] = xy; | |
| 299 | // Convert coordinates to integers, truncating towards minus infinity. | ||
| 300 | // Negative numbers will become large positive numbers. | ||
| 301 | // Since the source width and height is known to be <=2^24 these large | ||
| 302 | // positive numbers will always be treated as outside the source image | ||
| 303 | // bounds. | ||
| 304 | 17480 | uint32x4_t x0 = vreinterpretq_u32_s32(vcvtmq_s32_f32(xf)); | |
| 305 | 17480 | uint32x4_t y0 = vreinterpretq_u32_s32(vcvtmq_s32_f32(yf)); | |
| 306 | 17480 | uint32x4_t x1 = vaddq(x0, vdupq_n_u32(1)); | |
| 307 | 17480 | uint32x4_t y1 = vaddq(y0, vdupq_n_u32(1)); | |
| 308 | 17480 | xfrac = vsubq_f32(xf, vrndmq_f32(xf)); | |
| 309 | 17480 | yfrac = vsubq_f32(yf, vrndmq_f32(yf)); | |
| 310 | 17480 | uint32x4_t a_in_range, b_in_range, c_in_range, d_in_range; | |
| 311 | { | ||
| 312 | 17480 | uint32x4_t x0_in_range = vcleq_u32(x0, v_xmax); | |
| 313 | 17480 | uint32x4_t y0_in_range = vcleq_u32(y0, v_ymax); | |
| 314 | 17480 | uint32x4_t x1_in_range = vcleq_u32(x1, v_xmax); | |
| 315 | 17480 | uint32x4_t y1_in_range = vcleq_u32(y1, v_ymax); | |
| 316 | 17480 | a_in_range = vandq(x0_in_range, y0_in_range); | |
| 317 | 17480 | b_in_range = vandq(x1_in_range, y0_in_range); | |
| 318 | 17480 | c_in_range = vandq(x0_in_range, y1_in_range); | |
| 319 | 17480 | d_in_range = vandq(x1_in_range, y1_in_range); | |
| 320 | 17480 | } | |
| 321 | 34960 | a = load_xy_or_border_2ch<ScalarType, IsLarge>( | |
| 322 | 17480 | x0, y0, a_in_range, border_values, v_src_stride, src_rows); | |
| 323 | 34960 | b = load_xy_or_border_2ch<ScalarType, IsLarge>( | |
| 324 | 17480 | x1, y0, b_in_range, border_values, v_src_stride, src_rows); | |
| 325 | 34960 | c = load_xy_or_border_2ch<ScalarType, IsLarge>( | |
| 326 | 17480 | x0, y1, c_in_range, border_values, v_src_stride, src_rows); | |
| 327 | 34960 | d = load_xy_or_border_2ch<ScalarType, IsLarge>( | |
| 328 | 17480 | x1, y1, d_in_range, border_values, v_src_stride, src_rows); | |
| 329 | 17480 | } | |
| 330 | |||
| 331 | 418414 | inline uint32x4_t lerp_2d(float32x4_t xfrac, float32x4_t yfrac, float32x4_t a, | |
| 332 | float32x4_t b, float32x4_t c, float32x4_t d) { | ||
| 333 | 418414 | float32x4_t line0 = vmlaq_f32(a, vsubq_f32(b, a), xfrac); | |
| 334 | 418414 | float32x4_t line1 = vmlaq_f32(c, vsubq_f32(d, c), xfrac); | |
| 335 | 418414 | float32x4_t result = vmlaq_f32(line0, vsubq_f32(line1, line0), yfrac); | |
| 336 | 836828 | return vcvtaq_u32_f32(result); | |
| 337 | 418414 | } | |
| 338 | |||
| 339 | template <typename ScalarType, bool IsLarge, size_t Channels> | ||
| 340 | 200176 | void transform_pixels_replicate(float32x4_t xf, float32x4_t yf, | |
| 341 | uint32x4_t v_xmax, uint32x4_t v_ymax, | ||
| 342 | uint32x4_t v_src_element_stride, | ||
| 343 | Rows<const ScalarType> src_rows, | ||
| 344 | Columns<ScalarType> dst) { | ||
| 345 | // Round to nearest, with Ties To Away (i.e. round 0.5 up) | ||
| 346 | // Clamp coordinates to within the dimensions of the source image | ||
| 347 | // (vcvtaq already converted negative values to 0) | ||
| 348 | 200176 | uint32x4_t x = vminq_u32(vcvtaq_u32_f32(xf), v_xmax); | |
| 349 | 200176 | uint32x4_t y = vminq_u32(vcvtaq_u32_f32(yf), v_ymax); | |
| 350 | if constexpr (Channels == 2) { | ||
| 351 | // Multiply x with the number of channels | ||
| 352 | 1976 | x = vshlq_n_u32(x, 1); | |
| 353 | } | ||
| 354 | // Copy pixels from source | ||
| 355 | if constexpr (IsLarge) { | ||
| 356 | 2448 | uint64x2_t indices_low = | |
| 357 | 2448 | vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), | |
| 358 | 1224 | vget_low_u32(v_src_element_stride)); | |
| 359 | 2448 | uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), | |
| 360 | 1224 | vget_low_u32(v_src_element_stride)); | |
| 361 | if constexpr (Channels == 1) { | ||
| 362 | 864 | dst[0] = src_rows[vgetq_lane_u64(indices_low, 0)]; | |
| 363 | 864 | dst[1] = src_rows[vgetq_lane_u64(indices_low, 1)]; | |
| 364 | 864 | dst[2] = src_rows[vgetq_lane_u64(indices_high, 0)]; | |
| 365 | 864 | dst[3] = src_rows[vgetq_lane_u64(indices_high, 1)]; | |
| 366 | } else { | ||
| 367 | 360 | const size_t kBytes = Channels * sizeof(ScalarType); | |
| 368 | 360 | memcpy(dst.ptr_at(0), &src_rows[vgetq_lane_u64(indices_low, 0)], kBytes); | |
| 369 | 360 | memcpy(dst.ptr_at(1), &src_rows[vgetq_lane_u64(indices_low, 1)], kBytes); | |
| 370 | 360 | memcpy(dst.ptr_at(2), &src_rows[vgetq_lane_u64(indices_high, 0)], kBytes); | |
| 371 | 360 | memcpy(dst.ptr_at(3), &src_rows[vgetq_lane_u64(indices_high, 1)], kBytes); | |
| 372 | 360 | } | |
| 373 | 1224 | } else { | |
| 374 | 198952 | uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride); | |
| 375 | if constexpr (Channels == 1) { | ||
| 376 | 197336 | dst[0] = src_rows[vgetq_lane_u32(indices, 0)]; | |
| 377 | 197336 | dst[1] = src_rows[vgetq_lane_u32(indices, 1)]; | |
| 378 | 197336 | dst[2] = src_rows[vgetq_lane_u32(indices, 2)]; | |
| 379 | 197336 | dst[3] = src_rows[vgetq_lane_u32(indices, 3)]; | |
| 380 | } else { | ||
| 381 | 1616 | const size_t kBytes = Channels * sizeof(ScalarType); | |
| 382 | 1616 | memcpy(dst.ptr_at(0), &src_rows[vgetq_lane_u32(indices, 0)], kBytes); | |
| 383 | 1616 | memcpy(dst.ptr_at(1), &src_rows[vgetq_lane_u32(indices, 1)], kBytes); | |
| 384 | 1616 | memcpy(dst.ptr_at(2), &src_rows[vgetq_lane_u32(indices, 2)], kBytes); | |
| 385 | 1616 | memcpy(dst.ptr_at(3), &src_rows[vgetq_lane_u32(indices, 3)], kBytes); | |
| 386 | 1616 | } | |
| 387 | 198952 | } | |
| 388 | 200176 | } | |
| 389 | |||
| 390 | template <size_t Lane, typename ScalarType> | ||
| 391 | 382248 | static const ScalarType* get_src_or_border_small( | |
| 392 | uint32x4_t in_range, Rows<const ScalarType> src_rows, uint32x4_t indices, | ||
| 393 | const ScalarType* border_values) { | ||
| 394 |
16/16✓ Branch 0 taken 74717 times.
✓ Branch 1 taken 19889 times.
✓ Branch 2 taken 74737 times.
✓ Branch 3 taken 19869 times.
✓ Branch 4 taken 74677 times.
✓ Branch 5 taken 19929 times.
✓ Branch 6 taken 74608 times.
✓ Branch 7 taken 19998 times.
✓ Branch 8 taken 446 times.
✓ Branch 9 taken 510 times.
✓ Branch 10 taken 442 times.
✓ Branch 11 taken 514 times.
✓ Branch 12 taken 474 times.
✓ Branch 13 taken 482 times.
✓ Branch 14 taken 412 times.
✓ Branch 15 taken 544 times.
|
382248 | return vgetq_lane_u32(in_range, Lane) |
| 395 | 300513 | ? &src_rows[vgetq_lane_u32(indices, Lane)] | |
| 396 | 81735 | : border_values; | |
| 397 | } | ||
| 398 | |||
| 399 | template <size_t Lane, typename ScalarType> | ||
| 400 | 3888 | static const ScalarType* get_src_or_border_large( | |
| 401 | uint32x4_t in_range, Rows<const ScalarType> src_rows, uint64x2_t indices, | ||
| 402 | const ScalarType* border_values) { | ||
| 403 |
16/16✓ Branch 0 taken 200 times.
✓ Branch 1 taken 532 times.
✓ Branch 2 taken 190 times.
✓ Branch 3 taken 542 times.
✓ Branch 4 taken 164 times.
✓ Branch 5 taken 568 times.
✓ Branch 6 taken 182 times.
✓ Branch 7 taken 550 times.
✓ Branch 8 taken 72 times.
✓ Branch 9 taken 168 times.
✓ Branch 10 taken 70 times.
✓ Branch 11 taken 170 times.
✓ Branch 12 taken 60 times.
✓ Branch 13 taken 180 times.
✓ Branch 14 taken 64 times.
✓ Branch 15 taken 176 times.
|
3888 | return vgetq_lane_u32(in_range, Lane) |
| 404 | 1002 | ? &src_rows[vgetq_lane_u64(indices, Lane % 2)] | |
| 405 | 2886 | : border_values; | |
| 406 | } | ||
| 407 | |||
| 408 | template <typename ScalarType, bool IsLarge, size_t Channels> | ||
| 409 | 96534 | void transform_pixels_constant(float32x4_t xf, float32x4_t yf, | |
| 410 | uint32x4_t v_xmax, uint32x4_t v_ymax, | ||
| 411 | uint32x4_t v_src_element_stride, | ||
| 412 | Rows<const ScalarType> src_rows, | ||
| 413 | Columns<ScalarType> dst, | ||
| 414 | const ScalarType* border_values) { | ||
| 415 | // Convert coordinates to integers. | ||
| 416 | // Negative numbers will become large positive numbers. | ||
| 417 | // Since the source width and height is known to be <=2^24 these large | ||
| 418 | // positive numbers will always be treated as outside the source image | ||
| 419 | // bounds. | ||
| 420 | 96534 | uint32x4_t x = vreinterpretq_u32_s32(vcvtaq_s32_f32(xf)); | |
| 421 | 96534 | uint32x4_t y = vreinterpretq_u32_s32(vcvtaq_s32_f32(yf)); | |
| 422 | 96534 | uint32x4_t in_range = vandq_u32(vcleq_u32(x, v_xmax), vcleq_u32(y, v_ymax)); | |
| 423 | |||
| 424 | // Copy pixels from source | ||
| 425 | if constexpr (Channels == 1) { | ||
| 426 | if constexpr (IsLarge) { | ||
| 427 | 1224 | uint64x2_t indices_low = | |
| 428 | 1224 | vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), | |
| 429 | 612 | vget_low_u32(v_src_element_stride)); | |
| 430 | 1224 | uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), | |
| 431 | 612 | vget_low_u32(v_src_element_stride)); | |
| 432 | 1224 | dst[0] = *get_src_or_border_large<0>(in_range, src_rows, indices_low, | |
| 433 | 612 | border_values); | |
| 434 | 1224 | dst[1] = *get_src_or_border_large<1>(in_range, src_rows, indices_low, | |
| 435 | 612 | border_values); | |
| 436 | 1224 | dst[2] = *get_src_or_border_large<2>(in_range, src_rows, indices_high, | |
| 437 | 612 | border_values); | |
| 438 | 1224 | dst[3] = *get_src_or_border_large<3>(in_range, src_rows, indices_high, | |
| 439 | 612 | border_values); | |
| 440 | 612 | } else { | |
| 441 | 93946 | uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride); | |
| 442 | 187892 | dst[0] = *get_src_or_border_small<0>(in_range, src_rows, indices, | |
| 443 | 93946 | border_values); | |
| 444 | 187892 | dst[1] = *get_src_or_border_small<1>(in_range, src_rows, indices, | |
| 445 | 93946 | border_values); | |
| 446 | 187892 | dst[2] = *get_src_or_border_small<2>(in_range, src_rows, indices, | |
| 447 | 93946 | border_values); | |
| 448 | 187892 | dst[3] = *get_src_or_border_small<3>(in_range, src_rows, indices, | |
| 449 | 93946 | border_values); | |
| 450 | 93946 | } | |
| 451 | } else { // Channels > 1 | ||
| 452 | 1976 | const size_t kBytes = Channels * sizeof(ScalarType); | |
| 453 | 1976 | const ScalarType *pixel0{}, *pixel1{}, *pixel2{}, *pixel3{}; | |
| 454 | // Multiply x with the number of channels | ||
| 455 | if constexpr (Channels == 2) { | ||
| 456 | 1976 | x = vshlq_n_u32(x, 1); | |
| 457 | } else { | ||
| 458 | x = vmulq_n_u32(x, Channels); | ||
| 459 | } | ||
| 460 | if constexpr (IsLarge) { | ||
| 461 | 720 | uint64x2_t indices_low = | |
| 462 | 720 | vmlal_u32(vmovl_u32(vget_low_u32(x)), vget_low_u32(y), | |
| 463 | 360 | vget_low_u32(v_src_element_stride)); | |
| 464 | 720 | uint64x2_t indices_high = vmlal_u32(vmovl_high_u32(x), vget_high_u32(y), | |
| 465 | 360 | vget_low_u32(v_src_element_stride)); | |
| 466 | 720 | pixel0 = get_src_or_border_large<0>(in_range, src_rows, indices_low, | |
| 467 | 360 | border_values); | |
| 468 | 720 | pixel1 = get_src_or_border_large<1>(in_range, src_rows, indices_low, | |
| 469 | 360 | border_values); | |
| 470 | 720 | pixel2 = get_src_or_border_large<2>(in_range, src_rows, indices_high, | |
| 471 | 360 | border_values); | |
| 472 | 720 | pixel3 = get_src_or_border_large<3>(in_range, src_rows, indices_high, | |
| 473 | 360 | border_values); | |
| 474 | |||
| 475 | 360 | } else { | |
| 476 | 1616 | uint32x4_t indices = vmlaq_u32(x, y, v_src_element_stride); | |
| 477 | 3232 | pixel0 = get_src_or_border_small<0>(in_range, src_rows, indices, | |
| 478 | 1616 | border_values); | |
| 479 | 3232 | pixel1 = get_src_or_border_small<1>(in_range, src_rows, indices, | |
| 480 | 1616 | border_values); | |
| 481 | 3232 | pixel2 = get_src_or_border_small<2>(in_range, src_rows, indices, | |
| 482 | 1616 | border_values); | |
| 483 | 3232 | pixel3 = get_src_or_border_small<3>(in_range, src_rows, indices, | |
| 484 | 1616 | border_values); | |
| 485 | 1616 | } | |
| 486 | 1976 | memcpy(dst.ptr_at(0), pixel0, kBytes); | |
| 487 | 1976 | memcpy(dst.ptr_at(1), pixel1, kBytes); | |
| 488 | 1976 | memcpy(dst.ptr_at(2), pixel2, kBytes); | |
| 489 | 1976 | memcpy(dst.ptr_at(3), pixel3, kBytes); | |
| 490 | 1976 | } | |
| 491 | 96534 | } | |
| 492 | |||
| 493 | template <typename ScalarType, bool IsLarge, size_t Channels, | ||
| 494 | kleidicv_border_type_t Border> | ||
| 495 | 7184 | void transform_pixels(float32x4_t xf, float32x4_t yf, uint32x4_t v_xmax, | |
| 496 | uint32x4_t v_ymax, uint32x4_t v_src_element_stride, | ||
| 497 | Rows<const ScalarType> src_rows, Columns<ScalarType> dst, | ||
| 498 | [[maybe_unused]] const ScalarType* border_values) { | ||
| 499 | if constexpr (Border == KLEIDICV_BORDER_TYPE_REPLICATE) { | ||
| 500 | 3592 | transform_pixels_replicate<ScalarType, IsLarge, Channels>( | |
| 501 | 3592 | xf, yf, v_xmax, v_ymax, v_src_element_stride, src_rows, dst); | |
| 502 | } else { | ||
| 503 | static_assert(Border == KLEIDICV_BORDER_TYPE_CONSTANT); | ||
| 504 | 3592 | transform_pixels_constant<ScalarType, IsLarge, Channels>( | |
| 505 | 3592 | xf, yf, v_xmax, v_ymax, v_src_element_stride, src_rows, dst, | |
| 506 | 3592 | border_values); | |
| 507 | } | ||
| 508 | 7184 | } | |
| 509 | |||
| 510 | } // namespace kleidicv::neon | ||
| 511 |