| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
| 2 | // | ||
| 3 | // SPDX-License-Identifier: Apache-2.0 | ||
| 4 | |||
| 5 | #include <cassert> | ||
| 6 | #include <cmath> | ||
| 7 | #include <cstddef> | ||
| 8 | #include <cstdint> | ||
| 9 | |||
| 10 | #include "kleidicv/sve2.h" | ||
| 11 | #include "kleidicv/traits.h" | ||
| 12 | #include "kleidicv/transform/remap.h" | ||
| 13 | #include "transform_sve2.h" | ||
| 14 | |||
| 15 | namespace kleidicv::sve2 { | ||
| 16 | |||
// Interpolates between four source pixels per 16-bit lane using the packed
// fractional weights in `frac`. Only declared here; the specialisations
// visible in this file are for uint8_t and uint16_t source pixels.
//   pg           - governing predicate for the 16-bit lanes.
//   frac         - packed fractions: the x fraction in the low bits and the
//                  y fraction REMAP16POINT5_FRAC_BITS bits above it.
//   src_a..src_d - the four neighbours; the specialisations blend a with b
//                  and c with d along x, then the two results along y.
//   bias         - added to each 32-bit accumulator before the final shift.
// Returns one interpolated value per 16-bit lane.
| 17 | template <typename ScalarType> | ||
| 18 | inline svuint16_t interpolate_16point5(svbool_t pg, svuint16_t frac, | ||
| 19 | svuint16_t src_a, svuint16_t src_b, | ||
| 20 | svuint16_t src_c, svuint16_t src_d, | ||
| 21 | svuint32_t bias); | ||
| 22 | |||
// uint8_t specialisation. Per 16-bit lane it evaluates
//   (bias + (a*nx + b*x) * ny + (c*nx + d*x) * y)
//       >> (2 * REMAP16POINT5_FRAC_BITS)
// where x and y are the fractions unpacked from `frac`,
// nx = REMAP16POINT5_FRAC_MAX - x and ny = REMAP16POINT5_FRAC_MAX - y.
// The x blend is carried out in 16-bit lanes.
// NOTE(review): this presumably relies on the sources holding 8-bit values so
// the 16-bit sums cannot overflow - confirm against REMAP16POINT5_FRAC_MAX.
| 23 | template <> | ||
| 24 | 10546 | inline svuint16_t interpolate_16point5<uint8_t>( | |
| 25 | svbool_t pg, svuint16_t frac, svuint16_t src_a, svuint16_t src_b, | ||
| 26 | svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { | ||
// Unpack the x fraction (low bits) and the y fraction (the bits above it).
| 27 | 10546 | svuint16_t xfrac = svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
| 28 | 21092 | svuint16_t yfrac = | |
| 29 | 21092 | svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), | |
| 30 | 10546 | svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
// Complementary weights for the opposite neighbours.
| 31 | 21092 | svuint16_t nxfrac = | |
| 32 | 10546 | svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac); | |
| 33 | 21092 | svuint16_t nyfrac = | |
| 34 | 10546 | svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac); | |
// Blend along x: line0 from (a, b), line1 from (c, d).
| 35 | 10546 | svuint16_t line0 = svmla_x(pg, svmul_x(pg, xfrac, src_b), nxfrac, src_a); | |
| 36 | 10546 | svuint16_t line1 = svmla_x(pg, svmul_x(pg, xfrac, src_d), nxfrac, src_c); | |
| 37 | |||
// Blend along y with widening multiply-accumulates; the *_b accumulator takes
// the even-numbered 16-bit lanes and the *_t accumulator the odd-numbered ones.
| 38 | 10546 | svuint32_t acc_b = svmlalb_u32(bias, line0, nyfrac); | |
| 39 | 10546 | svuint32_t acc_t = svmlalt_u32(bias, line0, nyfrac); | |
| 40 | 10546 | acc_b = svmlalb_u32(acc_b, line1, yfrac); | |
| 41 | 10546 | acc_t = svmlalt_u32(acc_t, line1, yfrac); | |
| 42 | |||
// Shift out both sets of fraction bits and narrow back to 16-bit lanes,
// writing the even and odd results back into their original positions.
| 43 | 21092 | return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, | |
| 44 | 2ULL * REMAP16POINT5_FRAC_BITS); | ||
| 45 | 10546 | } | |
| 46 | |||
// uint16_t specialisation. Per 16-bit lane it evaluates
//   (bias + (a*nx + b*x) * ny + (c*nx + d*x) * y)
//       >> (2 * REMAP16POINT5_FRAC_BITS)
// where x and y are the fractions unpacked from `frac`,
// nx = REMAP16POINT5_FRAC_MAX - x and ny = REMAP16POINT5_FRAC_MAX - y.
// Here the x blend is already widened to 32-bit lanes, with separate
// accumulators for the even (_b) and odd (_t) 16-bit input lanes.
| 47 | template <> | ||
| 48 | 9946 | inline svuint16_t interpolate_16point5<uint16_t>( | |
| 49 | svbool_t pg, svuint16_t frac, svuint16_t src_a, svuint16_t src_b, | ||
| 50 | svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { | ||
// Unpack the x fraction (low bits) and the y fraction (the bits above it).
| 51 | 9946 | svuint16_t xfrac = svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
| 52 | 19892 | svuint16_t yfrac = | |
| 53 | 19892 | svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), | |
| 54 | 9946 | svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
// Complementary weights for the opposite neighbours.
| 55 | 19892 | svuint16_t nxfrac = | |
| 56 | 9946 | svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac); | |
| 57 | 19892 | svuint16_t nyfrac = | |
| 58 | 9946 | svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac); | |
// Blend along x in 32-bit lanes: widening multiply for x * b (or x * d), then
// multiply-accumulate of the widened nx and a (or c).
| 59 | 19892 | svuint32_t line0_b = svmla_x(pg, svmullb(xfrac, src_b), svmovlb_u32(nxfrac), | |
| 60 | 9946 | svmovlb_u32(src_a)); | |
| 61 | 19892 | svuint32_t line0_t = svmla_x(pg, svmullt(xfrac, src_b), svmovlt_u32(nxfrac), | |
| 62 | 9946 | svmovlt_u32(src_a)); | |
| 63 | 19892 | svuint32_t line1_b = svmla_x(pg, svmullb(xfrac, src_d), svmovlb_u32(nxfrac), | |
| 64 | 9946 | svmovlb_u32(src_c)); | |
| 65 | 19892 | svuint32_t line1_t = svmla_x(pg, svmullt(xfrac, src_d), svmovlt_u32(nxfrac), | |
| 66 | 9946 | svmovlt_u32(src_c)); | |
| 67 | |||
// Blend along y: bias + line0 * ny + line1 * y, still in 32-bit lanes.
| 68 | 19892 | svuint32_t acc_b = | |
| 69 | 19892 | svmla_u32_x(pg, svmla_u32_x(pg, bias, line0_b, svmovlb_u32(nyfrac)), | |
| 70 | 9946 | line1_b, svmovlb_u32(yfrac)); | |
| 71 | 19892 | svuint32_t acc_t = | |
| 72 | 19892 | svmla_u32_x(pg, svmla_u32_x(pg, bias, line0_t, svmovlt_u32(nyfrac)), | |
| 73 | 9946 | line1_t, svmovlt_u32(yfrac)); | |
| 74 | |||
// Shift out both sets of fraction bits and narrow back to 16-bit lanes,
// writing the even and odd results back into their original positions.
| 75 | 19892 | return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, | |
| 76 | 2ULL * REMAP16POINT5_FRAC_BITS); | ||
| 77 | 9946 | } | |
| 78 | |||
// Primary template for the remap row processors that clamp map coordinates to
// the image bounds. Only declared; the specialisations visible in this file
// are for uint8_t and uint16_t.
| 79 | template <typename ScalarType> | ||
| 80 | class RemapS16Point5Replicate; | ||
| 81 | |||
// Remap row processor for uint8_t sources. Map coordinates are clamped to
// [0, src_width - 1] x [0, src_height - 1] before loading, one byte is gathered
// per coordinate at byte offset y * stride + x, and the four neighbours are
// blended with interpolate_16point5<uint8_t>.
| 82 | template <> | ||
| 83 | class RemapS16Point5Replicate<uint8_t> { | ||
| 84 | public: | ||
| 85 | using ScalarType = uint8_t; | ||
| 86 | using MapVecTraits = VecTraits<int16_t>; | ||
| 87 | using MapVectorType = typename MapVecTraits::VectorType; | ||
| 88 | using MapVector2Type = typename MapVecTraits::Vector2Type; | ||
| 89 | using FracVecTraits = VecTraits<uint16_t>; | ||
| 90 | using FracVectorType = typename FracVecTraits::VectorType; | ||
| 91 | |||
// Stores the source rows and binds three caller-owned vectors, which are
// overwritten here with broadcasts of the row stride, src_width - 1 and
// src_height - 1.
// NOTE(review): all three are broadcast as 16-bit values (the limits through
// static_cast<int16_t>); presumably the caller guarantees they fit - confirm.
| 92 | 134 | RemapS16Point5Replicate(Rows<const ScalarType> src_rows, size_t src_width, | |
| 93 | size_t src_height, svuint16_t& v_src_stride, | ||
| 94 | MapVectorType& v_x_max, MapVectorType& v_y_max) | ||
| 95 | 134 | : src_rows_{src_rows}, | |
| 96 | 134 | v_src_stride_{v_src_stride}, | |
| 97 | 134 | v_xmax_{v_x_max}, | |
| 98 | 134 | v_ymax_{v_y_max} { | |
| 99 | 134 | v_src_stride_ = svdup_u16(src_rows.stride()); | |
| 100 | 134 | v_xmax_ = svdup_s16(static_cast<int16_t>(src_width - 1)); | |
| 101 | 134 | v_ymax_ = svdup_s16(static_cast<int16_t>(src_height - 1)); | |
| 102 | 134 | } | |
| 103 | |||
// Produces `width` destination pixels for one row. LoopUnroll invokes the
// first callback with an all-true predicate and the second with a svwhilelt
// predicate covering `length` elements; both run vector_path, which gathers
// the four neighbours and then interpolates and stores. mapxy, mapfrac and dst
// are passed by value and advanced locally as the row is consumed.
| 104 | 158 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
| 105 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
| 106 | 158 | svuint16_t src_a, src_b, src_c, src_d; | |
| 107 | |||
// Bias added to the accumulators before the final shift in the interpolation.
| 108 | 158 | svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); | |
| 109 | 5432 | auto vector_path = [&](svbool_t pg, ptrdiff_t step) { | |
| 110 | 5274 | load_source(pg, step, mapxy, src_a, src_b, src_c, src_d); | |
| 111 | 10548 | interpolate_and_store(pg, step, mapfrac, dst, src_a, src_b, src_c, src_d, | |
| 112 | 5274 | bias); | |
| 113 | 5274 | }; | |
| 114 | |||
| 115 | 158 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
| 116 | 5358 | loop.unroll_once([&](size_t step) { | |
| 117 | 5200 | svbool_t pg = MapVecTraits::svptrue(); | |
| 118 | 5200 | vector_path(pg, static_cast<ptrdiff_t>(step)); | |
| 119 | 5200 | }); | |
| 120 | 232 | loop.remaining([&](size_t length, size_t step) { | |
| 121 | 74 | svbool_t pg = MapVecTraits::svwhilelt(step - length, step); | |
| 122 | 74 | vector_path(pg, static_cast<ptrdiff_t>(length)); | |
| 123 | 74 | }); | |
| 124 | 158 | } | |
| 125 | |||
| 126 | protected: | ||
// Gathers one byte per 32-bit lane (widened to 32 bits) at the given byte
// offsets from the start of the source, once for the even-lane offsets
// (offsets_b) and once for the odd-lane offsets (offsets_t), then interleaves
// the low 16-bit halves so each pixel lands in its original 16-bit lane.
| 127 | 21096 | svuint16_t gather_load_src(svbool_t pg_b, svuint32_t offsets_b, svbool_t pg_t, | |
| 128 | svuint32_t offsets_t) { | ||
| 129 | 42192 | svuint32_t src_b = | |
| 130 | 21096 | svldnt1ub_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); | |
| 131 | 42192 | svuint32_t src_t = | |
| 132 | 21096 | svldnt1ub_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); | |
| 133 | 63288 | return svtrn1_u16(svreinterpret_u16_u32(src_b), | |
| 134 | 21096 | svreinterpret_u16_u32(src_t)); | |
| 135 | 21096 | } | |
| 136 | |||
// Loads `step` interleaved (x, y) map coordinates, clamps them, and gathers the
// four neighbouring pixels: src_a = (x0, y0), src_b = (x1, y0),
// src_c = (x0, y1), src_d = (x1, y1). Advances mapxy by `step` afterwards.
// x1 and y1 are formed from the unclamped coordinate plus one (saturating add)
// and are then clamped to the same limits as x0 and y0.
| 137 | 5274 | void load_source(svbool_t pg, ptrdiff_t step, Columns<const int16_t>& mapxy, | |
| 138 | svuint16_t& src_a, svuint16_t& src_b, svuint16_t& src_c, | ||
| 139 | svuint16_t& src_d) { | ||
| 140 | 5274 | MapVector2Type xy = svld2_s16(pg, &mapxy[0]); | |
| 141 | |||
| 142 | // Clamp coordinates to within the dimensions of the source image | ||
| 143 | 10548 | svuint16_t x0 = svreinterpret_u16_s16( | |
| 144 | 5274 | svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 0), v_xmax_))); | |
| 145 | 10548 | svuint16_t y0 = svreinterpret_u16_s16( | |
| 146 | 5274 | svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 1), v_ymax_))); | |
| 147 | |||
| 148 | // x1 = x0 + 1, and clamp it too | ||
| 149 | 10548 | svuint16_t x1 = svreinterpret_u16_s16( | |
| 150 | 10548 | svmax_x(pg, svdup_n_s16(0), | |
| 151 | 5274 | svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 0), 1), v_xmax_))); | |
| 152 | |||
| 153 | 10548 | svuint16_t y1 = svreinterpret_u16_s16( | |
| 154 | 10548 | svmax_x(pg, svdup_n_s16(0), | |
| 155 | 5274 | svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 1), 1), v_ymax_))); | |
// 32-bit predicates for the gathers: of the `step` 16-bit lanes, ceil(step / 2)
// are even-numbered (pg_b) and floor(step / 2) are odd-numbered (pg_t).
| 156 | 5274 | svbool_t pg_b = svwhilelt_b32(int64_t{0}, (step + 1) / 2); | |
| 157 | 5274 | svbool_t pg_t = svwhilelt_b32(int64_t{0}, step / 2); | |
| 158 | |||
| 159 | // Calculate offsets from coordinates (y * stride + x) | ||
| 160 | 5274 | svuint32_t offsets_a_b = svmlalb_u32(svmovlb_u32(x0), y0, v_src_stride_); | |
| 161 | 5274 | svuint32_t offsets_a_t = svmlalt_u32(svmovlt_u32(x0), y0, v_src_stride_); | |
| 162 | 5274 | svuint32_t offsets_b_b = svmlalb_u32(svmovlb_u32(x1), y0, v_src_stride_); | |
| 163 | 5274 | svuint32_t offsets_b_t = svmlalt_u32(svmovlt_u32(x1), y0, v_src_stride_); | |
| 164 | 5274 | svuint32_t offsets_c_b = svmlalb_u32(svmovlb_u32(x0), y1, v_src_stride_); | |
| 165 | 5274 | svuint32_t offsets_c_t = svmlalt_u32(svmovlt_u32(x0), y1, v_src_stride_); | |
| 166 | 5274 | svuint32_t offsets_d_b = svmlalb_u32(svmovlb_u32(x1), y1, v_src_stride_); | |
| 167 | 5274 | svuint32_t offsets_d_t = svmlalt_u32(svmovlt_u32(x1), y1, v_src_stride_); | |
| 168 | |||
| 169 | // Load pixels from source | ||
| 170 | 5274 | src_a = gather_load_src(pg_b, offsets_a_b, pg_t, offsets_a_t); | |
| 171 | 5274 | src_b = gather_load_src(pg_b, offsets_b_b, pg_t, offsets_b_t); | |
| 172 | 5274 | src_c = gather_load_src(pg_b, offsets_c_b, pg_t, offsets_c_t); | |
| 173 | 5274 | src_d = gather_load_src(pg_b, offsets_d_b, pg_t, offsets_d_t); | |
| 174 | 5274 | mapxy += step; | |
| 175 | 5274 | } | |
| 176 | |||
// Loads the packed fractions, interpolates the four neighbours and stores the
// low byte of each 16-bit result lane to dst. Advances mapfrac and dst by
// `step` afterwards.
| 177 | 5274 | void interpolate_and_store(svbool_t pg, ptrdiff_t step, | |
| 178 | Columns<const uint16_t>& mapfrac, | ||
| 179 | Columns<ScalarType>& dst, svuint16_t src_a, | ||
| 180 | svuint16_t src_b, svuint16_t src_c, | ||
| 181 | svuint16_t src_d, svuint32_t bias) { | ||
| 182 | 5274 | FracVectorType frac = svld1_u16(pg, &mapfrac[0]); | |
| 183 | 10548 | svuint16_t result = interpolate_16point5<uint8_t>(pg, frac, src_a, src_b, | |
| 184 | 5274 | src_c, src_d, bias); | |
| 185 | 5274 | svst1b_u16(pg, &dst[0], result); | |
| 186 | 5274 | mapfrac += step; | |
| 187 | 5274 | dst += step; | |
| 188 | 5274 | } | |
| 189 | |||
| 190 | Rows<const ScalarType> src_rows_; | ||
| 191 | |||
| 192 | private: | ||
// References to caller-owned vectors, filled in by the constructor.
// NOTE(review): presumably held by reference because sizeless SVE vector types
// cannot be data members - confirm.
| 193 | svuint16_t& v_src_stride_; | ||
| 194 | MapVectorType& v_xmax_; | ||
| 195 | MapVectorType& v_ymax_; | ||
| 196 | }; // end of class RemapS16Point5Replicate<uint8_t> | ||
| 197 | |||
// Remap row processor for uint16_t sources. Map coordinates are clamped to
// [0, src_width - 1] x [0, src_height - 1] before loading, one 16-bit pixel is
// gathered per coordinate, and the four neighbours are blended with
// interpolate_16point5<uint16_t>. Offsets are first computed in elements
// (y * element_stride + x) and converted to bytes just before the gather.
| 198 | template <> | ||
| 199 | class RemapS16Point5Replicate<uint16_t> { | ||
| 200 | public: | ||
| 201 | using ScalarType = uint16_t; | ||
| 202 | using MapVecTraits = VecTraits<int16_t>; | ||
| 203 | using MapVectorType = typename MapVecTraits::VectorType; | ||
| 204 | using MapVector2Type = typename MapVecTraits::Vector2Type; | ||
| 205 | using FracVecTraits = VecTraits<uint16_t>; | ||
| 206 | using FracVectorType = typename FracVecTraits::VectorType; | ||
| 207 | |||
// Stores the source rows and binds three caller-owned vectors, which are
// overwritten here with broadcasts of the row stride divided by
// sizeof(ScalarType), src_width - 1 and src_height - 1.
// NOTE(review): all three are broadcast as 16-bit values (the limits through
// static_cast<int16_t>); presumably the caller guarantees they fit - confirm.
| 208 | 134 | RemapS16Point5Replicate(Rows<const ScalarType> src_rows, size_t src_width, | |
| 209 | size_t src_height, svuint16_t& v_src_stride, | ||
| 210 | MapVectorType& v_x_max, MapVectorType& v_y_max) | ||
| 211 | 134 | : src_rows_{src_rows}, | |
| 212 | 134 | v_src_element_stride_{v_src_stride}, | |
| 213 | 134 | v_xmax_{v_x_max}, | |
| 214 | 134 | v_ymax_{v_y_max} { | |
| 215 | 134 | v_src_element_stride_ = svdup_u16(src_rows.stride() / sizeof(ScalarType)); | |
| 216 | 134 | v_xmax_ = svdup_s16(static_cast<int16_t>(src_width - 1)); | |
| 217 | 134 | v_ymax_ = svdup_s16(static_cast<int16_t>(src_height - 1)); | |
| 218 | 134 | } | |
| 219 | |||
// Produces `width` destination pixels for one row. LoopUnroll invokes the
// first callback with an all-true predicate and the second with a svwhilelt
// predicate covering `length` elements; both run vector_path, which gathers
// the four neighbours and then interpolates and stores. mapxy, mapfrac and dst
// are passed by value and advanced locally as the row is consumed.
| 220 | 158 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
| 221 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
| 222 | 158 | svuint16_t src_a, src_b, src_c, src_d; | |
| 223 | |||
// Bias added to the accumulators before the final shift in the interpolation.
| 224 | 158 | svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); | |
| 225 | 5132 | auto vector_path = [&](svbool_t pg, ptrdiff_t step) { | |
| 226 | 4974 | load_source(pg, step, mapxy, src_a, src_b, src_c, src_d); | |
| 227 | 9948 | interpolate_and_store(pg, step, mapfrac, dst, src_a, src_b, src_c, src_d, | |
| 228 | 4974 | bias); | |
| 229 | 4974 | }; | |
| 230 | |||
| 231 | 158 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
| 232 | 5058 | loop.unroll_once([&](size_t step) { | |
| 233 | 4900 | svbool_t pg = MapVecTraits::svptrue(); | |
| 234 | 4900 | vector_path(pg, static_cast<ptrdiff_t>(step)); | |
| 235 | 4900 | }); | |
| 236 | 232 | loop.remaining([&](size_t length, size_t step) { | |
| 237 | 74 | svbool_t pg = MapVecTraits::svwhilelt(step - length, step); | |
| 238 | 74 | vector_path(pg, static_cast<ptrdiff_t>(length)); | |
| 239 | 74 | }); | |
| 240 | 158 | } | |
| 241 | |||
| 242 | protected: | ||
// Gathers one 16-bit pixel per 32-bit lane (widened to 32 bits), once for the
// even-lane offsets (offsets_b) and once for the odd-lane offsets (offsets_t),
// then interleaves the low 16-bit halves so each pixel lands in its original
// 16-bit lane. The incoming offsets are element indices; they are doubled here
// to obtain byte offsets.
| 243 | 19896 | svuint16_t gather_load_src(svbool_t pg_b, svuint32_t offsets_b, svbool_t pg_t, | |
| 244 | svuint32_t offsets_t) { | ||
| 245 | // Account for the size of the source type when calculating offset | ||
| 246 | 19896 | offsets_b = svlsl_n_u32_x(pg_b, offsets_b, 1); | |
| 247 | 19896 | offsets_t = svlsl_n_u32_x(pg_t, offsets_t, 1); | |
| 248 | |||
| 249 | 39792 | svuint32_t src_b = | |
| 250 | 19896 | svldnt1uh_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); | |
| 251 | 39792 | svuint32_t src_t = | |
| 252 | 19896 | svldnt1uh_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); | |
| 253 | 59688 | return svtrn1_u16(svreinterpret_u16_u32(src_b), | |
| 254 | 19896 | svreinterpret_u16_u32(src_t)); | |
| 255 | 19896 | } | |
| 256 | |||
// Loads `step` interleaved (x, y) map coordinates, clamps them, and gathers the
// four neighbouring pixels: src_a = (x0, y0), src_b = (x1, y0),
// src_c = (x0, y1), src_d = (x1, y1). Advances mapxy by `step` afterwards.
// x1 and y1 are formed from the unclamped coordinate plus one (saturating add)
// and are then clamped to the same limits as x0 and y0.
| 257 | 4974 | void load_source(svbool_t pg, ptrdiff_t step, Columns<const int16_t>& mapxy, | |
| 258 | svuint16_t& src_a, svuint16_t& src_b, svuint16_t& src_c, | ||
| 259 | svuint16_t& src_d) { | ||
| 260 | 4974 | MapVector2Type xy = svld2_s16(pg, &mapxy[0]); | |
| 261 | |||
| 262 | // Clamp coordinates to within the dimensions of the source image | ||
| 263 | 9948 | svuint16_t x0 = svreinterpret_u16_s16( | |
| 264 | 4974 | svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 0), v_xmax_))); | |
| 265 | 9948 | svuint16_t y0 = svreinterpret_u16_s16( | |
| 266 | 4974 | svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 1), v_ymax_))); | |
| 267 | |||
| 268 | // x1 = x0 + 1, and clamp it too | ||
| 269 | 9948 | svuint16_t x1 = svreinterpret_u16_s16( | |
| 270 | 9948 | svmax_x(pg, svdup_n_s16(0), | |
| 271 | 4974 | svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 0), 1), v_xmax_))); | |
| 272 | |||
| 273 | 9948 | svuint16_t y1 = svreinterpret_u16_s16( | |
| 274 | 9948 | svmax_x(pg, svdup_n_s16(0), | |
| 275 | 4974 | svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 1), 1), v_ymax_))); | |
// 32-bit predicates for the gathers: of the `step` 16-bit lanes, ceil(step / 2)
// are even-numbered (pg_b) and floor(step / 2) are odd-numbered (pg_t).
| 276 | 4974 | svbool_t pg_b = svwhilelt_b32(int64_t{0}, (step + 1) / 2); | |
| 277 | 4974 | svbool_t pg_t = svwhilelt_b32(int64_t{0}, step / 2); | |
| 278 | |||
| 279 | // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) | ||
| 280 | 9948 | svuint32_t offsets_a_b = | |
| 281 | 4974 | svmlalb_u32(svmovlb_u32(x0), y0, v_src_element_stride_); | |
| 282 | 9948 | svuint32_t offsets_a_t = | |
| 283 | 4974 | svmlalt_u32(svmovlt_u32(x0), y0, v_src_element_stride_); | |
| 284 | 9948 | svuint32_t offsets_b_b = | |
| 285 | 4974 | svmlalb_u32(svmovlb_u32(x1), y0, v_src_element_stride_); | |
| 286 | 9948 | svuint32_t offsets_b_t = | |
| 287 | 4974 | svmlalt_u32(svmovlt_u32(x1), y0, v_src_element_stride_); | |
| 288 | 9948 | svuint32_t offsets_c_b = | |
| 289 | 4974 | svmlalb_u32(svmovlb_u32(x0), y1, v_src_element_stride_); | |
| 290 | 9948 | svuint32_t offsets_c_t = | |
| 291 | 4974 | svmlalt_u32(svmovlt_u32(x0), y1, v_src_element_stride_); | |
| 292 | 9948 | svuint32_t offsets_d_b = | |
| 293 | 4974 | svmlalb_u32(svmovlb_u32(x1), y1, v_src_element_stride_); | |
| 294 | 9948 | svuint32_t offsets_d_t = | |
| 295 | 4974 | svmlalt_u32(svmovlt_u32(x1), y1, v_src_element_stride_); | |
| 296 | |||
| 297 | // Load pixels from source | ||
| 298 | 4974 | src_a = gather_load_src(pg_b, offsets_a_b, pg_t, offsets_a_t); | |
| 299 | 4974 | src_b = gather_load_src(pg_b, offsets_b_b, pg_t, offsets_b_t); | |
| 300 | 4974 | src_c = gather_load_src(pg_b, offsets_c_b, pg_t, offsets_c_t); | |
| 301 | 4974 | src_d = gather_load_src(pg_b, offsets_d_b, pg_t, offsets_d_t); | |
| 302 | 4974 | mapxy += step; | |
| 303 | 4974 | } | |
| 304 | |||
// Loads the packed fractions, interpolates the four neighbours and stores the
// full 16-bit result lanes to dst. Advances mapfrac and dst by `step`
// afterwards.
| 305 | 4974 | void interpolate_and_store(svbool_t pg, ptrdiff_t step, | |
| 306 | Columns<const uint16_t>& mapfrac, | ||
| 307 | Columns<ScalarType>& dst, svuint16_t src_a, | ||
| 308 | svuint16_t src_b, svuint16_t src_c, | ||
| 309 | svuint16_t src_d, svuint32_t bias) { | ||
| 310 | 4974 | FracVectorType frac = svld1_u16(pg, &mapfrac[0]); | |
| 311 | 9948 | svuint16_t result = interpolate_16point5<uint16_t>(pg, frac, src_a, src_b, | |
| 312 | 4974 | src_c, src_d, bias); | |
| 313 | 4974 | svst1_u16(pg, &dst[0], result); | |
| 314 | 4974 | mapfrac += step; | |
| 315 | 4974 | dst += step; | |
| 316 | 4974 | } | |
| 317 | |||
| 318 | Rows<const ScalarType> src_rows_; | ||
| 319 | |||
| 320 | private: | ||
// References to caller-owned vectors, filled in by the constructor.
// NOTE(review): presumably held by reference because sizeless SVE vector types
// cannot be data members - confirm.
| 321 | svuint16_t& v_src_element_stride_; | ||
| 322 | MapVectorType& v_xmax_; | ||
| 323 | MapVectorType& v_ymax_; | ||
| 324 | }; // end of class RemapS16Point5Replicate<uint16_t> | ||
| 325 | |||
// Primary template for the remap row processors that substitute a constant
// border value for coordinates outside the source image. Only declared; the
// specialisations visible in this file are for uint8_t and uint16_t.
| 326 | template <typename ScalarType> | ||
| 327 | class RemapS16Point5ConstantBorder; | ||
| 328 | |||
// Remap row processor for uint8_t sources where any neighbour whose coordinate
// lies outside the source image is replaced by a constant border value before
// the four neighbours are blended with interpolate_16point5<uint8_t>.
| 329 | template <> | ||
| 330 | class RemapS16Point5ConstantBorder<uint8_t> { | ||
| 331 | public: | ||
| 332 | using ScalarType = uint8_t; | ||
| 333 | |||
// Stores the source rows and binds four caller-owned vectors, which are
// overwritten here with broadcasts of the row stride, src_width, src_height
// and the value read from *border_value.
// NOTE(review): stride, width and height are broadcast as 16-bit values;
// presumably the caller guarantees they fit - confirm.
| 334 | 132 | RemapS16Point5ConstantBorder(Rows<const ScalarType> src_rows, | |
| 335 | size_t src_width, size_t src_height, | ||
| 336 | const ScalarType* border_value, | ||
| 337 | svuint16_t& v_src_stride, svuint16_t& v_width, | ||
| 338 | svuint16_t& v_height, svuint16_t& v_border) | ||
| 339 | 132 | : src_rows_{src_rows}, | |
| 340 | 132 | v_src_stride_{v_src_stride}, | |
| 341 | 132 | v_width_{v_width}, | |
| 342 | 132 | v_height_{v_height}, | |
| 343 | 132 | v_border_{v_border} { | |
| 344 | 132 | v_src_stride_ = svdup_u16(src_rows.stride()); | |
| 345 | 132 | v_width_ = svdup_u16(static_cast<uint16_t>(src_width)); | |
| 346 | 132 | v_height_ = svdup_u16(static_cast<uint16_t>(src_height)); | |
| 347 | 132 | v_border_ = svdup_u16(*border_value); | |
| 348 | 132 | } | |
| 349 | |||
// Produces `width` destination pixels for one row, svcnth() elements per
// iteration, with a svwhilelt predicate limiting each iteration to the
// elements that remain. The map coordinates are loaded as unsigned 16-bit
// values, so negative coordinates appear as large values and fail the unsigned
// range test in load_pixels_or_constant_border; x0 + 1 and y0 + 1 wrap modulo
// 2^16.
| 350 | 156 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
| 351 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
| 352 | 156 | svuint16_t one = svdup_n_u16(1); | |
| 353 | 156 | svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); | |
| 354 |
2/2✓ Branch 0 taken 156 times.
✓ Branch 1 taken 5272 times.
|
5428 | for (size_t i = 0; i < width; i += svcnth()) { |
| 355 | 5272 | svbool_t pg = svwhilelt_b16_u64(i, width); | |
| 356 | |||
| 357 | 10544 | svuint16x2_t xy = | |
| 358 | 10544 | svld2_u16(pg, reinterpret_cast<const uint16_t*>( | |
| 359 | 5272 | &mapxy[static_cast<ptrdiff_t>(i * 2)])); | |
| 360 | |||
| 361 | 5272 | svuint16_t x0 = svget2(xy, 0); | |
| 362 | 5272 | svuint16_t y0 = svget2(xy, 1); | |
| 363 | 5272 | svuint16_t x1 = svadd_x(pg, x0, one); | |
| 364 | 5272 | svuint16_t y1 = svadd_x(pg, y0, one); | |
| 365 | |||
// vXY holds the neighbour at (xX, yY): v00 = (x0, y0), v01 = (x0, y1),
// v10 = (x1, y0), v11 = (x1, y1).
| 366 | 10544 | svuint16_t v00 = load_pixels_or_constant_border( | |
| 367 | 5272 | src_rows_, v_src_stride_, v_width_, v_height_, v_border_, pg, x0, y0); | |
| 368 | 10544 | svuint16_t v01 = load_pixels_or_constant_border( | |
| 369 | 5272 | src_rows_, v_src_stride_, v_width_, v_height_, v_border_, pg, x0, y1); | |
| 370 | 10544 | svuint16_t v10 = load_pixels_or_constant_border( | |
| 371 | 5272 | src_rows_, v_src_stride_, v_width_, v_height_, v_border_, pg, x1, y0); | |
| 372 | 10544 | svuint16_t v11 = load_pixels_or_constant_border( | |
| 373 | 5272 | src_rows_, v_src_stride_, v_width_, v_height_, v_border_, pg, x1, y1); | |
| 374 | |||
// Argument order maps to src_a..src_d as (x0,y0), (x1,y0), (x0,y1), (x1,y1).
| 375 | 5272 | svuint16_t frac = svld1_u16(pg, &mapfrac[static_cast<ptrdiff_t>(i)]); | |
| 376 | 10544 | svuint16_t result = | |
| 377 | 5272 | interpolate_16point5<uint8_t>(pg, frac, v00, v10, v01, v11, bias); | |
| 378 | |||
// Store the low byte of each 16-bit result lane.
| 379 | 5272 | svst1b_u16(pg, &dst[static_cast<ptrdiff_t>(i)], result); | |
| 380 | 5272 | } | |
| 381 | 156 | } | |
| 382 | |||
| 383 | private: | ||
// Returns, per 16-bit lane, the source byte at byte offset y * stride + x when
// x < width and y < height (unsigned compares, under pg), and the border value
// otherwise. Only in-range lanes are gathered.
// The first five parameters have the same names as the data members and shadow
// them inside this function; process_row passes the members themselves.
| 384 | 21088 | svuint16_t load_pixels_or_constant_border(Rows<const ScalarType> src_rows_, | |
| 385 | svuint16_t& v_src_stride_, | ||
| 386 | svuint16_t& v_width_, | ||
| 387 | svuint16_t& v_height_, | ||
| 388 | svuint16_t& v_border_, svbool_t pg, | ||
| 389 | svuint16_t x, svuint16_t y) { | ||
| 390 | // Find whether coordinates are within the image dimensions. | ||
| 391 | 42176 | svbool_t in_range = svand_b_z(pg, svcmplt_u16(pg, x, v_width_), | |
| 392 | 21088 | svcmplt_u16(pg, y, v_height_)); | |
| 393 | |||
| 394 | // Calculate offsets from coordinates (y * stride + x) | ||
| 395 | 21088 | svuint32_t offsets_b = svmlalb_u32(svmovlb_u32(x), y, v_src_stride_); | |
| 396 | 21088 | svuint32_t offsets_t = svmlalt_u32(svmovlt_u32(x), y, v_src_stride_); | |
| 397 | |||
// Predicates for the two 32-bit gathers. in_range is reused directly for the
// even-numbered 16-bit lanes; svtrn2_b16 with an all-false second operand
// moves the odd-lane flags into the even positions for the odd-lane gather.
// NOTE(review): relies on 32-bit predicated operations reading only the flag
// at each even 16-bit position - confirm against the ACLE predicate layout.
| 398 | 21088 | svbool_t pg_b = in_range; | |
| 399 | 21088 | svbool_t pg_t = svtrn2_b16(in_range, svpfalse()); | |
| 400 | |||
| 401 | // Copy pixels from source | ||
| 402 | 42176 | svuint32_t result_b = | |
| 403 | 21088 | svld1ub_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); | |
| 404 | 42176 | svuint32_t result_t = | |
| 405 | 21088 | svld1ub_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); | |
| 406 | |||
// Interleave the low 16-bit halves so each pixel lands in its original lane.
| 407 | 42176 | svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), | |
| 408 | 21088 | svreinterpret_u16_u32(result_t)); | |
| 409 | |||
// Out-of-range lanes take the border value.
| 410 | 42176 | return svsel(in_range, result, v_border_); | |
| 411 | 21088 | } | |
| 412 | |||
// Source rows plus references to caller-owned vectors filled in by the
// constructor.
| 413 | Rows<const ScalarType> src_rows_; | ||
| 414 | svuint16_t& v_src_stride_; | ||
| 415 | svuint16_t& v_width_; | ||
| 416 | svuint16_t& v_height_; | ||
| 417 | svuint16_t& v_border_; | ||
| 418 | }; // end of class RemapS16Point5ConstantBorder<uint8_t> | ||
| 419 | |||
// Remap row processor for uint16_t sources where any neighbour whose
// coordinate lies outside the source image is replaced by a constant border
// value before the four neighbours are blended with
// interpolate_16point5<uint16_t>.
| 420 | template <> | ||
| 421 | class RemapS16Point5ConstantBorder<uint16_t> { | ||
| 422 | public: | ||
| 423 | using ScalarType = uint16_t; | ||
| 424 | |||
// Stores the source rows and binds four caller-owned vectors, which are
// overwritten here with broadcasts of the row stride divided by
// sizeof(ScalarType), src_width, src_height and the value read from
// *border_value.
// NOTE(review): stride, width and height are broadcast as 16-bit values;
// presumably the caller guarantees they fit - confirm.
| 425 | 132 | RemapS16Point5ConstantBorder(Rows<const ScalarType> src_rows, | |
| 426 | size_t src_width, size_t src_height, | ||
| 427 | const ScalarType* border_value, | ||
| 428 | svuint16_t& v_src_stride, svuint16_t& v_width, | ||
| 429 | svuint16_t& v_height, svuint16_t& v_border) | ||
| 430 | 132 | : src_rows_{src_rows}, | |
| 431 | 132 | v_src_element_stride_{v_src_stride}, | |
| 432 | 132 | v_width_{v_width}, | |
| 433 | 132 | v_height_{v_height}, | |
| 434 | 132 | v_border_{v_border} { | |
| 435 | 132 | v_src_element_stride_ = svdup_u16(src_rows.stride() / sizeof(ScalarType)); | |
| 436 | 132 | v_width_ = svdup_u16(static_cast<uint16_t>(src_width)); | |
| 437 | 132 | v_height_ = svdup_u16(static_cast<uint16_t>(src_height)); | |
| 438 | 132 | v_border_ = svdup_u16(*border_value); | |
| 439 | 132 | } | |
| 440 | |||
// Produces `width` destination pixels for one row, svcnth() elements per
// iteration, with a svwhilelt predicate limiting each iteration to the
// elements that remain. The map coordinates are loaded as unsigned 16-bit
// values, so negative coordinates appear as large values and fail the unsigned
// range test in load_pixels_or_constant_border; x0 + 1 and y0 + 1 wrap modulo
// 2^16.
| 441 | 156 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
| 442 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
| 443 | 156 | svuint16_t one = svdup_n_u16(1); | |
| 444 | 156 | svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); | |
| 445 |
2/2✓ Branch 0 taken 156 times.
✓ Branch 1 taken 4972 times.
|
5128 | for (size_t i = 0; i < width; i += svcnth()) { |
| 446 | 4972 | svbool_t pg = svwhilelt_b16_u64(i, width); | |
| 447 | |||
| 448 | 9944 | svuint16x2_t xy = | |
| 449 | 9944 | svld2_u16(pg, reinterpret_cast<const uint16_t*>( | |
| 450 | 4972 | &mapxy[static_cast<ptrdiff_t>(i * 2)])); | |
| 451 | |||
| 452 | 4972 | svuint16_t x0 = svget2(xy, 0); | |
| 453 | 4972 | svuint16_t y0 = svget2(xy, 1); | |
| 454 | 4972 | svuint16_t x1 = svadd_x(pg, x0, one); | |
| 455 | 4972 | svuint16_t y1 = svadd_x(pg, y0, one); | |
| 456 | |||
// vXY holds the neighbour at (xX, yY): v00 = (x0, y0), v01 = (x0, y1),
// v10 = (x1, y0), v11 = (x1, y1).
| 457 | 9944 | svuint16_t v00 = load_pixels_or_constant_border( | |
| 458 | 4972 | src_rows_, v_src_element_stride_, v_width_, v_height_, v_border_, pg, | |
| 459 | 4972 | x0, y0); | |
| 460 | 9944 | svuint16_t v01 = load_pixels_or_constant_border( | |
| 461 | 4972 | src_rows_, v_src_element_stride_, v_width_, v_height_, v_border_, pg, | |
| 462 | 4972 | x0, y1); | |
| 463 | 9944 | svuint16_t v10 = load_pixels_or_constant_border( | |
| 464 | 4972 | src_rows_, v_src_element_stride_, v_width_, v_height_, v_border_, pg, | |
| 465 | 4972 | x1, y0); | |
| 466 | 9944 | svuint16_t v11 = load_pixels_or_constant_border( | |
| 467 | 4972 | src_rows_, v_src_element_stride_, v_width_, v_height_, v_border_, pg, | |
| 468 | 4972 | x1, y1); | |
| 469 | |||
// Argument order maps to src_a..src_d as (x0,y0), (x1,y0), (x0,y1), (x1,y1).
| 470 | 4972 | svuint16_t frac = svld1_u16(pg, &mapfrac[static_cast<ptrdiff_t>(i)]); | |
| 471 | 9944 | svuint16_t result = | |
| 472 | 4972 | interpolate_16point5<uint16_t>(pg, frac, v00, v10, v01, v11, bias); | |
| 473 | |||
| 474 | 4972 | svst1_u16(pg, &dst[static_cast<ptrdiff_t>(i)], result); | |
| 475 | 4972 | } | |
| 476 | 156 | } | |
| 477 | |||
| 478 | private: | ||
// Returns, per 16-bit lane, the 16-bit source pixel at element index
// y * element_stride + x when x < width and y < height (unsigned compares,
// under pg), and the border value otherwise. Only in-range lanes are gathered.
// The first five parameters have the same names as the data members and shadow
// them inside this function; process_row passes the members themselves.
| 479 | 19888 | svuint16_t load_pixels_or_constant_border(Rows<const ScalarType> src_rows_, | |
| 480 | svuint16_t& v_src_element_stride_, | ||
| 481 | svuint16_t& v_width_, | ||
| 482 | svuint16_t& v_height_, | ||
| 483 | svuint16_t& v_border_, svbool_t pg, | ||
| 484 | svuint16_t x, svuint16_t y) { | ||
| 485 | // Find whether coordinates are within the image dimensions. | ||
| 486 | 39776 | svbool_t in_range = svand_b_z(pg, svcmplt_u16(pg, x, v_width_), | |
| 487 | 19888 | svcmplt_u16(pg, y, v_height_)); | |
| 488 | |||
| 489 | // Calculate offsets from coordinates (y * stride/sizeof(ScalarType) + x) | ||
| 490 | 39776 | svuint32_t offsets_b = | |
| 491 | 19888 | svmlalb_u32(svmovlb_u32(x), y, v_src_element_stride_); | |
| 492 | 39776 | svuint32_t offsets_t = | |
| 493 | 19888 | svmlalt_u32(svmovlt_u32(x), y, v_src_element_stride_); | |
| 494 | |||
// Predicates for the two 32-bit gathers. in_range is reused directly for the
// even-numbered 16-bit lanes; svtrn2_b16 with an all-false second operand
// moves the odd-lane flags into the even positions for the odd-lane gather.
// NOTE(review): relies on 32-bit predicated operations reading only the flag
// at each even 16-bit position - confirm against the ACLE predicate layout.
| 495 | 19888 | svbool_t pg_b = in_range; | |
| 496 | 19888 | svbool_t pg_t = svtrn2_b16(in_range, svpfalse()); | |
| 497 | |||
| 498 | // Account for the size of the source type when calculating offset | ||
| 499 | 19888 | offsets_b = svlsl_n_u32_x(pg_b, offsets_b, 1); | |
| 500 | 19888 | offsets_t = svlsl_n_u32_x(pg_t, offsets_t, 1); | |
| 501 | |||
| 502 | // Copy pixels from source | ||
| 503 | 39776 | svuint32_t result_b = | |
| 504 | 19888 | svld1uh_gather_u32offset_u32(pg_b, &src_rows_[0], offsets_b); | |
| 505 | 39776 | svuint32_t result_t = | |
| 506 | 19888 | svld1uh_gather_u32offset_u32(pg_t, &src_rows_[0], offsets_t); | |
| 507 | |||
// Interleave the low 16-bit halves so each pixel lands in its original lane.
| 508 | 39776 | svuint16_t result = svtrn1_u16(svreinterpret_u16_u32(result_b), | |
| 509 | 19888 | svreinterpret_u16_u32(result_t)); | |
| 510 | |||
// Out-of-range lanes take the border value.
| 511 | 39776 | return svsel(in_range, result, v_border_); | |
| 512 | 19888 | } | |
| 513 | |||
// Source rows plus references to caller-owned vectors filled in by the
// constructor.
| 514 | Rows<const ScalarType> src_rows_; | ||
| 515 | svuint16_t& v_src_element_stride_; | ||
| 516 | svuint16_t& v_width_; | ||
| 517 | svuint16_t& v_height_; | ||
| 518 | svuint16_t& v_border_; | ||
| 519 | }; // end of class RemapS16Point5ConstantBorder<uint16_t> | ||
| 520 | |||
// Primary template for the four-channel variant of the coordinate-clamping
// remap row processor. Only declared; a uint8_t specialisation follows.
| 521 | template <typename ScalarType> | ||
| 522 | class RemapS16Point5Replicate4ch; | ||
| 523 | |||
| 524 | template <> | ||
| 525 | class RemapS16Point5Replicate4ch<uint8_t> { | ||
| 526 | public: | ||
| 527 | using ScalarType = uint8_t; | ||
| 528 | using MapVecTraits = VecTraits<int16_t>; | ||
| 529 | using MapVectorType = typename MapVecTraits::VectorType; | ||
| 530 | using MapVector2Type = typename MapVecTraits::Vector2Type; | ||
| 531 | using FracVecTraits = VecTraits<uint16_t>; | ||
| 532 | using FracVectorType = typename FracVecTraits::VectorType; | ||
| 533 | |||
| 534 | 132 | RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width, | |
| 535 | size_t src_height, svuint16_t& v_src_stride, | ||
| 536 | MapVectorType& v_x_max, MapVectorType& v_y_max) | ||
| 537 | 132 | : src_rows_{src_rows}, | |
| 538 | 132 | v_src_stride_{v_src_stride}, | |
| 539 | 132 | v_xmax_{v_x_max}, | |
| 540 | 132 | v_ymax_{v_y_max} { | |
| 541 | 132 | v_src_stride_ = svdup_u16(src_rows.stride()); | |
| 542 | 132 | v_xmax_ = svdup_s16(static_cast<int16_t>(src_width - 1)); | |
| 543 | 132 | v_ymax_ = svdup_s16(static_cast<int16_t>(src_height - 1)); | |
| 544 | 132 | } | |
| 545 | |||
| 546 | 156 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
| 547 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
| 548 | 156 | LoopUnroll loop{width, MapVecTraits::num_lanes()}; | |
| 549 | 5354 | loop.unroll_once([&](size_t step) { | |
| 550 | 5198 | svbool_t pg = MapVecTraits::svptrue(); | |
| 551 | 5198 | vector_path(pg, mapxy, mapfrac, dst, static_cast<ptrdiff_t>(step)); | |
| 552 | 5198 | }); | |
| 553 | 230 | loop.remaining([&](size_t length, size_t step) { | |
| 554 | 74 | svbool_t pg = MapVecTraits::svwhilelt(step - length, step); | |
| 555 | 74 | vector_path(pg, mapxy, mapfrac, dst, static_cast<ptrdiff_t>(length)); | |
| 556 | 74 | }); | |
| 557 | 156 | } | |
| 558 | |||
| 559 | 5272 | void vector_path(svbool_t pg, Columns<const int16_t>& mapxy, | |
| 560 | Columns<const uint16_t>& mapfrac, Columns<ScalarType>& dst, | ||
| 561 | ptrdiff_t step) { | ||
| 562 | 5272 | MapVector2Type xy = svld2_s16(pg, &mapxy[0]); | |
| 563 | 5272 | svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); | |
| 564 | |||
| 565 | // Clamp coordinates to within the dimensions of the source image | ||
| 566 | 10544 | svuint16_t x0 = svreinterpret_u16_s16( | |
| 567 | 5272 | svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 0), v_xmax_))); | |
| 568 | 10544 | svuint16_t y0 = svreinterpret_u16_s16( | |
| 569 | 5272 | svmax_x(pg, svdup_n_s16(0), svmin_x(pg, svget2(xy, 1), v_ymax_))); | |
| 570 | |||
| 571 | // x1 = x0 + 1, and clamp it too | ||
| 572 | 10544 | svuint16_t x1 = svreinterpret_u16_s16( | |
| 573 | 10544 | svmax_x(pg, svdup_n_s16(0), | |
| 574 | 5272 | svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 0), 1), v_xmax_))); | |
| 575 | |||
| 576 | 10544 | svuint16_t y1 = svreinterpret_u16_s16( | |
| 577 | 10544 | svmax_x(pg, svdup_n_s16(0), | |
| 578 | 5272 | svmin_x(pg, svqadd_n_s16_x(pg, svget2(xy, 1), 1), v_ymax_))); | |
| 579 | 5272 | svbool_t pg_b = svwhilelt_b32(int64_t{0}, (step + 1) / 2); | |
| 580 | 5272 | svbool_t pg_t = svwhilelt_b32(int64_t{0}, step / 2); | |
| 581 | |||
| 582 | // Calculate offsets from coordinates (y * stride + x), x multiplied by 4 | ||
| 583 | // channels | ||
| 584 | 26360 | auto load_4ch_b = [&](svuint16_t x, svuint16_t y) { | |
| 585 | 21088 | return svreinterpret_u8_u32(svld1_gather_u32offset_u32( | |
| 586 | 21088 | pg_b, reinterpret_cast<const uint32_t*>(&src_rows_[0]), | |
| 587 | 21088 | svmlalb_u32(svshllb_n_u32(x, 2), y, v_src_stride_))); | |
| 588 | }; | ||
| 589 | 26360 | auto load_4ch_t = [&](svuint16_t x, svuint16_t y) { | |
| 590 | 21088 | return svreinterpret_u8_u32(svld1_gather_u32offset_u32( | |
| 591 | 21088 | pg_t, reinterpret_cast<const uint32_t*>(&src_rows_[0]), | |
| 592 | 21088 | svmlalt_u32(svshllt_n_u32(x, 2), y, v_src_stride_))); | |
| 593 | }; | ||
| 594 | |||
| 595 | 5272 | FracVectorType frac = svld1_u16(pg, &mapfrac[0]); | |
| 596 | 10544 | svuint16_t xfrac = | |
| 597 | 5272 | svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
| 598 | 10544 | svuint16_t yfrac = | |
| 599 | 10544 | svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), | |
| 600 | 5272 | svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
| 601 | |||
| 602 | 26360 | auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac, | |
| 603 | svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b, | ||
| 604 | svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { | ||
| 605 | 42176 | svuint16_t line0 = svmla_x( | |
| 606 | 21088 | svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_b), nxfrac, src_a); | |
| 607 | 42176 | svuint16_t line1 = svmla_x( | |
| 608 | 21088 | svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_d), nxfrac, src_c); | |
| 609 | |||
| 610 | 21088 | svuint32_t acc_b = svmlalb_u32(bias, line0, nyfrac); | |
| 611 | 21088 | svuint32_t acc_t = svmlalt_u32(bias, line0, nyfrac); | |
| 612 | 21088 | acc_b = svmlalb_u32(acc_b, line1, yfrac); | |
| 613 | 21088 | acc_t = svmlalt_u32(acc_t, line1, yfrac); | |
| 614 | |||
| 615 | 42176 | return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, | |
| 616 | 2ULL * REMAP16POINT5_FRAC_BITS); | ||
| 617 | 21088 | }; | |
| 618 | |||
| 619 | // bottom part | ||
| 620 | 5272 | svuint8_t a = load_4ch_b(x0, y0); | |
| 621 | 5272 | svuint8_t b = load_4ch_b(x1, y0); | |
| 622 | 5272 | svuint8_t c = load_4ch_b(x0, y1); | |
| 623 | 5272 | svuint8_t d = load_4ch_b(x1, y1); | |
| 624 | // from xfrac, we need the bottom part twice | ||
| 625 | 5272 | svuint16_t xfrac2b = svtrn1_u16(xfrac, xfrac); | |
| 626 | 10544 | svuint16_t nxfrac2b = svsub_u16_x( | |
| 627 | 5272 | svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2b); | |
| 628 | 5272 | svuint16_t yfrac2b = svtrn1_u16(yfrac, yfrac); | |
| 629 | 10544 | svuint16_t nyfrac2b = svsub_u16_x( | |
| 630 | 5272 | svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2b); | |
| 631 | |||
| 632 | // a,b,c,d looks like 12341234...(four channels) | ||
| 633 | // bottom is 1313... | ||
| 634 | 10544 | svuint16_t res_bb = | |
| 635 | 10544 | lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlb_u16(a), | |
| 636 | 5272 | svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias); | |
| 637 | // top is 2424... | ||
| 638 | 10544 | svuint16_t res_bt = | |
| 639 | 10544 | lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlt_u16(a), | |
| 640 | 5272 | svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias); | |
| 641 | 10544 | svuint8_t res_b = | |
| 642 | 5272 | svtrn1_u8(svreinterpret_u8_u16(res_bb), svreinterpret_u8_u16(res_bt)); | |
| 643 | |||
| 644 | // top part | ||
| 645 | 5272 | a = load_4ch_t(x0, y0); | |
| 646 | 5272 | b = load_4ch_t(x1, y0); | |
| 647 | 5272 | c = load_4ch_t(x0, y1); | |
| 648 | 5272 | d = load_4ch_t(x1, y1); | |
| 649 | // from xfrac, we need the top part twice | ||
| 650 | 5272 | svuint16_t xfrac2t = svtrn2_u16(xfrac, xfrac); | |
| 651 | 10544 | svuint16_t nxfrac2t = svsub_u16_x( | |
| 652 | 5272 | svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2t); | |
| 653 | 5272 | svuint16_t yfrac2t = svtrn2_u16(yfrac, yfrac); | |
| 654 | 10544 | svuint16_t nyfrac2t = svsub_u16_x( | |
| 655 | 5272 | svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2t); | |
| 656 | |||
| 657 | // a,b,c,d looks like 12341234...(four channels) | ||
| 658 | // bottom is 1313... | ||
| 659 | 10544 | svuint16_t res_tb = | |
| 660 | 10544 | lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlb_u16(a), | |
| 661 | 5272 | svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias); | |
| 662 | // top is 2424... | ||
| 663 | 10544 | svuint16_t res_tt = | |
| 664 | 10544 | lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlt_u16(a), | |
| 665 | 5272 | svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias); | |
| 666 | 10544 | svuint8_t res_t = | |
| 667 | 5272 | svtrn1_u8(svreinterpret_u8_u16(res_tb), svreinterpret_u8_u16(res_tt)); | |
| 668 | |||
| 669 | 5272 | svbool_t pg_low = svwhilelt_b32_u64(0L, static_cast<size_t>(step)); | |
| 670 | 5272 | svbool_t pg_high = svwhilelt_b32_u64(svcntw(), static_cast<size_t>(step)); | |
| 671 | 10544 | svuint32_t res_low = | |
| 672 | 5272 | svzip1_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t)); | |
| 673 | 10544 | svuint32_t res_high = | |
| 674 | 5272 | svzip2_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t)); | |
| 675 | 5272 | mapxy += step; | |
| 676 | 5272 | svst1_u32(pg_low, reinterpret_cast<uint32_t*>(&dst[0]), res_low); | |
| 677 | 10544 | svst1_u32(pg_high, reinterpret_cast<uint32_t*>(&dst[0]) + svcntw(), | |
| 678 | 5272 | res_high); | |
| 679 | 5272 | mapfrac += step; | |
| 680 | 5272 | dst += step; | |
| 681 | 5272 | } | |
| 682 | |||
| 683 | Rows<const ScalarType> src_rows_; | ||
| 684 | |||
| 685 | private: | ||
| 686 | svuint16_t& v_src_stride_; | ||
| 687 | MapVectorType& v_xmax_; | ||
| 688 | MapVectorType& v_ymax_; | ||
| 689 | }; // end of class RemapS16Point5Replicate4ch<uint8_t> | ||
| 690 | |||
// Remap with replicated borders for 4-channel images of 16-bit elements.
// Sample coordinates are clamped into the source image, the four neighbouring
// pixels are each gathered as one 64-bit value (4 x 16 bits) and blended with
// the fixed-point fractions read from mapfrac.
| 691 | template <> | ||
| 692 | class RemapS16Point5Replicate4ch<uint16_t> { | ||
| 693 | public: | ||
| 694 | using ScalarType = uint16_t; | ||
| 695 | |||
// Keeps references to the caller-provided vectors and fills them with the
// broadcast source stride and the largest valid x / y coordinate
// (src_width - 1, src_height - 1).
| 696 | 132 | RemapS16Point5Replicate4ch(Rows<const ScalarType> src_rows, size_t src_width, | |
| 697 | size_t src_height, svuint32_t& v_src_stride, | ||
| 698 | svint32_t& v_x_max, svint32_t& v_y_max) | ||
| 699 | 132 | : src_rows_{src_rows}, | |
| 700 | 132 | v_src_stride_{v_src_stride}, | |
| 701 | 132 | v_xmax_{v_x_max}, | |
| 702 | 132 | v_ymax_{v_y_max} { | |
| 703 | 132 | v_src_stride_ = svdup_u32(src_rows.stride()); | |
| 704 | 132 | v_xmax_ = svdup_s32(static_cast<int32_t>(src_width - 1)); | |
| 705 | 132 | v_ymax_ = svdup_s32(static_cast<int32_t>(src_height - 1)); | |
| 706 | 132 | } | |
| 707 | |||
// Processes one destination row of `width` pixels, svcntw() pixels per vector
// step. Full steps run with all-true predicates; the remainder derives every
// predicate it needs from `length`.
| 708 | 156 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
| 709 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
| 710 | 156 | LoopUnroll loop{width, svcntw()}; | |
| 711 | 10006 | loop.unroll_once([&](size_t step) { | |
| 712 | 19700 | vector_path(svptrue_b32(), svptrue_b64(), svptrue_b64(), svptrue_b64(), | |
| 713 | 9850 | svptrue_b64(), mapxy, mapfrac, dst, | |
| 714 | 9850 | static_cast<ptrdiff_t>(step)); | |
| 715 | 9850 | }); | |
| 716 | 216 | loop.remaining([&](size_t length, size_t step) { | |
// pg: leading 32-bit lanes active, one per remaining pixel
// (NOTE(review): assumes length does not exceed the lane count - confirm
// against LoopUnroll).
| 717 | 60 | svbool_t pg = svwhilelt_b32_u64(step, step + length); | |
// Even / odd lanes of pg as 64-bit predicates, used for the gathers.
| 718 | 60 | svbool_t pg64_b = svtrn1_b32(pg, svpfalse()); | |
| 719 | 60 | svbool_t pg64_t = svtrn2_b32(pg, svpfalse()); | |
// Lower / upper half of pg widened to 64-bit predicates, used for the stores.
| 720 | 60 | svbool_t pg_low = svzip1_b32(pg, svpfalse()); | |
| 721 | 60 | svbool_t pg_high = svzip2_b32(pg, svpfalse()); | |
| 722 | 120 | vector_path(pg, pg64_b, pg64_t, pg_low, pg_high, mapxy, mapfrac, dst, | |
| 723 | 60 | static_cast<ptrdiff_t>(length)); | |
| 724 | 60 | }); | |
| 725 | 156 | } | |
| 726 | |||
// Handles `step` pixels: loads coordinates and fractions, gathers the 2x2
// neighbourhood of every pixel, interpolates, stores the result and advances
// mapxy, mapfrac and dst by `step`.
//   pg               32-bit predicate, one lane per pixel
//   pg64_b / pg64_t  64-bit predicates for the pixels in even / odd lanes of pg
//   pg_low / pg_high 64-bit predicates for the first / second stored vector
| 727 | 9910 | void vector_path(svbool_t pg, svbool_t pg64_b, svbool_t pg64_t, | |
| 728 | svbool_t pg_low, svbool_t pg_high, | ||
| 729 | Columns<const int16_t>& mapxy, | ||
| 730 | Columns<const uint16_t>& mapfrac, Columns<ScalarType>& dst, | ||
| 731 | ptrdiff_t step) { | ||
| 732 | // Load one vector of xy: even coordinates are x, odd are y | ||
| 733 | 19820 | svint16_t xy = svreinterpret_s16_u32( | |
| 734 | 9910 | svld1_u32(pg, reinterpret_cast<const uint32_t*>(&mapxy[0]))); | |
| 735 | 9910 | svint32_t x = svmovlb(xy); | |
| 736 | 9910 | svint32_t y = svmovlt(xy); | |
| 737 | // Clamp coordinates to within the dimensions of the source image | ||
| 738 | 19820 | svuint32_t x0 = svreinterpret_u32_s32( | |
| 739 | 9910 | svmax_x(pg, svdup_n_s32(0), svmin_x(pg, x, v_xmax_))); | |
| 740 | 19820 | svuint32_t y0 = svreinterpret_u32_s32( | |
| 741 | 9910 | svmax_x(pg, svdup_n_s32(0), svmin_x(pg, y, v_ymax_))); | |
| 742 | |||
| 743 | // x1 = x0 + 1, and clamp it too | ||
| 744 | 19820 | svuint32_t x1 = svreinterpret_u32_s32(svmax_x( | |
| 745 | 9910 | pg, svdup_n_s32(0), svmin_x(pg, svqadd_n_s32_x(pg, x, 1), v_xmax_))); | |
| 746 | 19820 | svuint32_t y1 = svreinterpret_u32_s32(svmax_x( | |
| 747 | 9910 | pg, svdup_n_s32(0), svmin_x(pg, svqadd_n_s32_x(pg, y, 1), v_ymax_))); | |
| 748 | |||
// Gathers one whole pixel (four 16-bit channels) per active 64-bit lane;
// `offsets` are added to the address of src_rows_[0].
| 749 | 89190 | auto load_4ch = [&](svbool_t pg, svuint64_t offsets) { | |
| 750 | 79280 | return svreinterpret_u16_u64(svld1_gather_u64offset_u64( | |
| 751 | 79280 | pg, reinterpret_cast<const uint64_t*>(&src_rows_[0]), offsets)); | |
| 752 | }; | ||
| 753 | |||
| 754 | 9910 | svuint16_t xfrac, yfrac, nxfrac, nyfrac; | |
| 755 | { | ||
| 756 | // Fractions are loaded into even lanes | ||
| 757 | 9910 | svuint16_t rawfrac = svreinterpret_u16_u32(svld1uh_u32(pg, &mapfrac[0])); | |
| 758 | |||
| 759 | // Fractions are doubled, 00112233... (will be doubled again later) | ||
| 760 | 9910 | svuint16_t frac = svtrn1(rawfrac, rawfrac); | |
| 761 | |||
// x fraction: frac masked with FRAC_MAX - 1; y fraction: frac shifted right
// by FRAC_BITS, then the same mask. nxfrac / nyfrac are FRAC_MAX minus those.
| 762 | 9910 | xfrac = svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
| 763 | 19820 | yfrac = svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), | |
| 764 | 9910 | svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
| 765 | 9910 | nxfrac = svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac); | |
| 766 | 9910 | nyfrac = svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac); | |
| 767 | 9910 | } | |
| 768 | |||
// Rounding term added before the final right shift.
| 769 | 9910 | svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); | |
| 770 | |||
// Bilinear blend in 32-bit arithmetic, separately for even (_b) and odd (_t)
// 16-bit lanes:
//   line0 = xfrac * src_b + nxfrac * src_a
//   line1 = xfrac * src_d + nxfrac * src_c
//   acc   = bias + line0 * nyfrac + line1 * yfrac
// The accumulators are shifted right by 2 * FRAC_BITS and narrowed back into
// interleaved 16-bit lanes.
| 771 | 29730 | auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac, | |
| 772 | svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b, | ||
| 773 | svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { | ||
| 774 | 19820 | svuint32_t line0_b = svmlalb(svmullb(xfrac, src_b), nxfrac, src_a); | |
| 775 | 19820 | svuint32_t line0_t = svmlalt(svmullt(xfrac, src_b), nxfrac, src_a); | |
| 776 | 19820 | svuint32_t line1_b = svmlalb(svmullb(xfrac, src_d), nxfrac, src_c); | |
| 777 | 19820 | svuint32_t line1_t = svmlalt(svmullt(xfrac, src_d), nxfrac, src_c); | |
| 778 | |||
| 779 | 39640 | svuint32_t acc_b = | |
| 780 | 19820 | svmla_u32_x(svptrue_b32(), bias, line0_b, svmovlb_u32(nyfrac)); | |
| 781 | 39640 | svuint32_t acc_t = | |
| 782 | 19820 | svmla_u32_x(svptrue_b32(), bias, line0_t, svmovlt_u32(nyfrac)); | |
| 783 | 19820 | acc_b = svmla_u32_x(svptrue_b32(), acc_b, line1_b, svmovlb_u32(yfrac)); | |
| 784 | 19820 | acc_t = svmla_u32_x(svptrue_b32(), acc_t, line1_t, svmovlt_u32(yfrac)); | |
| 785 | |||
| 786 | 39640 | return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, | |
| 787 | 2ULL * REMAP16POINT5_FRAC_BITS); | ||
| 788 | 19820 | }; | |
| 789 | |||
| 790 | // Data is 4x16 = 64 bits, twice as wide as the widened coords (32-bit) | ||
| 791 | // Calculation is done in 2 parts, top and bottom | ||
| 792 | 9910 | svuint16_t res_b, res_t; | |
| 793 | |||
| 794 | { // bottom | ||
// Offset of a pixel = x * 8 (four 16-bit channels) + y * stride, widened to
// 64 bits from the even 32-bit coordinate lanes.
| 795 | 9910 | svuint64_t x0w = svshllb_n_u64(x0, 3); | |
| 796 | 9910 | svuint64_t x1w = svshllb_n_u64(x1, 3); | |
| 797 | 9910 | svuint64_t ys0w = svmullb_u64(y0, v_src_stride_); | |
| 798 | 9910 | svuint64_t ys1w = svmullb_u64(y1, v_src_stride_); | |
| 799 | 9910 | svuint64_t offsets_a = svadd_x(pg64_b, x0w, ys0w); | |
| 800 | 9910 | svuint64_t offsets_b = svadd_x(pg64_b, x1w, ys0w); | |
| 801 | 9910 | svuint64_t offsets_c = svadd_x(pg64_b, x0w, ys1w); | |
| 802 | 9910 | svuint64_t offsets_d = svadd_x(pg64_b, x1w, ys1w); | |
| 803 | |||
| 804 | 9910 | svuint16_t a = load_4ch(pg64_b, offsets_a); | |
| 805 | 9910 | svuint16_t b = load_4ch(pg64_b, offsets_b); | |
| 806 | 9910 | svuint16_t c = load_4ch(pg64_b, offsets_c); | |
| 807 | 9910 | svuint16_t d = load_4ch(pg64_b, offsets_d); | |
| 808 | |||
| 809 | // Copy even lanes twice -> 000022224444... these are the "bottom" | ||
| 810 | // fractions | ||
| 811 | 19820 | svuint16_t xfr = svreinterpret_u16_u32(svtrn1_u32( | |
| 812 | 9910 | svreinterpret_u32_u16(xfrac), svreinterpret_u32_u16(xfrac))); | |
| 813 | 19820 | svuint16_t nxfr = svreinterpret_u16_u32(svtrn1_u32( | |
| 814 | 9910 | svreinterpret_u32_u16(nxfrac), svreinterpret_u32_u16(nxfrac))); | |
| 815 | 19820 | svuint16_t yfr = svreinterpret_u16_u32(svtrn1_u32( | |
| 816 | 9910 | svreinterpret_u32_u16(yfrac), svreinterpret_u32_u16(yfrac))); | |
| 817 | 19820 | svuint16_t nyfr = svreinterpret_u16_u32(svtrn1_u32( | |
| 818 | 9910 | svreinterpret_u32_u16(nyfrac), svreinterpret_u32_u16(nyfrac))); | |
| 819 | |||
| 820 | 9910 | res_b = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias); | |
| 821 | 9910 | } | |
| 822 | |||
| 823 | { // top | ||
// Same offset computation for the odd 32-bit coordinate lanes. The offsets
// are consumed by gathers governed by pg64_t, so they are computed under that
// same predicate rather than under the bottom-half predicate.
| 824 | 9910 | svuint64_t x0w = svshllt_n_u64(x0, 3); | |
| 825 | 9910 | svuint64_t x1w = svshllt_n_u64(x1, 3); | |
| 826 | 9910 | svuint64_t ys0w = svmullt_u64(y0, v_src_stride_); | |
| 827 | 9910 | svuint64_t ys1w = svmullt_u64(y1, v_src_stride_); | |
| 828 | 9910 | svuint64_t offsets_a = svadd_x(pg64_t, x0w, ys0w); | |
| 829 | 9910 | svuint64_t offsets_b = svadd_x(pg64_t, x1w, ys0w); | |
| 830 | 9910 | svuint64_t offsets_c = svadd_x(pg64_t, x0w, ys1w); | |
| 831 | 9910 | svuint64_t offsets_d = svadd_x(pg64_t, x1w, ys1w); | |
| 832 | |||
| 833 | 9910 | svuint16_t a = load_4ch(pg64_t, offsets_a); | |
| 834 | 9910 | svuint16_t b = load_4ch(pg64_t, offsets_b); | |
| 835 | 9910 | svuint16_t c = load_4ch(pg64_t, offsets_c); | |
| 836 | 9910 | svuint16_t d = load_4ch(pg64_t, offsets_d); | |
| 837 | |||
| 838 | // Copy odd lanes twice -> 111133335555... these are the "top" | ||
| 839 | // fractions | ||
| 840 | 19820 | svuint16_t xfr = svreinterpret_u16_u32(svtrn2_u32( | |
| 841 | 9910 | svreinterpret_u32_u16(xfrac), svreinterpret_u32_u16(xfrac))); | |
| 842 | 19820 | svuint16_t nxfr = svreinterpret_u16_u32(svtrn2_u32( | |
| 843 | 9910 | svreinterpret_u32_u16(nxfrac), svreinterpret_u32_u16(nxfrac))); | |
| 844 | 19820 | svuint16_t yfr = svreinterpret_u16_u32(svtrn2_u32( | |
| 845 | 9910 | svreinterpret_u32_u16(yfrac), svreinterpret_u32_u16(yfrac))); | |
| 846 | 19820 | svuint16_t nyfr = svreinterpret_u16_u32(svtrn2_u32( | |
| 847 | 9910 | svreinterpret_u32_u16(nyfrac), svreinterpret_u32_u16(nyfrac))); | |
| 848 | |||
| 849 | 9910 | res_t = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias); | |
| 850 | 9910 | } | |
| 851 | |||
// Interleave the bottom (even pixel) and top (odd pixel) results back into
// pixel order and write them out as two vectors of 64-bit pixels.
| 852 | 19820 | svuint64_t res_low = | |
| 853 | 9910 | svzip1_u64(svreinterpret_u64_u16(res_b), svreinterpret_u64_u16(res_t)); | |
| 854 | 19820 | svuint64_t res_high = | |
| 855 | 9910 | svzip2_u64(svreinterpret_u64_u16(res_b), svreinterpret_u64_u16(res_t)); | |
| 856 | 9910 | svst1_u64(pg_low, reinterpret_cast<uint64_t*>(&dst[0]), res_low); | |
| 857 | 19820 | svst1_u64(pg_high, reinterpret_cast<uint64_t*>(&dst[0]) + svcntd(), | |
| 858 | 9910 | res_high); | |
| 859 | 9910 | mapxy += step; | |
| 860 | 9910 | mapfrac += step; | |
| 861 | 9910 | dst += step; | |
| 862 | 9910 | } | |
| 863 | |||
| 864 | Rows<const ScalarType> src_rows_; | ||
| 865 | |||
| 866 | private: | ||
// References to vectors owned by the caller; assigned in the constructor.
| 867 | svuint32_t& v_src_stride_; | ||
| 868 | svint32_t& v_xmax_; | ||
| 869 | svint32_t& v_ymax_; | ||
| 870 | }; // end of class RemapS16Point5Replicate4ch<uint16_t> | ||
| 871 | |||
// Primary template for the 4-channel remap with a constant border value.
// Only declared here; the uint8_t and uint16_t specialisations that follow
// provide the implementations.
| 872 | template <typename ScalarType> | ||
| 873 | class RemapS16Point5Constant4ch; | ||
| 874 | |||
// Remap with a constant border for 4-channel images of 8-bit elements.
// Each neighbouring pixel is gathered as one 32-bit value (4 x 8 bits); a
// neighbour whose coordinates lie outside the source image is replaced by the
// border pixel before the bilinear blend.
| 875 | template <> | ||
| 876 | class RemapS16Point5Constant4ch<uint8_t> { | ||
| 877 | public: | ||
| 878 | using ScalarType = uint8_t; | ||
| 879 | |||
// Keeps references to the caller-provided vectors and fills them with the
// broadcast source stride, the largest valid x / y coordinate and the border
// pixel. Four bytes are read from border_value (one per channel).
// NOTE(review): memcpy is declared in <cstring>, which is not among the
// standard headers included at the top of the file - presumably it arrives
// through a project header; confirm.
| 880 | 132 | RemapS16Point5Constant4ch(Rows<const ScalarType> src_rows, size_t src_width, | |
| 881 | size_t src_height, const ScalarType* border_value, | ||
| 882 | svuint16_t& v_src_stride, svuint16_t& v_x_max, | ||
| 883 | svuint16_t& v_y_max, svuint32_t& v_border) | ||
| 884 | 132 | : src_rows_{src_rows}, | |
| 885 | 132 | v_src_stride_{v_src_stride}, | |
| 886 | 132 | v_xmax_{v_x_max}, | |
| 887 | 132 | v_ymax_{v_y_max}, | |
| 888 | 132 | v_border_{v_border} { | |
| 889 | 132 | v_src_stride_ = svdup_u16(src_rows.stride()); | |
| 890 | 132 | v_xmax_ = svdup_u16(static_cast<uint16_t>(src_width - 1)); | |
| 891 | 132 | v_ymax_ = svdup_u16(static_cast<uint16_t>(src_height - 1)); | |
| 892 | 132 | uint32_t border_value_u32{}; | |
| 893 | 132 | memcpy(&border_value_u32, border_value, sizeof(uint32_t)); | |
| 894 | 132 | v_border_ = svdup_u32(border_value_u32); | |
| 895 | 132 | } | |
| 896 | |||
// Processes one destination row of `width` pixels, svcnth() pixels per vector
// step. Full steps use an all-true predicate; the remainder uses a predicate
// whose leading lanes are active, one per remaining pixel
// (NOTE(review): assumes length <= step - confirm against LoopUnroll).
| 897 | 156 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
| 898 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
| 899 | 156 | LoopUnroll loop{width, svcnth()}; | |
| 900 | 5354 | loop.unroll_once([&](size_t step) { | |
| 901 | 5198 | svbool_t pg = svptrue_b16(); | |
| 902 | 5198 | vector_path(pg, mapxy, mapfrac, dst, static_cast<ptrdiff_t>(step)); | |
| 903 | 5198 | }); | |
| 904 | 230 | loop.remaining([&](size_t length, size_t step) { | |
| 905 | 74 | svbool_t pg = svwhilelt_b16_u64(step - length, step); | |
| 906 | 74 | vector_path(pg, mapxy, mapfrac, dst, static_cast<ptrdiff_t>(length)); | |
| 907 | 74 | }); | |
| 908 | 156 | } | |
| 909 | |||
// Handles `step` pixels under the 16-bit predicate `pg`: loads coordinates
// and fractions, gathers the 2x2 neighbourhood (or the border pixel) of every
// pixel, interpolates, stores the result and advances mapxy, mapfrac and dst
// by `step`.
| 910 | 5272 | void vector_path(svbool_t pg, Columns<const int16_t>& mapxy, | |
| 911 | Columns<const uint16_t>& mapfrac, Columns<ScalarType>& dst, | ||
| 912 | ptrdiff_t step) { | ||
// Two-vector structure load: element 0 receives the x values, element 1 the
// y values of the interleaved coordinate pairs.
| 913 | 10544 | svuint16x2_t xy = | |
| 914 | 5272 | svld2_u16(pg, reinterpret_cast<const uint16_t*>(&mapxy[0])); | |
// Rounding term added before the final right shift.
| 915 | 5272 | svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); | |
| 916 | |||
| 917 | // Negative values become big positive ones | ||
| 918 | 5272 | svuint16_t x0 = svget2(xy, 0); | |
| 919 | 5272 | svuint16_t y0 = svget2(xy, 1); | |
// Unsigned add: a coordinate of -1 (0xFFFF) wraps to 0 for the +1 neighbour.
| 920 | 5272 | svuint16_t x1 = svadd_n_u16_x(pg, x0, 1); | |
| 921 | 5272 | svuint16_t y1 = svadd_n_u16_x(pg, y0, 1); | |
| 922 | |||
| 923 | // Calculate offsets from coordinates (y * stride + x), x multiplied by 4 | ||
| 924 | // channels | ||
// "_b" variant: pixels whose coordinates sit in even 16-bit lanes. A lane is
// in range when x <= xmax and y <= ymax (unsigned, so negatives fail). Only
// in-range lanes are gathered; all other lanes take the border pixel.
| 925 | 26360 | auto load_4ch_or_border_b = [&](svuint16_t x, svuint16_t y) { | |
| 926 | 42176 | svbool_t in_range_b16 = | |
| 927 | 21088 | svand_b_z(pg, svcmple(pg, x, v_xmax_), svcmple(pg, y, v_ymax_)); | |
| 928 | 21088 | svbool_t in_range = svtrn1_b16(in_range_b16, svpfalse()); | |
| 929 | 42176 | svuint32_t image = svld1_gather_u32offset_u32( | |
| 930 | 21088 | in_range, reinterpret_cast<const uint32_t*>(&src_rows_[0]), | |
| 931 | 21088 | svmlalb_u32(svshllb_n_u32(x, 2), y, v_src_stride_)); | |
| 932 | 42176 | return svreinterpret_u8_u32(svsel(in_range, image, v_border_)); | |
| 933 | 21088 | }; | |
// "_t" variant: the same for coordinates in odd 16-bit lanes.
| 934 | 26360 | auto load_4ch_or_border_t = [&](svuint16_t x, svuint16_t y) { | |
| 935 | 42176 | svbool_t in_range_b16 = | |
| 936 | 21088 | svand_b_z(pg, svcmple(pg, x, v_xmax_), svcmple(pg, y, v_ymax_)); | |
| 937 | 21088 | svbool_t in_range = svtrn2_b16(in_range_b16, svpfalse()); | |
| 938 | 42176 | svuint32_t image = svld1_gather_u32offset_u32( | |
| 939 | 21088 | in_range, reinterpret_cast<const uint32_t*>(&src_rows_[0]), | |
| 940 | 21088 | svmlalt_u32(svshllt_n_u32(x, 2), y, v_src_stride_)); | |
| 941 | 42176 | return svreinterpret_u8_u32(svsel(in_range, image, v_border_)); | |
| 942 | 21088 | }; | |
| 943 | |||
// x fraction: frac masked with FRAC_MAX - 1; y fraction: frac shifted right
// by FRAC_BITS, then the same mask.
| 944 | 5272 | svuint16_t frac = svld1_u16(pg, &mapfrac[0]); | |
| 945 | 10544 | svuint16_t xfrac = | |
| 946 | 5272 | svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
| 947 | 10544 | svuint16_t yfrac = | |
| 948 | 10544 | svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), | |
| 949 | 5272 | svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
| 950 | |||
// Bilinear blend of widened 8-bit channels:
//   line0 = xfrac * src_b + nxfrac * src_a   (16-bit)
//   line1 = xfrac * src_d + nxfrac * src_c   (16-bit)
//   acc   = bias + line0 * nyfrac + line1 * yfrac   (32-bit, even/odd lanes)
// The accumulators are shifted right by 2 * FRAC_BITS and narrowed back into
// interleaved 16-bit lanes.
| 951 | 26360 | auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac, | |
| 952 | svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b, | ||
| 953 | svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { | ||
| 954 | 42176 | svuint16_t line0 = svmla_x( | |
| 955 | 21088 | svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_b), nxfrac, src_a); | |
| 956 | 42176 | svuint16_t line1 = svmla_x( | |
| 957 | 21088 | svptrue_b16(), svmul_x(svptrue_b16(), xfrac, src_d), nxfrac, src_c); | |
| 958 | |||
| 959 | 21088 | svuint32_t acc_b = svmlalb_u32(bias, line0, nyfrac); | |
| 960 | 21088 | svuint32_t acc_t = svmlalt_u32(bias, line0, nyfrac); | |
| 961 | 21088 | acc_b = svmlalb_u32(acc_b, line1, yfrac); | |
| 962 | 21088 | acc_t = svmlalt_u32(acc_t, line1, yfrac); | |
| 963 | |||
| 964 | 42176 | return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, | |
| 965 | 2ULL * REMAP16POINT5_FRAC_BITS); | ||
| 966 | 21088 | }; | |
| 967 | |||
| 968 | // bottom part | ||
| 969 | 5272 | svuint8_t a = load_4ch_or_border_b(x0, y0); | |
| 970 | 5272 | svuint8_t b = load_4ch_or_border_b(x1, y0); | |
| 971 | 5272 | svuint8_t c = load_4ch_or_border_b(x0, y1); | |
| 972 | 5272 | svuint8_t d = load_4ch_or_border_b(x1, y1); | |
| 973 | // from xfrac, we need the bottom part twice | ||
// After widening, two adjacent 16-bit lanes hold channels of the same pixel,
// so each even-lane fraction is duplicated into its odd neighbour.
| 974 | 5272 | svuint16_t xfrac2b = svtrn1_u16(xfrac, xfrac); | |
| 975 | 10544 | svuint16_t nxfrac2b = svsub_u16_x( | |
| 976 | 5272 | svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2b); | |
| 977 | 5272 | svuint16_t yfrac2b = svtrn1_u16(yfrac, yfrac); | |
| 978 | 10544 | svuint16_t nyfrac2b = svsub_u16_x( | |
| 979 | 5272 | svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2b); | |
| 980 | |||
| 981 | // a,b,c,d looks like 12341234...(four channels) | ||
| 982 | // bottom is 1313... | ||
| 983 | 10544 | svuint16_t res_bb = | |
| 984 | 10544 | lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlb_u16(a), | |
| 985 | 5272 | svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias); | |
| 986 | // top is 2424... | ||
| 987 | 10544 | svuint16_t res_bt = | |
| 988 | 10544 | lerp2d(xfrac2b, yfrac2b, nxfrac2b, nyfrac2b, svmovlt_u16(a), | |
| 989 | 5272 | svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias); | |
// Interleave the low bytes of both results to restore channel order 1234.
| 990 | 10544 | svuint8_t res_b = | |
| 991 | 5272 | svtrn1_u8(svreinterpret_u8_u16(res_bb), svreinterpret_u8_u16(res_bt)); | |
| 992 | |||
| 993 | // top part | ||
| 994 | 5272 | a = load_4ch_or_border_t(x0, y0); | |
| 995 | 5272 | b = load_4ch_or_border_t(x1, y0); | |
| 996 | 5272 | c = load_4ch_or_border_t(x0, y1); | |
| 997 | 5272 | d = load_4ch_or_border_t(x1, y1); | |
| 998 | // from xfrac, we need the top part twice | ||
| 999 | 5272 | svuint16_t xfrac2t = svtrn2_u16(xfrac, xfrac); | |
| 1000 | 10544 | svuint16_t nxfrac2t = svsub_u16_x( | |
| 1001 | 5272 | svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac2t); | |
| 1002 | 5272 | svuint16_t yfrac2t = svtrn2_u16(yfrac, yfrac); | |
| 1003 | 10544 | svuint16_t nyfrac2t = svsub_u16_x( | |
| 1004 | 5272 | svptrue_b16(), svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac2t); | |
| 1005 | |||
| 1006 | // a,b,c,d looks like 12341234...(four channels) | ||
| 1007 | // bottom is 1313... | ||
| 1008 | 10544 | svuint16_t res_tb = | |
| 1009 | 10544 | lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlb_u16(a), | |
| 1010 | 5272 | svmovlb_u16(b), svmovlb_u16(c), svmovlb_u16(d), bias); | |
| 1011 | // top is 2424... | ||
| 1012 | 10544 | svuint16_t res_tt = | |
| 1013 | 10544 | lerp2d(xfrac2t, yfrac2t, nxfrac2t, nyfrac2t, svmovlt_u16(a), | |
| 1014 | 5272 | svmovlt_u16(b), svmovlt_u16(c), svmovlt_u16(d), bias); | |
| 1015 | 10544 | svuint8_t res_t = | |
| 1016 | 5272 | svtrn1_u8(svreinterpret_u8_u16(res_tb), svreinterpret_u8_u16(res_tt)); | |
| 1017 | |||
// Store predicates in units of 32-bit pixels: the first vector covers pixels
// [0, step), the second covers pixels [svcntw(), step).
| 1018 | 5272 | svbool_t pg_low = svwhilelt_b32_u64(0L, static_cast<size_t>(step)); | |
| 1019 | 5272 | svbool_t pg_high = svwhilelt_b32_u64(svcntw(), static_cast<size_t>(step)); | |
// Interleave the even-pixel (res_b) and odd-pixel (res_t) results back into
// pixel order.
| 1020 | 10544 | svuint32_t res_low = | |
| 1021 | 5272 | svzip1_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t)); | |
| 1022 | 10544 | svuint32_t res_high = | |
| 1023 | 5272 | svzip2_u32(svreinterpret_u32_u8(res_b), svreinterpret_u32_u8(res_t)); | |
| 1024 | 5272 | mapxy += step; | |
| 1025 | 5272 | svst1_u32(pg_low, reinterpret_cast<uint32_t*>(&dst[0]), res_low); | |
| 1026 | 10544 | svst1_u32(pg_high, reinterpret_cast<uint32_t*>(&dst[0]) + svcntw(), | |
| 1027 | 5272 | res_high); | |
| 1028 | 5272 | mapfrac += step; | |
| 1029 | 5272 | dst += step; | |
| 1030 | 5272 | } | |
| 1031 | |||
| 1032 | Rows<const ScalarType> src_rows_; | ||
| 1033 | |||
| 1034 | private: | ||
// References to vectors owned by the caller; assigned in the constructor.
| 1035 | svuint16_t& v_src_stride_; | ||
| 1036 | svuint16_t& v_xmax_; | ||
| 1037 | svuint16_t& v_ymax_; | ||
| 1038 | svuint32_t& v_border_; | ||
| 1039 | }; // end of class RemapS16Point5Constant4ch<uint8_t> | ||
| 1040 | |||
// Remap with a constant border for 4-channel images of 16-bit elements.
// Each neighbouring pixel is gathered as one 64-bit value (4 x 16 bits); a
// neighbour whose coordinates lie outside the source image is replaced by the
// border pixel before the bilinear blend.
| 1041 | template <> | ||
| 1042 | class RemapS16Point5Constant4ch<uint16_t> { | ||
| 1043 | public: | ||
| 1044 | using ScalarType = uint16_t; | ||
| 1045 | |||
// Keeps references to the caller-provided vectors and fills them with the
// broadcast source stride, the largest valid x / y coordinate and the border
// pixel. Eight bytes are read from border_value (two per channel).
// NOTE(review): memcpy is declared in <cstring>, which is not among the
// standard headers included at the top of the file - presumably it arrives
// through a project header; confirm.
| 1046 | 132 | RemapS16Point5Constant4ch(Rows<const ScalarType> src_rows, size_t src_width, | |
| 1047 | size_t src_height, const ScalarType* border_value, | ||
| 1048 | svuint32_t& v_src_stride, svuint32_t& v_x_max, | ||
| 1049 | svuint32_t& v_y_max, svuint64_t& v_border) | ||
| 1050 | 132 | : src_rows_{src_rows}, | |
| 1051 | 132 | v_src_stride_{v_src_stride}, | |
| 1052 | 132 | v_xmax_{v_x_max}, | |
| 1053 | 132 | v_ymax_{v_y_max}, | |
| 1054 | 132 | v_border_{v_border} { | |
| 1055 | 132 | v_src_stride_ = svdup_u32(src_rows.stride()); | |
| 1056 | 132 | v_xmax_ = svdup_u32(static_cast<uint32_t>(src_width - 1)); | |
| 1057 | 132 | v_ymax_ = svdup_u32(static_cast<uint32_t>(src_height - 1)); | |
| 1058 | 132 | uint64_t border_value_u64{}; | |
| 1059 | 132 | memcpy(&border_value_u64, border_value, sizeof(uint64_t)); | |
| 1060 | 132 | v_border_ = svdup_u64(border_value_u64); | |
| 1061 | 132 | } | |
| 1062 | |||
// Processes one destination row of `width` pixels, svcntw() pixels per vector
// step. Full steps use all-true predicates; the remainder builds pg with its
// leading 32-bit lanes active (one per remaining pixel) and widens its lower /
// upper half to 64-bit predicates for the two stores.
// NOTE(review): assumes length does not exceed the lane count - confirm
// against LoopUnroll.
| 1063 | 156 | void process_row(size_t width, Columns<const int16_t> mapxy, | |
| 1064 | Columns<const uint16_t> mapfrac, Columns<ScalarType> dst) { | ||
| 1065 | 156 | LoopUnroll loop{width, svcntw()}; | |
| 1066 | 10006 | loop.unroll_once([&](size_t step) { | |
| 1067 | 19700 | vector_path(svptrue_b32(), svptrue_b64(), svptrue_b64(), mapxy, mapfrac, | |
| 1068 | 9850 | dst, static_cast<ptrdiff_t>(step)); | |
| 1069 | 9850 | }); | |
| 1070 | 216 | loop.remaining([&](size_t length, size_t step) { | |
| 1071 | 60 | svbool_t pg = svwhilelt_b32_u64(step, step + length); | |
| 1072 | 60 | svbool_t pg_low = svzip1_b32(pg, svpfalse()); | |
| 1073 | 60 | svbool_t pg_high = svzip2_b32(pg, svpfalse()); | |
| 1074 | 120 | vector_path(pg, pg_low, pg_high, mapxy, mapfrac, dst, | |
| 1075 | 60 | static_cast<ptrdiff_t>(length)); | |
| 1076 | 60 | }); | |
| 1077 | 156 | } | |
| 1078 | |||
// Handles `step` pixels: loads coordinates and fractions, gathers the 2x2
// neighbourhood (or the border pixel) of every pixel, interpolates, stores
// the result and advances mapxy, mapfrac and dst by `step`.
//   pg               32-bit predicate, one lane per pixel
//   pg_low / pg_high 64-bit predicates for the first / second stored vector
| 1079 | 9910 | void vector_path(svbool_t pg, svbool_t pg_low, svbool_t pg_high, | |
| 1080 | Columns<const int16_t>& mapxy, | ||
| 1081 | Columns<const uint16_t>& mapfrac, Columns<ScalarType>& dst, | ||
| 1082 | ptrdiff_t step) { | ||
| 1083 | // Load one vector of xy: even coordinates are x, odd are y | ||
| 1084 | 19820 | svint16_t xy = svreinterpret_s16_u32( | |
| 1085 | 9910 | svld1_u32(pg, reinterpret_cast<const uint32_t*>(&mapxy[0]))); | |
| 1086 | |||
| 1087 | // Negative values become big positive ones | ||
| 1088 | // Widening is signed, so 16-bit -1 becomes 32-bit -1 | ||
| 1089 | 9910 | svuint32_t x0 = svreinterpret_u32_s32(svmovlb(xy)); | |
| 1090 | 9910 | svuint32_t y0 = svreinterpret_u32_s32(svmovlt(xy)); | |
// Unsigned add: a coordinate of -1 wraps to 0 for the +1 neighbour.
| 1091 | 9910 | svuint32_t x1 = svadd_n_u32_x(pg, x0, 1); | |
| 1092 | 9910 | svuint32_t y1 = svadd_n_u32_x(pg, y0, 1); | |
| 1093 | |||
// "_b" variant: pixels whose coordinates sit in even 32-bit lanes. A lane is
// in range when x <= xmax and y <= ymax (unsigned, so negatives fail). Only
// in-range lanes are gathered, at offset x * 8 + y * stride from src_rows_[0];
// all other lanes take the border pixel.
| 1094 | 49550 | auto load_4ch_or_border_b = [&](svuint32_t x, svuint32_t y) { | |
| 1095 | 79280 | svbool_t in_range_b32 = | |
| 1096 | 39640 | svand_b_z(pg, svcmple(pg, x, v_xmax_), svcmple(pg, y, v_ymax_)); | |
| 1097 | 39640 | svbool_t in_range = svtrn1_b32(in_range_b32, svpfalse()); | |
| 1098 | 79280 | svuint64_t image = svld1_gather_u64offset_u64( | |
| 1099 | 39640 | in_range, reinterpret_cast<const uint64_t*>(&src_rows_[0]), | |
| 1100 | 39640 | svmlalb_u64(svshllb_n_u64(x, 3), y, v_src_stride_)); | |
| 1101 | 79280 | return svreinterpret_u16_u64(svsel(in_range, image, v_border_)); | |
| 1102 | 39640 | }; | |
| 1103 | |||
// "_t" variant: the same for coordinates in odd 32-bit lanes.
| 1104 | 49550 | auto load_4ch_or_border_t = [&](svuint32_t x, svuint32_t y) { | |
| 1105 | 79280 | svbool_t in_range_b32 = | |
| 1106 | 39640 | svand_b_z(pg, svcmple(pg, x, v_xmax_), svcmple(pg, y, v_ymax_)); | |
| 1107 | 39640 | svbool_t in_range = svtrn2_b32(in_range_b32, svpfalse()); | |
| 1108 | 79280 | svuint64_t image = svld1_gather_u64offset_u64( | |
| 1109 | 39640 | in_range, reinterpret_cast<const uint64_t*>(&src_rows_[0]), | |
| 1110 | 39640 | svmlalt_u64(svshllt_n_u64(x, 3), y, v_src_stride_)); | |
| 1111 | 79280 | return svreinterpret_u16_u64(svsel(in_range, image, v_border_)); | |
| 1112 | 39640 | }; | |
| 1113 | |||
| 1114 | 9910 | svuint16_t xfrac, yfrac, nxfrac, nyfrac; | |
| 1115 | { | ||
| 1116 | // Fractions are loaded into even lanes | ||
| 1117 | 9910 | svuint16_t rawfrac = svreinterpret_u16_u32(svld1uh_u32(pg, &mapfrac[0])); | |
| 1118 | |||
| 1119 | // Fractions are doubled, 00112233... (will be doubled again later) | ||
| 1120 | 9910 | svuint16_t frac = svtrn1(rawfrac, rawfrac); | |
| 1121 | |||
// x fraction: frac masked with FRAC_MAX - 1; y fraction: frac shifted right
// by FRAC_BITS, then the same mask. nxfrac / nyfrac are FRAC_MAX minus those.
| 1122 | 9910 | xfrac = svand_x(pg, frac, svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
| 1123 | 19820 | yfrac = svand_x(pg, svlsr_n_u16_x(pg, frac, REMAP16POINT5_FRAC_BITS), | |
| 1124 | 9910 | svdup_n_u16(REMAP16POINT5_FRAC_MAX - 1)); | |
| 1125 | 9910 | nxfrac = svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), xfrac); | |
| 1126 | 9910 | nyfrac = svsub_u16_x(pg, svdup_n_u16(REMAP16POINT5_FRAC_MAX), yfrac); | |
| 1127 | 9910 | } | |
| 1128 | |||
// Rounding term added before the final right shift.
| 1129 | 9910 | svuint32_t bias = svdup_n_u32(REMAP16POINT5_FRAC_MAX_SQUARE / 2); | |
| 1130 | |||
// Bilinear blend in 32-bit arithmetic, separately for even (_b) and odd (_t)
// 16-bit lanes:
//   line0 = xfrac * src_b + nxfrac * src_a
//   line1 = xfrac * src_d + nxfrac * src_c
//   acc   = bias + line0 * nyfrac + line1 * yfrac
// The accumulators are shifted right by 2 * FRAC_BITS and narrowed back into
// interleaved 16-bit lanes.
| 1131 | 29730 | auto lerp2d = [&](svuint16_t xfrac, svuint16_t yfrac, svuint16_t nxfrac, | |
| 1132 | svuint16_t nyfrac, svuint16_t src_a, svuint16_t src_b, | ||
| 1133 | svuint16_t src_c, svuint16_t src_d, svuint32_t bias) { | ||
| 1134 | 19820 | svuint32_t line0_b = svmlalb(svmullb(xfrac, src_b), nxfrac, src_a); | |
| 1135 | 19820 | svuint32_t line0_t = svmlalt(svmullt(xfrac, src_b), nxfrac, src_a); | |
| 1136 | 19820 | svuint32_t line1_b = svmlalb(svmullb(xfrac, src_d), nxfrac, src_c); | |
| 1137 | 19820 | svuint32_t line1_t = svmlalt(svmullt(xfrac, src_d), nxfrac, src_c); | |
| 1138 | |||
| 1139 | 39640 | svuint32_t acc_b = | |
| 1140 | 19820 | svmla_u32_x(svptrue_b32(), bias, line0_b, svmovlb_u32(nyfrac)); | |
| 1141 | 39640 | svuint32_t acc_t = | |
| 1142 | 19820 | svmla_u32_x(svptrue_b32(), bias, line0_t, svmovlt_u32(nyfrac)); | |
| 1143 | 19820 | acc_b = svmla_u32_x(svptrue_b32(), acc_b, line1_b, svmovlb_u32(yfrac)); | |
| 1144 | 19820 | acc_t = svmla_u32_x(svptrue_b32(), acc_t, line1_t, svmovlt_u32(yfrac)); | |
| 1145 | |||
| 1146 | 39640 | return svshrnt(svshrnb(acc_b, 2ULL * REMAP16POINT5_FRAC_BITS), acc_t, | |
| 1147 | 2ULL * REMAP16POINT5_FRAC_BITS); | ||
| 1148 | 19820 | }; | |
| 1149 | |||
| 1150 | // Data is 4x16 = 64 bits, twice as wide as the widened coords (32-bit) | ||
| 1151 | // Calculation is done in 2 parts, top and bottom | ||
| 1152 | 9910 | svuint16_t res_b, res_t; | |
| 1153 | |||
| 1154 | { // bottom | ||
| 1155 | 9910 | svuint16_t a = load_4ch_or_border_b(x0, y0); | |
| 1156 | 9910 | svuint16_t b = load_4ch_or_border_b(x1, y0); | |
| 1157 | 9910 | svuint16_t c = load_4ch_or_border_b(x0, y1); | |
| 1158 | 9910 | svuint16_t d = load_4ch_or_border_b(x1, y1); | |
| 1159 | |||
| 1160 | // Copy even lanes twice -> 000022224444... these are the "bottom" | ||
| 1161 | // fractions | ||
| 1162 | 19820 | svuint16_t xfr = svreinterpret_u16_u32(svtrn1_u32( | |
| 1163 | 9910 | svreinterpret_u32_u16(xfrac), svreinterpret_u32_u16(xfrac))); | |
| 1164 | 19820 | svuint16_t nxfr = svreinterpret_u16_u32(svtrn1_u32( | |
| 1165 | 9910 | svreinterpret_u32_u16(nxfrac), svreinterpret_u32_u16(nxfrac))); | |
| 1166 | 19820 | svuint16_t yfr = svreinterpret_u16_u32(svtrn1_u32( | |
| 1167 | 9910 | svreinterpret_u32_u16(yfrac), svreinterpret_u32_u16(yfrac))); | |
| 1168 | 19820 | svuint16_t nyfr = svreinterpret_u16_u32(svtrn1_u32( | |
| 1169 | 9910 | svreinterpret_u32_u16(nyfrac), svreinterpret_u32_u16(nyfrac))); | |
| 1170 | |||
| 1171 | 9910 | res_b = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias); | |
| 1172 | 9910 | } | |
| 1173 | |||
| 1174 | { // top | ||
| 1175 | 9910 | svuint16_t a = load_4ch_or_border_t(x0, y0); | |
| 1176 | 9910 | svuint16_t b = load_4ch_or_border_t(x1, y0); | |
| 1177 | 9910 | svuint16_t c = load_4ch_or_border_t(x0, y1); | |
| 1178 | 9910 | svuint16_t d = load_4ch_or_border_t(x1, y1); | |
| 1179 | |||
| 1180 | // Copy odd lanes twice -> 111133335555... these are the "top" | ||
| 1181 | // fractions | ||
| 1182 | 19820 | svuint16_t xfr = svreinterpret_u16_u32(svtrn2_u32( | |
| 1183 | 9910 | svreinterpret_u32_u16(xfrac), svreinterpret_u32_u16(xfrac))); | |
| 1184 | 19820 | svuint16_t nxfr = svreinterpret_u16_u32(svtrn2_u32( | |
| 1185 | 9910 | svreinterpret_u32_u16(nxfrac), svreinterpret_u32_u16(nxfrac))); | |
| 1186 | 19820 | svuint16_t yfr = svreinterpret_u16_u32(svtrn2_u32( | |
| 1187 | 9910 | svreinterpret_u32_u16(yfrac), svreinterpret_u32_u16(yfrac))); | |
| 1188 | 19820 | svuint16_t nyfr = svreinterpret_u16_u32(svtrn2_u32( | |
| 1189 | 9910 | svreinterpret_u32_u16(nyfrac), svreinterpret_u32_u16(nyfrac))); | |
| 1190 | |||
| 1191 | 9910 | res_t = lerp2d(xfr, yfr, nxfr, nyfr, a, b, c, d, bias); | |
| 1192 | 9910 | } | |
| 1193 | |||
// Interleave the bottom (even pixel) and top (odd pixel) results back into
// pixel order and write them out as two vectors of 64-bit pixels.
| 1194 | 19820 | svuint64_t res_low = | |
| 1195 | 9910 | svzip1_u64(svreinterpret_u64_u16(res_b), svreinterpret_u64_u16(res_t)); | |
| 1196 | 19820 | svuint64_t res_high = | |
| 1197 | 9910 | svzip2_u64(svreinterpret_u64_u16(res_b), svreinterpret_u64_u16(res_t)); | |
| 1198 | 9910 | svst1_u64(pg_low, reinterpret_cast<uint64_t*>(&dst[0]), res_low); | |
| 1199 | 19820 | svst1_u64(pg_high, reinterpret_cast<uint64_t*>(&dst[0]) + svcntd(), | |
| 1200 | 9910 | res_high); | |
| 1201 | 9910 | mapxy += step; | |
| 1202 | 9910 | mapfrac += step; | |
| 1203 | 9910 | dst += step; | |
| 1204 | 9910 | } | |
| 1205 | |||
| 1206 | Rows<const ScalarType> src_rows_; | ||
| 1207 | |||
| 1208 | private: | ||
// References to vectors owned by the caller; assigned in the constructor.
| 1209 | svuint32_t& v_src_stride_; | ||
| 1210 | svuint32_t& v_xmax_; | ||
| 1211 | svuint32_t& v_ymax_; | ||
| 1212 | svuint64_t& v_border_; | ||
| 1213 | }; // end of class RemapS16Point5Constant4ch<uint16_t> | ||
| 1214 | |||
| 1215 | // Most of the complexity comes from parameter checking. | ||
| 1216 | // NOLINTBEGIN(readability-function-cognitive-complexity) | ||
| 1217 | template <typename T> | ||
| 1218 | 1140 | kleidicv_error_t remap_s16point5(const T* src, size_t src_stride, | |
| 1219 | size_t src_width, size_t src_height, T* dst, | ||
| 1220 | size_t dst_stride, size_t dst_width, | ||
| 1221 | size_t dst_height, size_t channels, | ||
| 1222 | const int16_t* mapxy, size_t mapxy_stride, | ||
| 1223 | const uint16_t* mapfrac, size_t mapfrac_stride, | ||
| 1224 | kleidicv_border_type_t border_type, | ||
| 1225 | const T* border_value) { | ||
| 1226 |
8/8✓ Branch 0 taken 2 times.
✓ Branch 1 taken 568 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 568 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 568 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 568 times.
|
1140 | CHECK_POINTER_AND_STRIDE(src, src_stride, src_height); |
| 1227 |
8/8✓ Branch 0 taken 2 times.
✓ Branch 1 taken 566 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 566 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 566 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 566 times.
|
1136 | CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height); |
| 1228 |
8/8✓ Branch 0 taken 2 times.
✓ Branch 1 taken 564 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 564 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 564 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 564 times.
|
1132 | CHECK_POINTER_AND_STRIDE(mapxy, mapxy_stride, dst_height); |
| 1229 |
8/8✓ Branch 0 taken 2 times.
✓ Branch 1 taken 562 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 562 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 562 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 562 times.
|
1128 | CHECK_POINTER_AND_STRIDE(mapfrac, mapfrac_stride, dst_height); |
| 1230 |
12/12✓ Branch 0 taken 2 times.
✓ Branch 1 taken 560 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 556 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 556 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 560 times.
✓ Branch 8 taken 4 times.
✓ Branch 9 taken 556 times.
✓ Branch 10 taken 6 times.
✓ Branch 11 taken 556 times.
|
1124 | CHECK_IMAGE_SIZE(src_width, src_height); |
| 1231 |
12/12✓ Branch 0 taken 2 times.
✓ Branch 1 taken 554 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 552 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 552 times.
✓ Branch 6 taken 2 times.
✓ Branch 7 taken 554 times.
✓ Branch 8 taken 2 times.
✓ Branch 9 taken 552 times.
✓ Branch 10 taken 4 times.
✓ Branch 11 taken 552 times.
|
1112 | CHECK_IMAGE_SIZE(dst_width, dst_height); |
| 1232 |
8/8✓ Branch 0 taken 268 times.
✓ Branch 1 taken 284 times.
✓ Branch 2 taken 266 times.
✓ Branch 3 taken 2 times.
✓ Branch 4 taken 268 times.
✓ Branch 5 taken 284 times.
✓ Branch 6 taken 266 times.
✓ Branch 7 taken 2 times.
|
1104 | if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT && nullptr == border_value) { |
| 1233 | 4 | return KLEIDICV_ERROR_NULL_POINTER; | |
| 1234 | } | ||
| 1235 | |||
| 1236 |
8/8✓ Branch 0 taken 530 times.
✓ Branch 1 taken 20 times.
✓ Branch 2 taken 530 times.
✓ Branch 3 taken 20 times.
✓ Branch 4 taken 530 times.
✓ Branch 5 taken 20 times.
✓ Branch 6 taken 530 times.
✓ Branch 7 taken 20 times.
|
2200 | if (!remap_s16point5_is_implemented<T>(src_stride, src_width, src_height, |
| 1237 | 1100 | dst_width, border_type, channels)) { | |
| 1238 | 40 | return KLEIDICV_ERROR_NOT_IMPLEMENTED; | |
| 1239 | } | ||
| 1240 | |||
| 1241 | 1060 | Rows<const T> src_rows{src, src_stride, channels}; | |
| 1242 | 1060 | Rows<const int16_t> mapxy_rows{mapxy, mapxy_stride, 2}; | |
| 1243 | 1060 | Rows<const uint16_t> mapfrac_rows{mapfrac, mapfrac_stride, 1}; | |
| 1244 | 1060 | Rows<T> dst_rows{dst, dst_stride, channels}; | |
| 1245 | 1060 | svuint16_t sv_src_stride; | |
| 1246 | 1060 | Rectangle rect{dst_width, dst_height}; | |
| 1247 | |||
| 1248 |
4/4✓ Branch 0 taken 266 times.
✓ Branch 1 taken 264 times.
✓ Branch 2 taken 266 times.
✓ Branch 3 taken 264 times.
|
1060 | if (border_type == KLEIDICV_BORDER_TYPE_CONSTANT) { |
| 1249 |
4/4✓ Branch 0 taken 132 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 132 times.
✓ Branch 3 taken 132 times.
|
528 | if (channels == 1) { |
| 1250 | 264 | svuint16_t sv_width, sv_height, sv_border; | |
| 1251 | 528 | RemapS16Point5ConstantBorder<T> operation{ | |
| 1252 | 264 | src_rows, src_width, src_height, border_value, | |
| 1253 | sv_src_stride, sv_width, sv_height, sv_border}; | ||
| 1254 | 264 | zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); | |
| 1255 | 264 | } else { | |
| 1256 | assert(channels == 4); | ||
| 1257 | typedef typename double_element_width<T>::type DoubleType; | ||
| 1258 | typedef typename double_element_width<DoubleType>::type QuadType; | ||
| 1259 | 264 | typename VecTraits<DoubleType>::VectorType sv_width, sv_height, | |
| 1260 | sv_src_stride; | ||
| 1261 | 264 | typename VecTraits<QuadType>::VectorType sv_border; | |
| 1262 | 528 | RemapS16Point5Constant4ch<T> operation{ | |
| 1263 | 264 | src_rows, src_width, src_height, border_value, | |
| 1264 | sv_src_stride, sv_width, sv_height, sv_border}; | ||
| 1265 | 264 | zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); | |
| 1266 | 264 | } | |
| 1267 | 528 | } else { | |
| 1268 | assert(border_type == KLEIDICV_BORDER_TYPE_REPLICATE); | ||
| 1269 | 532 | svint16_t sv_xmax, sv_ymax; | |
| 1270 |
4/4✓ Branch 0 taken 134 times.
✓ Branch 1 taken 132 times.
✓ Branch 2 taken 134 times.
✓ Branch 3 taken 132 times.
|
532 | if (channels == 1) { |
| 1271 | 268 | RemapS16Point5Replicate<T> operation{src_rows, src_width, src_height, | |
| 1272 | sv_src_stride, sv_xmax, sv_ymax}; | ||
| 1273 | 268 | zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); | |
| 1274 | 268 | } else { | |
| 1275 | assert(channels == 4); | ||
| 1276 | if constexpr (std::is_same<T, uint8_t>::value) { | ||
| 1277 | 264 | RemapS16Point5Replicate4ch<T> operation{ | |
| 1278 | 132 | src_rows, src_width, src_height, sv_src_stride, sv_xmax, sv_ymax}; | |
| 1279 | 132 | zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); | |
| 1280 | 132 | } | |
| 1281 | if constexpr (std::is_same<T, uint16_t>::value) { | ||
| 1282 | 132 | svuint32_t stride; | |
| 1283 | 132 | svint32_t xmax, ymax; | |
| 1284 | 132 | RemapS16Point5Replicate4ch<T> operation{src_rows, src_width, src_height, | |
| 1285 | stride, xmax, ymax}; | ||
| 1286 | 132 | zip_rows(operation, rect, mapxy_rows, mapfrac_rows, dst_rows); | |
| 1287 | 132 | } | |
| 1288 | } | ||
| 1289 | 532 | } | |
| 1290 | 1060 | return KLEIDICV_OK; | |
| 1291 | 1140 | } | |
| 1292 | // NOLINTEND(readability-function-cognitive-complexity) | ||
| 1293 | |||
| 1294 | #define KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(type) \ | ||
| 1295 | template KLEIDICV_TARGET_FN_ATTRS kleidicv_error_t remap_s16point5<type>( \ | ||
| 1296 | const type* src, size_t src_stride, size_t src_width, size_t src_height, \ | ||
| 1297 | type* dst, size_t dst_stride, size_t dst_width, size_t dst_height, \ | ||
| 1298 | size_t channels, const int16_t* mapxy, size_t mapxy_stride, \ | ||
| 1299 | const uint16_t* mapfrac, size_t mapfrac_stride, \ | ||
| 1300 | kleidicv_border_type_t border_type, const type* border_value) | ||
| 1301 | |||
| 1302 | KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint8_t); | ||
| 1303 | KLEIDICV_INSTANTIATE_TEMPLATE_REMAP_S16Point5(uint16_t); | ||
| 1304 | |||
| 1305 | } // namespace kleidicv::sve2 | ||
| 1306 |