Line | Branch |    Exec | Source
   1 |        |         | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
   2 |        |         | //
   3 |        |         | // SPDX-License-Identifier: Apache-2.0
   4 |        |         |
   5 |        |         | #include <cassert>
   6 |        |         | #include <cmath>
   7 |        |         | #include <cstddef>
   8 |        |         | #include <cstdint>
   9 |        |         |
  10 |        |         | #include "kleidicv/sve2.h"
  11 |        |         | #include "kleidicv/types.h"
  12 |        |         | #include "transform_common.h"
  13 |        |         |
  14 |        |         | namespace kleidicv::sve2 {
  15 |        |         |
  16 |        |         | template <typename ScalarType, bool IsLarge>
  17 |        | 1686734 | svuint32_t inline load_xy(svbool_t pg, svuint32_t x, svuint32_t y,
  18 |        |         |                           svuint32_t sv_src_stride,
  19 |        |         |                           Rows<const ScalarType> &src_rows) {
  20 |        |         |   if constexpr (IsLarge) {
  21 |        |    7380 |     svbool_t pg_b = pg;
  22 |        |    7380 |     svbool_t pg_t = svtrn2_b32(pg, svpfalse());
  23 |        |         |
  24 |        |         |     // Calculate offsets from coordinates (y * stride + x)
  25 |        |         |     // To avoid losing precision, the final offsets should be in 64 bits
  26 |        |    7380 |     svuint64_t result_b, result_t;
  27 |        |         |     if constexpr (std::is_same<ScalarType, uint8_t>::value) {
  28 |        |    6180 |       svuint64_t offsets_b = svmlalb(svmovlb(x), y, sv_src_stride);
  29 |        |    6180 |       svuint64_t offsets_t = svmlalt(svmovlt(x), y, sv_src_stride);
  30 |        |         |       // Copy pixels from source
  31 |        |    6180 |       result_b = svld1ub_gather_offset_u64(pg_b, &src_rows[0], offsets_b);
  32 |        |    6180 |       result_t = svld1ub_gather_offset_u64(pg_t, &src_rows[0], offsets_t);
  33 |        |    6180 |     }
  34 |        |         |     if constexpr (std::is_same<ScalarType, uint16_t>::value) {
  35 |        |         |       // Multiply x with sizeof(uint16_t)
  36 |        |    1200 |       svuint64_t offsets_b = svmlalb(svshllb(x, 1), y, sv_src_stride);
  37 |        |    1200 |       svuint64_t offsets_t = svmlalt(svshllt(x, 1), y, sv_src_stride);
  38 |        |         |       // Copy pixels from source
  39 |        |    1200 |       result_b = svld1uh_gather_offset_u64(pg_b, &src_rows[0], offsets_b);
  40 |        |    1200 |       result_t = svld1uh_gather_offset_u64(pg_t, &src_rows[0], offsets_t);
  41 |        |    1200 |     }
  42 |        |   22140 |     return svtrn1_u32(svreinterpret_u32_u64(result_b),
  43 |        |    7380 |                       svreinterpret_u32_u64(result_t));
  44 |        |    7380 |   } else {
  45 |        |         |     if constexpr (std::is_same<ScalarType, uint8_t>::value) {
  46 |        | 1613158 |       svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride);
  47 |        | 3226316 |       return svld1ub_gather_offset_u32(pg, &src_rows[0], offsets);
  48 |        | 1613158 |     } else {
  49 |        |         |       // Multiply by sizeof(uint16_t)
  50 |        |   66196 |       x = svlsl_x(pg, x, 1);
  51 |        |   66196 |       svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride);
  52 |        |  132392 |       return svld1uh_gather_offset_u32(pg, &src_rows[0], offsets);
  53 |        |   66196 |     }
  54 |        |         |   }
  55 |        |         | }
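
Per active lane, load_xy above reduces to a byte offset of y * stride + x * sizeof(ScalarType) followed by a widening gather load. The following scalar sketch only illustrates that per-lane arithmetic; load_xy_scalar and its parameters are hypothetical names, not part of the library.

#include <cstddef>
#include <cstdint>
#include <cstring>

// Scalar model of one lane of load_xy: compute the byte offset of pixel
// (x, y) and zero-extend a single pixel to 32 bits, as the svld1ub/svld1uh
// gathers do above.
template <typename ScalarType>
uint32_t load_xy_scalar(const ScalarType *src, size_t src_stride_bytes,
                        uint32_t x, uint32_t y) {
  // 64-bit arithmetic mirrors the IsLarge branch, where a 32-bit offset
  // could overflow for large images.
  uint64_t offset = static_cast<uint64_t>(y) * src_stride_bytes +
                    static_cast<uint64_t>(x) * sizeof(ScalarType);
  ScalarType pixel;
  std::memcpy(&pixel, reinterpret_cast<const uint8_t *>(src) + offset,
              sizeof(ScalarType));
  return pixel;  // zero-extended to 32 bits
}
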
  56 |        |         |
  57 |        |         | template <typename ScalarType, bool IsLarge>
  58 |        |  143792 | svuint32_t inline load_xy_2ch(svbool_t pg, svuint32_t x, svuint32_t y,
  59 |        |         |                               svuint32_t sv_src_stride,
  60 |        |         |                               Rows<const ScalarType> &src_rows,
  61 |        |         |                               [[maybe_unused]] svuint8_t load_table) {
  62 |        |         |   if constexpr (IsLarge) {
  63 |        |    3600 |     svbool_t pg_b = pg;
  64 |        |    3600 |     svbool_t pg_t = svtrn2_b32(pg, svpfalse());
  65 |        |         |
  66 |        |         |     // Calculate offsets from coordinates (y * stride + x)
  67 |        |         |     // To avoid losing precision, the final offsets should be in 64 bits
  68 |        |         |     if constexpr (std::is_same<ScalarType, uint8_t>::value) {
  69 |        |         |       // Multiply x with the number of channels
  70 |        |    2400 |       svuint64_t offsets_b = svmlalb(svshllb(x, 1), y, sv_src_stride);
  71 |        |    2400 |       svuint64_t offsets_t = svmlalt(svshllt(x, 1), y, sv_src_stride);
  72 |        |         |       // Copy pixels from source
  73 |        |    4800 |       svuint64_t b = svld1uh_gather_offset_u64(
  74 |        |    2400 |           pg_b, reinterpret_cast<const uint16_t *>(&src_rows[0]), offsets_b);
  75 |        |    4800 |       svuint64_t t = svld1uh_gather_offset_u64(
  76 |        |    2400 |           pg_t, reinterpret_cast<const uint16_t *>(&src_rows[0]), offsets_t);
  77 |        |    4800 |       svuint32_t r32 =
  78 |        |    2400 |           svtrn1_u32(svreinterpret_u32_u64(b), svreinterpret_u32_u64(t));
  79 |        |    4800 |       return svreinterpret_u32_u8(
  80 |        |    2400 |           svtbl_u8(svreinterpret_u8_u32(r32), load_table));
  81 |        |    2400 |     }
  82 |        |         |     if constexpr (std::is_same<ScalarType, uint16_t>::value) {
  83 |        |         |       // Multiply x with the number of channels and sizeof(uint16_t)
  84 |        |    1200 |       svuint64_t offsets_b = svmlalb(svshllb(x, 2), y, sv_src_stride);
  85 |        |    1200 |       svuint64_t offsets_t = svmlalt(svshllt(x, 2), y, sv_src_stride);
  86 |        |         |       // Copy pixels from source
  87 |        |    2400 |       svuint64_t result_b = svld1uw_gather_offset_u64(
  88 |        |    1200 |           pg_b, reinterpret_cast<const uint32_t *>(&src_rows[0]), offsets_b);
  89 |        |    2400 |       svuint64_t result_t = svld1uw_gather_offset_u64(
  90 |        |    1200 |           pg_t, reinterpret_cast<const uint32_t *>(&src_rows[0]), offsets_t);
  91 |        |    3600 |       return svtrn1_u32(svreinterpret_u32_u64(result_b),
  92 |        |    1200 |                         svreinterpret_u32_u64(result_t));
  93 |        |    1200 |     }
  94 |        |    3600 |   } else {
  95 |        |         |     // Multiply x with the number of channels and sizeof(ScalarType)
  96 |        |         |     // This shifting formula is only correct for 8 and 16 bits
  97 |        |  140192 |     x = svlsl_n_u32_x(pg, x, sizeof(ScalarType));
  98 |        |  140192 |     svuint32_t offsets = svmla_x(pg, x, y, sv_src_stride);
  99 |        |         |     if constexpr (std::is_same<ScalarType, uint8_t>::value) {
 100 |        |  145592 |       svuint32_t r32 = svld1uh_gather_offset_u32(
 101 |        |   72796 |           pg, reinterpret_cast<const uint16_t *>(&src_rows[0]), offsets);
 102 |        |  145592 |       return svreinterpret_u32_u8(
 103 |        |   72796 |           svtbl_u8(svreinterpret_u8_u32(r32), load_table));
 104 |        |   72796 |     }
 105 |        |         |     if constexpr (std::is_same<ScalarType, uint16_t>::value) {
 106 |        |  134792 |       return svld1_gather_u32offset_u32(
 107 |        |   67396 |           pg, reinterpret_cast<const uint32_t *>(&src_rows[0]), offsets);
 108 |        |         |     }
 109 |        |  140192 |   }
 110 |        |         | }
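
In the two-channel variant each pixel pair is fetched as one wider element, so the byte offset is y * stride + x * 2 * sizeof(ScalarType); the left shift by sizeof(ScalarType) in the non-IsLarge branch is exactly that multiplication for 8- and 16-bit types. A small illustrative check follows; offset_2ch is a hypothetical helper, not library code.

#include <cstdint>

// Byte offset of the two-channel pixel (x, y).
template <typename ScalarType>
uint64_t offset_2ch(uint32_t x, uint32_t y, uint32_t src_stride_bytes) {
  return static_cast<uint64_t>(y) * src_stride_bytes +
         static_cast<uint64_t>(x) * 2u * sizeof(ScalarType);
}

// The shift amount used above equals the channel-and-size multiply for the
// two supported element types.
static_assert((5u << sizeof(uint8_t)) == 5u * 2u * sizeof(uint8_t),
              "x << 1 is x * 2 bytes per two-channel uint8_t pixel");
static_assert((5u << sizeof(uint16_t)) == 5u * 2u * sizeof(uint16_t),
              "x << 2 is x * 4 bytes per two-channel uint16_t pixel");
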
 111 |        |         |
 112 |        |         | template <typename ScalarType, bool IsLarge, size_t Channels>
 113 |        |  247686 | svuint32_t inline calculate_linear_replicated_border(
 114 |        |         |     svbool_t pg, svfloat32x2_t coords, svfloat32_t xmaxf, svfloat32_t ymaxf,
 115 |        |         |     svuint32_t sv_src_stride, Rows<const ScalarType> &src_rows,
 116 |        |         |     svuint8_t load_table_2ch) {
 117 |        |  247686 |   svbool_t pg_all32 = svptrue_b32();
 118 |        |         |
 119 |        | 1238430 |   auto load_source = [&](svuint32_t x, svuint32_t y) {
 120 |        |         |     if constexpr (Channels == 1) {
 121 |        |  920824 |       return load_xy<ScalarType, IsLarge>(pg, x, y, sv_src_stride, src_rows);
 122 |        |         |     }
 123 |        |         |     if constexpr (Channels == 2) {
 124 |        |  139840 |       return load_xy_2ch<ScalarType, IsLarge>(pg, x, y, sv_src_stride, src_rows,
 125 |        |   69920 |                                               load_table_2ch);
 126 |        |         |     }
 127 |        |         |   };
 128 |        |         |
 129 |        |  247686 |   svfloat32_t xf = svget2(coords, 0);
 130 |        |  247686 |   svfloat32_t yf = svget2(coords, 1);
 131 |        |         |   // Take the integer part, clamp it to within the dimensions of the
 132 |        |         |   // source image (negative values are already saturated to 0)
 133 |        |  247686 |   svuint32_t x0 = svcvt_u32_f32_x(pg_all32, svmin_x(pg_all32, xf, xmaxf));
 134 |        |  247686 |   svuint32_t y0 = svcvt_u32_f32_x(pg_all32, svmin_x(pg_all32, yf, ymaxf));
 135 |        |         |
 136 |        |         |   // Get fractional part, or 0 if out of range
 137 |        |  495372 |   svbool_t x_in_range = svand_z(pg_all32, svcmpge_n_f32(pg_all32, xf, 0.F),
 138 |        |  247686 |                                 svcmplt_f32(pg_all32, xf, xmaxf));
 139 |        |  495372 |   svbool_t y_in_range = svand_z(pg_all32, svcmpge_n_f32(pg_all32, yf, 0.F),
 140 |        |  247686 |                                 svcmplt_f32(pg_all32, yf, ymaxf));
 141 |        |  495372 |   svfloat32_t xfrac =
 142 |        |  495372 |       svsel_f32(x_in_range, svsub_f32_x(pg_all32, xf, svrintm_x(pg_all32, xf)),
 143 |        |  247686 |                 svdup_n_f32(0.F));
 144 |        |  495372 |   svfloat32_t yfrac =
 145 |        |  495372 |       svsel_f32(y_in_range, svsub_f32_x(pg_all32, yf, svrintm_x(pg_all32, yf)),
 146 |        |  247686 |                 svdup_n_f32(0.F));
 147 |        |         |
 148 |        |         |   // x1 = x0 + 1, except if it's already xmax or out of range
 149 |        |  247686 |   svuint32_t x1 = svsel_u32(x_in_range, svadd_n_u32_x(pg_all32, x0, 1), x0);
 150 |        |  247686 |   svuint32_t y1 = svsel_u32(y_in_range, svadd_n_u32_x(pg_all32, y0, 1), y0);
 151 |        |         |
 152 |        |  512852 |   auto lerp_2d = [&](svuint32_t ai, svuint32_t bi, svuint32_t ci,
 153 |        |         |                      svuint32_t di) {
 154 |        |  265166 |     svfloat32_t a = svcvt_f32_u32_x(pg_all32, ai);
 155 |        |  265166 |     svfloat32_t b = svcvt_f32_u32_x(pg_all32, bi);
 156 |        |  530332 |     svfloat32_t line0 =
 157 |        |  265166 |         svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac);
 158 |        |  265166 |     svfloat32_t c = svcvt_f32_u32_x(pg_all32, ci);
 159 |        |  265166 |     svfloat32_t d = svcvt_f32_u32_x(pg_all32, di);
 160 |        |  530332 |     svfloat32_t line1 =
 161 |        |  265166 |         svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac);
 162 |        |  530332 |     svfloat32_t result = svmla_f32_x(
 163 |        |  265166 |         pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac);
 164 |        |  530332 |     return svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F));
 165 |        |  265166 |   };
 166 |        |         |
 167 |        |         |   // Calculate offsets from coordinates (y * stride + x)
 168 |        |         |   // a: top left, b: top right, c: bottom left, d: bottom right
 169 |        |  247686 |   svuint32_t a = load_source(x0, y0);
 170 |        |  247686 |   svuint32_t b = load_source(x1, y0);
 171 |        |  247686 |   svuint32_t c = load_source(x0, y1);
 172 |        |  247686 |   svuint32_t d = load_source(x1, y1);
 173 |        |         |   if constexpr (Channels == 1) {
 174 |        |  460412 |     return lerp_2d(a, b, c, d);
 175 |        |         |   }
 176 |        |         |   if constexpr (Channels == 2) {
 177 |        |         |     // Channel 0
 178 |        |   34960 |     svuint32_t res32_0 = lerp_2d(
 179 |        |   17480 |         svmovlb(svreinterpret_u16_u32(a)), svmovlb(svreinterpret_u16_u32(b)),
 180 |        |   17480 |         svmovlb(svreinterpret_u16_u32(c)), svmovlb(svreinterpret_u16_u32(d)));
 181 |        |         |     // Channel 1
 182 |        |   34960 |     svuint32_t res32_1 = lerp_2d(
 183 |        |   17480 |         svmovlt(svreinterpret_u16_u32(a)), svmovlt(svreinterpret_u16_u32(b)),
 184 |        |   17480 |         svmovlt(svreinterpret_u16_u32(c)), svmovlt(svreinterpret_u16_u32(d)));
 185 |        |         |
 186 |        |   52440 |     return svreinterpret_u32_u16(svtrn1_u16(svreinterpret_u16_u32(res32_0),
 187 |        |   17480 |                                             svreinterpret_u16_u32(res32_1)));
 188 |        |   17480 |   }
 189 |        |  247686 | }
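
calculate_linear_replicated_border clamps the integer sample coordinates to the image, forces the fractional weights to zero for out-of-range inputs so that border pixels are simply replicated, and blends the four neighbours with two horizontal lerps and one vertical lerp before rounding. A minimal scalar sketch of the single-channel case, assuming a non-empty uint8_t image; bilinear_replicate_scalar and its parameters are illustrative names only.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

inline uint32_t bilinear_replicate_scalar(const uint8_t *src, size_t stride,
                                          size_t width, size_t height,
                                          float xf, float yf) {
  float xmax = static_cast<float>(width - 1);
  float ymax = static_cast<float>(height - 1);
  // Integer part, clamped to the image (negative inputs saturate to 0).
  uint32_t x0 = static_cast<uint32_t>(std::min(std::max(xf, 0.0F), xmax));
  uint32_t y0 = static_cast<uint32_t>(std::min(std::max(yf, 0.0F), ymax));
  // Fractional weights are zeroed outside the image, so the border pixel is
  // replicated instead of being blended with a missing neighbour.
  bool x_in_range = xf >= 0.0F && xf < xmax;
  bool y_in_range = yf >= 0.0F && yf < ymax;
  float xfrac = x_in_range ? xf - std::floor(xf) : 0.0F;
  float yfrac = y_in_range ? yf - std::floor(yf) : 0.0F;
  uint32_t x1 = x_in_range ? x0 + 1 : x0;
  uint32_t y1 = y_in_range ? y0 + 1 : y0;
  auto at = [&](uint32_t x, uint32_t y) {
    return static_cast<float>(src[static_cast<size_t>(y) * stride + x]);
  };
  // Two horizontal lerps, one vertical lerp, round to nearest (see lerp_2d).
  float line0 = at(x0, y0) + (at(x1, y0) - at(x0, y0)) * xfrac;
  float line1 = at(x0, y1) + (at(x1, y1) - at(x0, y1)) * xfrac;
  float result = line0 + (line1 - line0) * yfrac;
  return static_cast<uint32_t>(result + 0.5F);
}
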
 190 |        |         |
 191 |        |         | template <typename ScalarType, bool IsLarge>
 192 |        |  473152 | svuint32_t get_pixels_or_border(svbool_t pg, svuint32_t x, svuint32_t y,
 193 |        |         |                                 svuint32_t sv_border, svuint32_t sv_xmax,
 194 |        |         |                                 svuint32_t sv_ymax, svuint32_t sv_src_stride,
 195 |        |         |                                 Rows<const ScalarType> &src_rows) {
 196 |        |  946304 |   svbool_t in_range =
 197 |        |  473152 |       svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax));
 198 |        |  946304 |   svuint32_t result =
 199 |        |  473152 |       load_xy<ScalarType, IsLarge>(in_range, x, y, sv_src_stride, src_rows);
 200 |        |         |   // Select between source pixels and border colour
 201 |        |  946304 |   return svsel_u32(in_range, result, sv_border);
 202 |        |  473152 | }
 203 |        |         |
 204 |        |         | template <typename ScalarType, bool IsLarge>
 205 |        |   69920 | svuint32_t get_pixels_or_border_2ch(svbool_t pg, svuint32_t x, svuint32_t y,
 206 |        |         |                                     svuint32_t sv_border, svuint32_t sv_xmax,
 207 |        |         |                                     svuint32_t sv_ymax,
 208 |        |         |                                     svuint32_t sv_src_stride,
 209 |        |         |                                     Rows<const ScalarType> &src_rows,
 210 |        |         |                                     svuint8_t load_table) {
 211 |        |  139840 |   svbool_t in_range =
 212 |        |   69920 |       svand_b_z(pg, svcmple_u32(pg, x, sv_xmax), svcmple_u32(pg, y, sv_ymax));
 213 |        |  139840 |   svuint32_t result = load_xy_2ch<ScalarType, IsLarge>(
 214 |        |   69920 |       in_range, x, y, sv_src_stride, src_rows, load_table);
 215 |        |         |   // Select between source pixels and border colour
 216 |        |  139840 |   return svsel_u32(in_range, result, sv_border);
 217 |        |   69920 | }
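
Both helpers above rely on unsigned comparison for the range test: a negative coordinate reinterpreted as uint32_t becomes a very large value, fails x <= xmax, and the border colour is selected for that lane. A scalar equivalent of the per-lane decision; pixel_or_border_scalar is an illustrative name only.

#include <cstddef>
#include <cstdint>

inline uint32_t pixel_or_border_scalar(const uint8_t *src, size_t stride,
                                       uint32_t x, uint32_t y, uint32_t xmax,
                                       uint32_t ymax, uint32_t border) {
  // Unsigned compare: out-of-range (including "negative") coordinates fail.
  bool in_range = (x <= xmax) && (y <= ymax);
  return in_range ? src[static_cast<size_t>(y) * stride + x] : border;
}
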
 218 |        |         |
 219 |        |         | template <typename ScalarType, bool IsLarge, size_t Channels>
 220 |        |  135768 | svuint32_t inline calculate_linear_constant_border(
 221 |        |         |     svbool_t pg, svfloat32x2_t coords, svuint32_t sv_border, svuint32_t sv_xmax,
 222 |        |         |     svuint32_t sv_ymax, svuint32_t sv_src_stride,
 223 |        |         |     Rows<const ScalarType> &src_rows, svuint8_t load_table_2ch) {
 224 |        |  135768 |   svbool_t pg_all32 = svptrue_b32();
 225 |        |         |
 226 |        |  678840 |   auto load_source = [&](svuint32_t x, svuint32_t y) {
 227 |        |         |     if constexpr (Channels == 1) {
 228 |        |  473152 |       return get_pixels_or_border<ScalarType, IsLarge>(
 229 |        |  473152 |           pg, x, y, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows);
 230 |        |         |     }
 231 |        |         |     if constexpr (Channels == 2) {
 232 |        |   69920 |       return get_pixels_or_border_2ch<ScalarType, IsLarge>(
 233 |        |   69920 |           pg, x, y, sv_border, sv_xmax, sv_ymax, sv_src_stride, src_rows,
 234 |        |   69920 |           load_table_2ch);
 235 |        |         |     }
 236 |        |         |   };
 237 |        |         |
 238 |        |         |   // Convert coordinates to integers, truncating towards minus infinity.
 239 |        |         |   // Negative numbers will become large positive numbers.
 240 |        |         |   // Since the source width and height is known to be <=2^24 these large
 241 |        |         |   // positive numbers will always be treated as outside the source image
 242 |        |         |   // bounds.
 243 |        |  135768 |   svuint32_t x0, y0, x1, y1;
 244 |        |  135768 |   svfloat32_t xfrac, yfrac;
 245 |        |         |   {
 246 |        |  135768 |     svfloat32_t xf = svget2(coords, 0);
 247 |        |  135768 |     svfloat32_t yf = svget2(coords, 1);
 248 |        |  135768 |     svfloat32_t xf0 = svrintm_f32_x(pg, xf);
 249 |        |  135768 |     svfloat32_t yf0 = svrintm_f32_x(pg, yf);
 250 |        |  135768 |     x0 = svreinterpret_u32_s32(svcvt_s32_f32_x(pg, xf0));
 251 |        |  135768 |     y0 = svreinterpret_u32_s32(svcvt_s32_f32_x(pg, yf0));
 252 |        |  135768 |     x1 = svadd_u32_x(pg, x0, svdup_n_u32(1));
 253 |        |  135768 |     y1 = svadd_u32_x(pg, y0, svdup_n_u32(1));
 254 |        |         |
 255 |        |  135768 |     xfrac = svsub_f32_x(pg, xf, xf0);
 256 |        |  135768 |     yfrac = svsub_f32_x(pg, yf, yf0);
 257 |        |  135768 |   }
 258 |        |         |
 259 |        |  289016 |   auto lerp_2d = [&](svuint32_t ai, svuint32_t bi, svuint32_t ci,
 260 |        |         |                      svuint32_t di) {
 261 |        |  153248 |     svfloat32_t a = svcvt_f32_u32_x(pg_all32, ai);
 262 |        |  153248 |     svfloat32_t b = svcvt_f32_u32_x(pg_all32, bi);
 263 |        |  306496 |     svfloat32_t line0 =
 264 |        |  153248 |         svmla_f32_x(pg_all32, a, svsub_f32_x(pg_all32, b, a), xfrac);
 265 |        |  153248 |     svfloat32_t c = svcvt_f32_u32_x(pg_all32, ci);
 266 |        |  153248 |     svfloat32_t d = svcvt_f32_u32_x(pg_all32, di);
 267 |        |  306496 |     svfloat32_t line1 =
 268 |        |  153248 |         svmla_f32_x(pg_all32, c, svsub_f32_x(pg_all32, d, c), xfrac);
 269 |        |  306496 |     svfloat32_t result = svmla_f32_x(
 270 |        |  153248 |         pg_all32, line0, svsub_f32_x(pg_all32, line1, line0), yfrac);
 271 |        |  306496 |     return svcvt_u32_f32_x(pg_all32, svadd_n_f32_x(pg_all32, result, 0.5F));
 272 |        |  153248 |   };
 273 |        |         |
 274 |        |         |   // Calculate offsets from coordinates (y * stride + x)
 275 |        |         |   // a: top left, b: top right, c: bottom left, d: bottom right
 276 |        |  135768 |   svuint32_t a = load_source(x0, y0);
 277 |        |  135768 |   svuint32_t b = load_source(x1, y0);
 278 |        |  135768 |   svuint32_t c = load_source(x0, y1);
 279 |        |  135768 |   svuint32_t d = load_source(x1, y1);
 280 |        |         |   if constexpr (Channels == 1) {
 281 |        |  236576 |     return lerp_2d(a, b, c, d);
 282 |        |         |   }
 283 |        |         |   if constexpr (Channels == 2) {
 284 |        |         |     // Channel 0
 285 |        |   34960 |     svuint32_t res32_0 = lerp_2d(
 286 |        |   17480 |         svmovlb(svreinterpret_u16_u32(a)), svmovlb(svreinterpret_u16_u32(b)),
 287 |        |   17480 |         svmovlb(svreinterpret_u16_u32(c)), svmovlb(svreinterpret_u16_u32(d)));
 288 |        |         |     // Channel 1
 289 |        |   34960 |     svuint32_t res32_1 = lerp_2d(
 290 |        |   17480 |         svmovlt(svreinterpret_u16_u32(a)), svmovlt(svreinterpret_u16_u32(b)),
 291 |        |   17480 |         svmovlt(svreinterpret_u16_u32(c)), svmovlt(svreinterpret_u16_u32(d)));
 292 |        |         |
 293 |        |   52440 |     return svreinterpret_u32_u16(svtrn1_u16(svreinterpret_u16_u32(res32_0),
 294 |        |   17480 |                                             svreinterpret_u16_u32(res32_1)));
 295 |        |   17480 |   }
 296 |        |  135768 | }
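
The constant-border path floors both coordinates to signed integers, reinterprets them as unsigned so that out-of-range neighbours fail the bounds test inside load_source (the source dimensions are at most 2^24, so wrapped negatives can never alias a valid coordinate), and then applies the same two-stage lerp with rounding. A scalar sketch of the single-channel case; bilinear_constant_scalar is an illustrative name only.

#include <cmath>
#include <cstddef>
#include <cstdint>

inline uint32_t bilinear_constant_scalar(const uint8_t *src, size_t stride,
                                         uint32_t xmax, uint32_t ymax,
                                         uint32_t border, float xf, float yf) {
  // Truncate towards minus infinity; negative coordinates wrap to large
  // unsigned values and always fail the bounds test in sample() below.
  uint32_t x0 = static_cast<uint32_t>(static_cast<int32_t>(std::floor(xf)));
  uint32_t y0 = static_cast<uint32_t>(static_cast<int32_t>(std::floor(yf)));
  uint32_t x1 = x0 + 1;
  uint32_t y1 = y0 + 1;
  float xfrac = xf - std::floor(xf);
  float yfrac = yf - std::floor(yf);
  // Out-of-range neighbours contribute the constant border colour.
  auto sample = [&](uint32_t x, uint32_t y) -> float {
    bool in_range = (x <= xmax) && (y <= ymax);
    return static_cast<float>(
        in_range ? src[static_cast<size_t>(y) * stride + x] : border);
  };
  // Same blend as lerp_2d: two horizontal lerps, one vertical, then round.
  float line0 = sample(x0, y0) + (sample(x1, y0) - sample(x0, y0)) * xfrac;
  float line1 = sample(x0, y1) + (sample(x1, y1) - sample(x0, y1)) * xfrac;
  float result = line0 + (line1 - line0) * yfrac;
  return static_cast<uint32_t>(result + 0.5F);
}
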
 297 |        |         |
 298 |        |         | }  // namespace kleidicv::sve2
 299 |        |         |