KleidiCV Coverage Report

Directory:	./
File:	kleidicv/src/resize/resize_linear_sc.h
Date:	2025-09-25 14:13:34
	Exec	Total	Coverage
Lines:	842	842	100.0%
Functions:	64	70	91.4%
Branches:	129	136	94.9%
  
      Line
      Branch
      Exec
      Source
    
      // SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
    
      //
    
      // SPDX-License-Identifier: Apache-2.0
    
      #ifndef KLEIDICV_RESIZE_LINEAR_SC_H
    
      #define KLEIDICV_RESIZE_LINEAR_SC_H
    
      #include <cassert>
    
      #include "kleidicv/kleidicv.h"
    
      #include "kleidicv/sve2.h"
    
      namespace KLEIDICV_TARGET_NAMESPACE {
    
      162
      KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_u8_sc(
    
          const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
    
          size_t y_begin, size_t y_end, uint8_t *dst,
    
          size_t dst_stride) KLEIDICV_STREAMING {
    
      162
        size_t dst_width = src_width * 2;
    
      162
        size_t dst_height = src_height * 2;
    
      3098
        auto lerp1d_vector = [](svuint8_t near, svuint8_t far) KLEIDICV_STREAMING {
    
          // near * 3
    
      2936
          svuint16_t near3b = svmullb(near, uint8_t{3});
    
      2936
          svuint16_t near3t = svmullt(near, uint8_t{3});
    
          // near * 3 + far
    
      2936
          svuint16_t near3_far_b = svaddwb(near3b, far);
    
      2936
          svuint16_t near3_far_t = svaddwt(near3t, far);
    
          // near * 3 + far + 2
    
      2936
          svuint16_t near3_far_2b = svaddwb(near3_far_b, uint8_t{2});
    
      2936
          svuint16_t near3_far_2t = svaddwt(near3_far_t, uint8_t{2});
    
          // (near * 3 + far + 2) / 4
    
      2936
          svuint8_t near3_far_2_div4 = svshrnb_n_u16(near3_far_2b, 2);
    
      2936
          near3_far_2_div4 = svshrnt_n_u16(near3_far_2_div4, near3_far_2t, 2);
    
      5872
          return near3_far_2_div4;
    
      2936
        };
    
      4370
        auto lerp2d_vector = [](svbool_t pg, svuint8_t near, svuint8_t mid_a,
    
                                svuint8_t mid_b, svuint8_t far) KLEIDICV_STREAMING {
    
          // near * 9
    
      4208
          svuint16_t near9b = svmullb(near, uint8_t{9});
    
      4208
          svuint16_t near9t = svmullt(near, uint8_t{9});
    
          // mid_a + mid_b
    
      4208
          svuint16_t midb = svaddlb(mid_a, mid_b);
    
      4208
          svuint16_t midt = svaddlt(mid_a, mid_b);
    
          // near * 9 + (mid_a + mid_b) * 3
    
      4208
          svuint16_t near9_mid3b = svmla_x(pg, near9b, midb, uint16_t{3});
    
      4208
          svuint16_t near9_mid3t = svmla_x(pg, near9t, midt, uint16_t{3});
    
          // near * 9 + (mid_a + mid_b) * 3 + far
    
      4208
          svuint16_t near9_mid3_far_b = svaddwb(near9_mid3b, far);
    
      4208
          svuint16_t near9_mid3_far_t = svaddwt(near9_mid3t, far);
    
          // near * 9 + (mid_a + mid_b) * 3 + far + 8
    
      4208
          svuint16_t near9_mid3_far_8b = svaddwb(near9_mid3_far_b, uint8_t{8});
    
      4208
          svuint16_t near9_mid3_far_8t = svaddwt(near9_mid3_far_t, uint8_t{8});
    
          // (near * 9 + (mid_a + mid_b) * 3 + far + 8) / 16
    
      4208
          svuint8_t near9_mid3_far_8_div16 = svshrnb_n_u16(near9_mid3_far_8b, 4);
    
      4208
          near9_mid3_far_8_div16 =
    
      4208
              svshrnt_n_u16(near9_mid3_far_8_div16, near9_mid3_far_8t, 4);
    
      8416
          return near9_mid3_far_8_div16;
    
      4208
        };
    
        // Handle top or bottom edge
    
      480
        auto process_edge_row = [src_width, dst_width, lerp1d_vector](
    
                                    const uint8_t *src_row,
    
                                    uint8_t *dst_row) KLEIDICV_STREAMING {
    
          // Left element
    
      318
          dst_row[0] = src_row[0];
    
          // Right element
    
      318
          dst_row[dst_width - 1] = src_row[src_width - 1];
    
        2/2✓ Branch 0 taken 318 times.
✓ Branch 1 taken 580 times.

      898
          for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) {
    
      580
            size_t dst_x = src_x * 2 + 1;
    
      580
            svbool_t pg = svwhilelt_b8_u64(src_x + 1, src_width);
    
      580
            svuint8_t src_left = svld1_u8(pg, src_row + src_x);
    
      580
            svuint8_t src_right = svld1_u8(pg, src_row + src_x + 1);
    
      580
            svuint8_t dst_left = lerp1d_vector(src_left, src_right);
    
      580
            svuint8_t dst_right = lerp1d_vector(src_right, src_left);
    
      580
            svst2_u8(pg, dst_row + dst_x, svcreate2(dst_left, dst_right));
    
      580
          }
    
      318
        };
    
      606
        auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector](
    
                               const uint8_t *src_row0, const uint8_t *src_row1,
    
                               uint8_t *dst_row0,
    
                               uint8_t *dst_row1) KLEIDICV_STREAMING {
    
          // Left elements
    
      444
          svbool_t pg1 = svptrue_pat_b8(SV_VL1);  // read/write 1 element
    
          {
    
      444
            svuint8_t s0 = svld1(pg1, src_row0);
    
      444
            svuint8_t s1 = svld1(pg1, src_row1);
    
      444
            svst1(pg1, dst_row0, lerp1d_vector(s0, s1));
    
      444
            svst1(pg1, dst_row1, lerp1d_vector(s1, s0));
    
      444
          }
    
          // Middle elements
    
        2/2✓ Branch 0 taken 444 times.
✓ Branch 1 taken 1052 times.

      1496
          for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) {
    
      1052
            size_t dst_x = src_x * 2 + 1;
    
      1052
            svbool_t pg = svwhilelt_b8_u64(src_x + 1, src_width);
    
      1052
            svuint8_t src_tl = svld1_u8(pg, src_row0 + src_x);
    
      1052
            svuint8_t src_tr = svld1_u8(pg, src_row0 + src_x + 1);
    
      1052
            svuint8_t src_bl = svld1_u8(pg, src_row1 + src_x);
    
      1052
            svuint8_t src_br = svld1_u8(pg, src_row1 + src_x + 1);
    
      1052
            svuint8_t dst_tl = lerp2d_vector(pg, src_tl, src_tr, src_bl, src_br);
    
      1052
            svuint8_t dst_tr = lerp2d_vector(pg, src_tr, src_tl, src_br, src_bl);
    
      1052
            svuint8_t dst_bl = lerp2d_vector(pg, src_bl, src_tl, src_br, src_tr);
    
      1052
            svuint8_t dst_br = lerp2d_vector(pg, src_br, src_tr, src_bl, src_tl);
    
      1052
            svst2_u8(pg, dst_row0 + dst_x, svcreate2(dst_tl, dst_tr));
    
      1052
            svst2_u8(pg, dst_row1 + dst_x, svcreate2(dst_bl, dst_br));
    
      1052
          }
    
          // Right elements
    
      444
          svuint8_t s0 = svld1(pg1, src_row0 + src_width - 1);
    
      444
          svuint8_t s1 = svld1(pg1, src_row1 + src_width - 1);
    
      444
          svst1(pg1, dst_row0 + dst_width - 1, lerp1d_vector(s0, s1));
    
      444
          svst1(pg1, dst_row1 + dst_width - 1, lerp1d_vector(s1, s0));
    
      444
        };
    
        // Top row
    
        2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 159 times.

      162
        if (KLEIDICV_LIKELY(y_begin == 0)) {
    
      159
          process_edge_row(src, dst);
    
      159
        }
    
        // Middle rows
    
        2/2✓ Branch 0 taken 444 times.
✓ Branch 1 taken 162 times.

      606
        for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
    
      444
          size_t dst_y = src_y * 2 + 1;
    
      444
          const uint8_t *src_row0 = src + src_stride * src_y;
    
      444
          const uint8_t *src_row1 = src_row0 + src_stride;
    
      444
          uint8_t *dst_row0 = dst + dst_stride * dst_y;
    
      444
          uint8_t *dst_row1 = dst_row0 + dst_stride;
    
      444
          process_row(src_row0, src_row1, dst_row0, dst_row1);
    
      444
        }
    
        // Bottom row
    
        2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 159 times.

      162
        if (KLEIDICV_LIKELY(y_end == src_height)) {
    
      318
          process_edge_row(src + src_stride * (src_height - 1),
    
      159
                           dst + dst_stride * (dst_height - 1));
    
      159
        }
    
      162
        return KLEIDICV_OK;
    
      162
      }
    
      102
      KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_u8_sc(
    
          const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
    
          size_t y_begin, size_t y_end, uint8_t *dst,
    
          size_t dst_stride) KLEIDICV_STREAMING {
    
      102
        size_t dst_width = src_width * 4;
    
      102
        size_t dst_height = src_height * 4;
    
      5110
        auto lerp1d_vector = [](uint8_t p, svuint8_t a, uint8_t q, svuint8_t b)
    
                                 KLEIDICV_STREAMING {
    
                                   // bias
    
      5008
                                   svuint16_t top = svdup_u16(4);
    
                                   // bias + a * p
    
      5008
                                   svuint16_t bot = svmlalb(top, a, p);
    
      5008
                                   top = svmlalt(top, a, p);
    
                                   // bias + a * p + b * q
    
      5008
                                   bot = svmlalb(bot, b, q);
    
      5008
                                   top = svmlalt(top, b, q);
    
                                   // (bias + a * p + b * q) / 8
    
      5008
                                   svuint8_t result = svshrnb(bot, 3ULL);
    
      5008
                                   result = svshrnt(result, top, 3ULL);
    
      10016
                                   return result;
    
      5008
                                 };
    
      15974
        auto lerp2d_vector = [](uint8_t p, svuint8_t a, uint8_t q, svuint8_t b,
    
                                uint8_t r, svuint8_t c, uint8_t s,
    
                                svuint8_t d) KLEIDICV_STREAMING {
    
          // bias
    
      15872
          svuint16_t top = svdup_u16(32);
    
          // bias + a * p
    
      15872
          svuint16_t bot = svmlalb(top, a, p);
    
      15872
          top = svmlalt(top, a, p);
    
          // bias + a * p + b * q
    
      15872
          bot = svmlalb(bot, b, q);
    
      15872
          top = svmlalt(top, b, q);
    
          // bias + a * p + b * q + c * r
    
      15872
          bot = svmlalb(bot, c, r);
    
      15872
          top = svmlalt(top, c, r);
    
          // bias + a * p + b * q + c * r + d * s
    
      15872
          bot = svmlalb(bot, d, s);
    
      15872
          top = svmlalt(top, d, s);
    
          // (bias + a * p + b * q + c * r + d * s) / 64
    
      15872
          svuint8_t result = svshrnt(svshrnb(bot, 6ULL), top, 6ULL);
    
      31744
          return result;
    
      15872
        };
    
        // Handle top or bottom edge
    
      300
        auto process_edge_row = [src_width, dst_width, lerp1d_vector](
    
                                    const uint8_t *src_row,
    
                                    uint8_t *dst_row) KLEIDICV_STREAMING {
    
          // Left elements
    
      198
          dst_row[1] = dst_row[0] = src_row[0];
    
          // Right elements
    
      198
          dst_row[dst_width - 1] = dst_row[dst_width - 2] = src_row[src_width - 1];
    
          // Middle elements
    
        2/2✓ Branch 0 taken 198 times.
✓ Branch 1 taken 484 times.

      682
          for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) {
    
      484
            size_t dst_x = src_x * 4 + 2;
    
      484
            svbool_t pg = svwhilelt_b8_u64(src_x + 1, src_width);
    
      484
            svuint8_t a = svld1_u8(pg, src_row + src_x);
    
      484
            svuint8_t b = svld1_u8(pg, src_row + src_x + 1);
    
      968
            svst4_u8(pg, dst_row + dst_x,
    
      968
                     svcreate4(lerp1d_vector(7, a, 1, b), lerp1d_vector(5, a, 3, b),
    
      484
                               lerp1d_vector(3, a, 5, b), lerp1d_vector(1, a, 7, b)));
    
      484
          }
    
      198
        };
    
      486
        auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector](
    
                               const uint8_t *src_row0, const uint8_t *src_row1,
    
                               uint8_t *dst_row0, uint8_t *dst_row1,
    
                               uint8_t *dst_row2,
    
                               uint8_t *dst_row3) KLEIDICV_STREAMING {
    
          // Left elements
    
      384
          svbool_t pg1 = svptrue_pat_b8(SV_VL1);  // read 1 element
    
      384
          svbool_t pg2 = svptrue_pat_b8(SV_VL2);  // write 2 elements
    
          {
    
      384
            svuint8_t s0 = svdup_lane(svld1(pg1, src_row0), 0);
    
      384
            svuint8_t s1 = svdup_lane(svld1(pg1, src_row1), 0);
    
      384
            svst1(pg2, dst_row0, lerp1d_vector(7, s0, 1, s1));
    
      384
            svst1(pg2, dst_row1, lerp1d_vector(5, s0, 3, s1));
    
      384
            svst1(pg2, dst_row2, lerp1d_vector(3, s0, 5, s1));
    
      384
            svst1(pg2, dst_row3, lerp1d_vector(1, s0, 7, s1));
    
      384
          }
    
          // Middle elements
    
        2/2✓ Branch 0 taken 384 times.
✓ Branch 1 taken 992 times.

      1376
          for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntb()) {
    
      992
            size_t dst_x = src_x * 4 + 2;
    
      992
            svbool_t pg = svwhilelt_b8_u64(src_x + 1, src_width);
    
      992
            svuint8_t a = svld1_u8(pg, src_row0 + src_x);
    
      992
            svuint8_t b = svld1_u8(pg, src_row0 + src_x + 1);
    
      992
            svuint8_t c = svld1_u8(pg, src_row1 + src_x);
    
      992
            svuint8_t d = svld1_u8(pg, src_row1 + src_x + 1);
    
      1984
            svst4_u8(pg, dst_row0 + dst_x,
    
      1984
                     (svcreate4(lerp2d_vector(49, a, 7, b, 7, c, 1, d),
    
      992
                                lerp2d_vector(35, a, 21, b, 5, c, 3, d),
    
      992
                                lerp2d_vector(21, a, 35, b, 3, c, 5, d),
    
      992
                                lerp2d_vector(49, b, 7, a, 7, d, 1, c))));
    
      1984
            svst4_u8(pg, dst_row1 + dst_x,
    
      1984
                     (svcreate4(lerp2d_vector(35, a, 5, b, 21, c, 3, d),
    
      992
                                lerp2d_vector(25, a, 15, b, 15, c, 9, d),
    
      992
                                lerp2d_vector(15, a, 25, b, 9, c, 15, d),
    
      992
                                lerp2d_vector(5, a, 35, b, 3, c, 21, d))));
    
      1984
            svst4_u8(pg, dst_row2 + dst_x,
    
      1984
                     (svcreate4(lerp2d_vector(21, a, 3, b, 35, c, 5, d),
    
      992
                                lerp2d_vector(15, a, 9, b, 25, c, 15, d),
    
      992
                                lerp2d_vector(9, a, 15, b, 15, c, 25, d),
    
      992
                                lerp2d_vector(3, a, 21, b, 5, c, 35, d))));
    
      1984
            svst4_u8(pg, dst_row3 + dst_x,
    
      1984
                     (svcreate4(lerp2d_vector(49, c, 7, a, 7, d, 1, b),
    
      992
                                lerp2d_vector(5, a, 3, b, 35, c, 21, d),
    
      992
                                lerp2d_vector(3, a, 5, b, 21, c, 35, d),
    
      992
                                lerp2d_vector(49, d, 7, b, 7, c, 1, a))));
    
      992
          }
    
          // Right elements
    
      384
          svuint8_t s0 = svdup_lane(svld1(pg1, src_row0 + src_width - 1), 0);
    
      384
          svuint8_t s1 = svdup_lane(svld1(pg1, src_row1 + src_width - 1), 0);
    
      384
          svst1(pg2, dst_row0 + dst_width - 2, lerp1d_vector(7, s0, 1, s1));
    
      384
          svst1(pg2, dst_row1 + dst_width - 2, lerp1d_vector(5, s0, 3, s1));
    
      384
          svst1(pg2, dst_row2 + dst_width - 2, lerp1d_vector(3, s0, 5, s1));
    
      384
          svst1(pg2, dst_row3 + dst_width - 2, lerp1d_vector(1, s0, 7, s1));
    
      384
        };
    
      300
        auto copy_dst_row = [src_width](const uint8_t *dst_from,
    
                                        uint8_t *dst_to) KLEIDICV_STREAMING {
    
        2/2✓ Branch 0 taken 198 times.
✓ Branch 1 taken 550 times.

      748
          for (size_t i = 0; i < src_width; i += svcntb()) {
    
      550
            svbool_t pg = svwhilelt_b8_u64(i, src_width);
    
      550
            svst4(pg, dst_to + i * 4, svld4(pg, dst_from + i * 4));
    
      550
          }
    
      198
        };
    
        // Top rows
    
        2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 99 times.

      102
        if (KLEIDICV_LIKELY(y_begin == 0)) {
    
      99
          process_edge_row(src, dst);
    
      99
          copy_dst_row(dst, dst + dst_stride);
    
      99
        }
    
        // Middle rows
    
        2/2✓ Branch 0 taken 384 times.
✓ Branch 1 taken 102 times.

      486
        for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
    
      384
          size_t dst_y = src_y * 4 + 2;
    
      384
          const uint8_t *src_row0 = src + src_stride * src_y;
    
      384
          const uint8_t *src_row1 = src_row0 + src_stride;
    
      384
          uint8_t *dst_row0 = dst + dst_stride * dst_y;
    
      384
          uint8_t *dst_row1 = dst_row0 + dst_stride;
    
      384
          uint8_t *dst_row2 = dst_row1 + dst_stride;
    
      384
          uint8_t *dst_row3 = dst_row2 + dst_stride;
    
      384
          process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3);
    
      384
        }
    
        // Bottom rows
    
        2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 99 times.

      102
        if (KLEIDICV_LIKELY(y_end == src_height)) {
    
      198
          process_edge_row(src + src_stride * (src_height - 1),
    
      99
                           dst + dst_stride * (dst_height - 2));
    
      198
          copy_dst_row(dst + dst_stride * (dst_height - 2),
    
      99
                       dst + dst_stride * (dst_height - 1));
    
      99
        }
    
      102
        return KLEIDICV_OK;
    
      102
      }
    
      150
      KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_2x2_f32_sc(
    
          const float *src, size_t src_stride, size_t src_width, size_t src_height,
    
          size_t y_begin, size_t y_end, float *dst,
    
          size_t dst_stride) KLEIDICV_STREAMING {
    
      150
        size_t dst_width = src_width * 2;
    
      150
        src_stride /= sizeof(float);
    
      150
        dst_stride /= sizeof(float);
    
      5206
        auto lerp1d_vector = [](svbool_t pg, svfloat32_t near,
    
                                svfloat32_t far) KLEIDICV_STREAMING {
    
      5056
          return svmla_n_f32_x(pg, svmul_n_f32_x(pg, near, 0.75F), far, 0.25F);
    
        };
    
      13102
        auto lerp2d_vector = [](svbool_t pg, svfloat32_t near, svfloat32_t mid_a,
    
                                svfloat32_t mid_b,
    
                                svfloat32_t far) KLEIDICV_STREAMING {
    
      12952
          return svmla_n_f32_x(
    
      12952
              pg,
    
      12952
              svmla_n_f32_x(
    
      12952
                  pg,
    
      12952
                  svmla_n_f32_x(pg, svmul_n_f32_x(pg, near, 0.5625F), mid_a, 0.1875F),
    
      12952
                  mid_b, 0.1875F),
    
      12952
              far, 0.0625F);
    
        };
    
        // Handle top or bottom edge
    
      444
        auto process_edge_row = [src_width, dst_width, lerp1d_vector](
    
                                    const float *src_row,
    
                                    float *dst_row) KLEIDICV_STREAMING {
    
          // Left element
    
      294
          dst_row[0] = src_row[0];
    
          // Middle elements
    
        2/2✓ Branch 0 taken 294 times.
✓ Branch 1 taken 1640 times.

      1934
          for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw()) {
    
      1640
            size_t dst_x = src_x * 2 + 1;
    
      1640
            svbool_t pg = svwhilelt_b32_u64(src_x + 1, src_width);
    
      1640
            svfloat32_t a = svld1_f32(pg, src_row + src_x);
    
      1640
            svfloat32_t b = svld1_f32(pg, src_row + src_x + 1);
    
      3280
            svst2_f32(pg, dst_row + dst_x,
    
      1640
                      svcreate2(lerp1d_vector(pg, a, b), lerp1d_vector(pg, b, a)));
    
      1640
          }
    
          // Right element
    
      294
          dst_row[dst_width - 1] = src_row[src_width - 1];
    
      294
        };
    
      594
        auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector](
    
                               const float *src_row0, const float *src_row1,
    
                               float *dst_row0, float *dst_row1) KLEIDICV_STREAMING {
    
          // Left elements
    
      444
          svbool_t pg1 = svptrue_pat_b32(SV_VL1);  // read/write 1 element
    
          {
    
      444
            svfloat32_t s0 = svld1(pg1, src_row0);
    
      444
            svfloat32_t s1 = svld1(pg1, src_row1);
    
      444
            svst1(pg1, dst_row0, lerp1d_vector(pg1, s0, s1));
    
      444
            svst1(pg1, dst_row1, lerp1d_vector(pg1, s1, s0));
    
      444
          }
    
          // Middle elements
    
        2/2✓ Branch 0 taken 444 times.
✓ Branch 1 taken 3238 times.

      3682
          for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw()) {
    
      3238
            size_t dst_x = src_x * 2 + 1;
    
      3238
            svbool_t pg = svwhilelt_b32_u64(src_x + 1, src_width);
    
      3238
            svfloat32_t a = svld1_f32(pg, src_row0 + src_x);
    
      3238
            svfloat32_t b = svld1_f32(pg, src_row0 + src_x + 1);
    
      3238
            svfloat32_t c = svld1_f32(pg, src_row1 + src_x);
    
      3238
            svfloat32_t d = svld1_f32(pg, src_row1 + src_x + 1);
    
      6476
            svst2_f32(pg, dst_row0 + dst_x,
    
      6476
                      svcreate2(lerp2d_vector(pg, a, b, c, d),
    
      3238
                                lerp2d_vector(pg, b, a, d, c)));
    
      6476
            svst2_f32(pg, dst_row1 + dst_x,
    
      6476
                      svcreate2(lerp2d_vector(pg, c, a, d, b),
    
      3238
                                lerp2d_vector(pg, d, b, c, a)));
    
      3238
          }
    
          // Right elements
    
      444
          svfloat32_t s0 = svld1(pg1, src_row0 + src_width - 1);
    
      444
          svfloat32_t s1 = svld1(pg1, src_row1 + src_width - 1);
    
      444
          svst1(pg1, dst_row0 + dst_width - 1, lerp1d_vector(pg1, s0, s1));
    
      444
          svst1(pg1, dst_row1 + dst_width - 1, lerp1d_vector(pg1, s1, s0));
    
      444
        };
    
        // Top row
    
        2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 147 times.

      150
        if (KLEIDICV_LIKELY(y_begin == 0)) {
    
      147
          process_edge_row(src, dst);
    
      147
        }
    
        // Middle rows
    
        2/2✓ Branch 0 taken 444 times.
✓ Branch 1 taken 150 times.

      594
        for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
    
      444
          size_t dst_y = src_y * 2 + 1;
    
      444
          const float *src_row0 = src + src_stride * src_y;
    
      444
          const float *src_row1 = src_row0 + src_stride;
    
      444
          float *dst_row0 = dst + dst_stride * dst_y;
    
      444
          float *dst_row1 = dst_row0 + dst_stride;
    
      444
          process_row(src_row0, src_row1, dst_row0, dst_row1);
    
      444
        }
    
        // Bottom row
    
        2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 147 times.

      150
        if (KLEIDICV_LIKELY(y_end == src_height)) {
    
      294
          process_edge_row(src + src_stride * (src_height - 1),
    
      147
                           dst + dst_stride * (src_height * 2 - 1));
    
      147
        }
    
      150
        return KLEIDICV_OK;
    
      150
      }
    
      114
      KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_4x4_f32_sc(
    
          const float *src, size_t src_stride, size_t src_width, size_t src_height,
    
          size_t y_begin, size_t y_end, float *dst,
    
          size_t dst_stride) KLEIDICV_STREAMING {
    
      114
        size_t dst_width = src_width * 4;
    
      114
        size_t dst_height = src_height * 4;
    
      114
        src_stride /= sizeof(float);
    
      114
        dst_stride /= sizeof(float);
    
      35170
        auto lerp1d_vector = [](svbool_t pg, float p, svfloat32_t a, float q,
    
                                svfloat32_t b) KLEIDICV_STREAMING {
    
      35056
          return svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q);
    
        };
    
      25634
        auto lerp2d_vector = [](svbool_t pg, float p, svfloat32_t a, float q,
    
                                svfloat32_t b, float r, svfloat32_t c, float s,
    
                                svfloat32_t d) KLEIDICV_STREAMING {
    
      25520
          return svmla_n_f32_x(
    
      25520
              pg,
    
      51040
              svmla_n_f32_x(pg, svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q), c,
    
      25520
                            r),
    
      25520
              d, s);
    
        };
    
        // Handle top or bottom edge
    
      336
        auto process_edge_row = [src_width, dst_width, dst_stride, lerp1d_vector](
    
                                    const float *src_row,
    
                                    float *dst_row) KLEIDICV_STREAMING {
    
          // Left elements
    
      222
          dst_row[1] = dst_row[0] = dst_row[dst_stride + 1] = dst_row[dst_stride] =
    
      222
              src_row[0];
    
          // Right elements
    
      222
          dst_row[dst_width - 1] = dst_row[dst_width - 2] =
    
      222
              dst_row[dst_stride + dst_width - 1] =
    
      222
                  dst_row[dst_stride + dst_width - 2] = src_row[src_width - 1];
    
          // Middle elements
    
        2/2✓ Branch 0 taken 222 times.
✓ Branch 1 taken 1592 times.

      1814
          for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw()) {
    
      1592
            size_t dst_x = src_x * 4 + 2;
    
      1592
            svbool_t pg = svwhilelt_b32_u64(src_x + 1, src_width);
    
      1592
            svfloat32_t a = svld1_f32(pg, src_row + src_x);
    
      1592
            svfloat32_t b = svld1_f32(pg, src_row + src_x + 1);
    
      3184
            svfloat32x4_t result = svcreate4(lerp1d_vector(pg, 0.875F, a, 0.125F, b),
    
      1592
                                             lerp1d_vector(pg, 0.625F, a, 0.375F, b),
    
      1592
                                             lerp1d_vector(pg, 0.375F, a, 0.625F, b),
    
      1592
                                             lerp1d_vector(pg, 0.125F, a, 0.875F, b));
    
      1592
            svst4_f32(pg, dst_row + dst_x, result);
    
      1592
            svst4_f32(pg, dst_row + dst_stride + dst_x, result);
    
      1592
          }
    
      222
        };
    
      510
        auto process_row = [src_width, dst_width, lerp1d_vector, lerp2d_vector](
    
                               const float *src_row0, const float *src_row1,
    
                               float *dst_row0, float *dst_row1, float *dst_row2,
    
                               float *dst_row3) KLEIDICV_STREAMING {
    
          // Left elements
    
      396
          svbool_t pg1 = svptrue_pat_b32(SV_VL1);  // read 1 element
    
      396
          svbool_t pg2 = svptrue_pat_b32(SV_VL2);  // write 2 elements
    
      396
          svfloat32_t s0l = svdup_lane(svld1(pg1, src_row0), 0);
    
      396
          svfloat32_t s1l = svdup_lane(svld1(pg1, src_row1), 0);
    
      396
          svst1(pg2, dst_row0, lerp1d_vector(pg2, 0.875F, s0l, 0.125F, s1l));
    
      396
          svst1(pg2, dst_row1, lerp1d_vector(pg2, 0.625F, s0l, 0.375F, s1l));
    
      396
          svst1(pg2, dst_row2, lerp1d_vector(pg2, 0.375F, s0l, 0.625F, s1l));
    
      396
          svst1(pg2, dst_row3, lerp1d_vector(pg2, 0.125F, s0l, 0.875F, s1l));
    
          // Middle elements
    
        2/2✓ Branch 0 taken 396 times.
✓ Branch 1 taken 3190 times.

      3586
          for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw()) {
    
      3190
            size_t dst_x = src_x * 4 + 2;
    
      3190
            svbool_t pg = svwhilelt_b32_u64(src_x + 1, src_width);
    
      3190
            svfloat32_t a = svld1_f32(pg, src_row0 + src_x);
    
      3190
            svfloat32_t b = svld1_f32(pg, src_row0 + src_x + 1);
    
      3190
            svfloat32_t c = svld1_f32(pg, src_row1 + src_x);
    
      3190
            svfloat32_t d = svld1_f32(pg, src_row1 + src_x + 1);
    
      6380
            svfloat32x4_t dst_a =
    
      9570
                svcreate4(lerp2d_vector(pg, 0.765625F, a, 0.109375F, b, 0.109375F, c,
    
      3190
                                        0.015625F, d),
    
      6380
                          lerp2d_vector(pg, 0.546875F, a, 0.328125F, b, 0.078125F, c,
    
      3190
                                        0.046875F, d),
    
      6380
                          lerp2d_vector(pg, 0.328125F, a, 0.546875F, b, 0.046875F, c,
    
      3190
                                        0.078125F, d),
    
      6380
                          lerp2d_vector(pg, 0.109375F, a, 0.765625F, b, 0.015625F, c,
    
      3190
                                        0.109375F, d));
    
      6380
            svfloat32x4_t dst_d =
    
      9570
                svcreate4(lerp2d_vector(pg, 0.109375F, a, 0.015625F, b, 0.765625F, c,
    
      3190
                                        0.109375F, d),
    
      6380
                          lerp2d_vector(pg, 0.078125F, a, 0.046875F, b, 0.546875F, c,
    
      3190
                                        0.328125F, d),
    
      6380
                          lerp2d_vector(pg, 0.046875F, a, 0.078125F, b, 0.328125F, c,
    
      3190
                                        0.546875F, d),
    
      6380
                          lerp2d_vector(pg, 0.015625F, a, 0.109375F, b, 0.109375F, c,
    
      3190
                                        0.765625F, d));
    
      3190
            const float one_3rd = 0.3333333333333333F;
    
      3190
            const float two_3rd = 0.6666666666666667F;
    
      3190
            svst4_f32(pg, dst_row0 + dst_x, dst_a);
    
      6380
            svst4_f32(pg, dst_row1 + dst_x,
    
      9570
                      svcreate4(lerp1d_vector(pg, two_3rd, svget4(dst_a, 0), one_3rd,
    
      3190
                                              svget4(dst_d, 0)),
    
      6380
                                lerp1d_vector(pg, two_3rd, svget4(dst_a, 1), one_3rd,
    
      3190
                                              svget4(dst_d, 1)),
    
      6380
                                lerp1d_vector(pg, two_3rd, svget4(dst_a, 2), one_3rd,
    
      3190
                                              svget4(dst_d, 2)),
    
      6380
                                lerp1d_vector(pg, two_3rd, svget4(dst_a, 3), one_3rd,
    
      3190
                                              svget4(dst_d, 3))));
    
      6380
            svst4_f32(pg, dst_row2 + dst_x,
    
      9570
                      svcreate4(lerp1d_vector(pg, one_3rd, svget4(dst_a, 0), two_3rd,
    
      3190
                                              svget4(dst_d, 0)),
    
      6380
                                lerp1d_vector(pg, one_3rd, svget4(dst_a, 1), two_3rd,
    
      3190
                                              svget4(dst_d, 1)),
    
      6380
                                lerp1d_vector(pg, one_3rd, svget4(dst_a, 2), two_3rd,
    
      3190
                                              svget4(dst_d, 2)),
    
      6380
                                lerp1d_vector(pg, one_3rd, svget4(dst_a, 3), two_3rd,
    
      3190
                                              svget4(dst_d, 3))));
    
      3190
            svst4_f32(pg, dst_row3 + dst_x, dst_d);
    
      3190
          }
    
          // Right elements
    
      396
          svfloat32_t s0r = svdup_lane(svld1(pg1, src_row0 + src_width - 1), 0);
    
      396
          svfloat32_t s1r = svdup_lane(svld1(pg1, src_row1 + src_width - 1), 0);
    
      792
          svst1(pg2, dst_row0 + dst_width - 2,
    
      396
                lerp1d_vector(pg2, 0.875F, s0r, 0.125F, s1r));
    
      792
          svst1(pg2, dst_row1 + dst_width - 2,
    
      396
                lerp1d_vector(pg2, 0.625F, s0r, 0.375F, s1r));
    
      792
          svst1(pg2, dst_row2 + dst_width - 2,
    
      396
                lerp1d_vector(pg2, 0.375F, s0r, 0.625F, s1r));
    
      792
          svst1(pg2, dst_row3 + dst_width - 2,
    
      396
                lerp1d_vector(pg2, 0.125F, s0r, 0.875F, s1r));
    
      396
        };
    
        // Top rows
    
        2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 111 times.

      114
        if (KLEIDICV_LIKELY(y_begin == 0)) {
    
      111
          process_edge_row(src, dst);
    
      111
        }
    
        // Middle rows
    
        2/2✓ Branch 0 taken 396 times.
✓ Branch 1 taken 114 times.

      510
        for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
    
      396
          size_t dst_y = src_y * 4 + 2;
    
      396
          const float *src_row0 = src + src_stride * src_y;
    
      396
          const float *src_row1 = src_row0 + src_stride;
    
      396
          float *dst_row0 = dst + dst_stride * dst_y;
    
      396
          float *dst_row1 = dst_row0 + dst_stride;
    
      396
          float *dst_row2 = dst_row1 + dst_stride;
    
      396
          float *dst_row3 = dst_row2 + dst_stride;
    
      396
          process_row(src_row0, src_row1, dst_row0, dst_row1, dst_row2, dst_row3);
    
      396
        }
    
        // Bottom rows
    
        2/2✓ Branch 0 taken 3 times.
✓ Branch 1 taken 111 times.

      114
        if (KLEIDICV_LIKELY(y_end == src_height)) {
    
      222
          process_edge_row(src + src_stride * (src_height - 1),
    
      111
                           dst + dst_stride * (dst_height - 2));
    
      111
        }
    
      114
        return KLEIDICV_OK;
    
      114
      }
    
      // Upscales a float image by 8x in both dimensions with linear interpolation.
    
      // Full vectors are loaded and stored four floats at a time, so this variant
    
      // appears to assume a 128-bit vector length, as its name suggests
    
      // - TODO confirm against the dispatcher.
    
      //
    
      // src_stride and dst_stride are given in bytes. Source rows y_begin up to
    
      // y_end - 1 are consumed; the top edge rows are written only when
    
      // y_begin == 0 and the bottom edge rows only when y_end == src_height.
    
      // Always returns KLEIDICV_OK.
    
      34
      KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve128_sc(
    
          const float *src, size_t src_stride, size_t src_width, size_t src_height,
    
          size_t y_begin, size_t y_end, float *dst,
    
          size_t dst_stride) KLEIDICV_STREAMING {
    
      34
        size_t dst_width = src_width * 8;
    
      34
        size_t dst_height = src_height * 8;
    
        // Strides arrive in bytes; convert them to element counts.
    
      34
        src_stride /= sizeof(float);
    
      34
        dst_stride /= sizeof(float);
    
        // Horizontal weights of the eight outputs lying between two neighbouring
    
        // source samples: coeffs_a weights the left sample, coeffs_b the right one.
    
      34
        float coeffs_a[] = {15 / 16.0, 13 / 16.0, 11 / 16.0, 9 / 16.0,
    
                            7 / 16.0,  5 / 16.0,  3 / 16.0,  1 / 16.0};
    
      34
        float coeffs_b[] = {1 / 16.0, 3 / 16.0,  5 / 16.0,  7 / 16.0,
    
                            9 / 16.0, 11 / 16.0, 13 / 16.0, 15 / 16.0};
    
        // Suffix 0 holds weights 0..3, suffix 1 holds weights 4..7.
    
      34
        svfloat32_t coeffs_a0 = svld1(svptrue_b32(), &coeffs_a[0]);
    
      34
        svfloat32_t coeffs_a1 = svld1(svptrue_b32(), &coeffs_a[4]);
    
      34
        svfloat32_t coeffs_b0 = svld1(svptrue_b32(), &coeffs_b[0]);
    
      34
        svfloat32_t coeffs_b1 = svld1(svptrue_b32(), &coeffs_b[4]);
    
      68
        std::reference_wrapper<svfloat32_t> coeffs_ab[4] = {coeffs_a0, coeffs_a1,
    
      68
                                                            coeffs_b0, coeffs_b1};
    
        // Returns a * p + b * q, with scalar weights p and q.
    
      111618
        auto lerp1d_vector_n = [](svbool_t pg, float p, svfloat32_t a, float q,
    
                                  svfloat32_t b) KLEIDICV_STREAMING {
    
      111584
          return svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q);
    
        };
    
        // Returns a * p + b * q, with per-lane weights p and q.
    
      8970
        auto lerp1d_vector = [](svbool_t pg, svfloat32_t p, svfloat32_t a,
    
                                svfloat32_t q, svfloat32_t b) KLEIDICV_STREAMING {
    
      8936
          return svmla_f32_x(pg, svmul_f32_x(pg, a, p), b, q);
    
        };
    
        // Handle top or bottom edge
    
        // One source row produces four identical destination rows, so only
    
        // horizontal interpolation is performed here.
    
      34
        auto process_edge_row =
    
      100
            [src_width, dst_width, lerp1d_vector](
    
                const float *src_row, float *dst_row, size_t dst_stride,
    
                std::reference_wrapper<svfloat32_t> coeffs_ab[4]) KLEIDICV_STREAMING {
    
              // Left elements
    
              // The 4x4 corner is filled with the first source sample.
    
      66
              float left = src_row[0];
    
      66
              float *dst = dst_row;
    
        2/2✓ Branch 0 taken 264 times.
✓ Branch 1 taken 66 times.

      330
              for (size_t i = 0; i < 4; ++i) {
    
      264
                *dst++ = left;
    
      264
                *dst++ = left;
    
      264
                *dst++ = left;
    
      264
                *dst = left;
    
      264
                dst += dst_stride - 3;
    
      264
              }
    
              // Middle elements
    
              // a is the left and b the right source sample of the current pair,
    
              // each broadcast to every lane.
    
      66
              svfloat32_t a, b = svdup_n_f32(src_row[0]);
    
        2/2✓ Branch 0 taken 4468 times.
✓ Branch 1 taken 66 times.

      4534
              for (size_t src_x = 0; src_x + 1 < src_width; src_x++) {
    
      4468
                a = b;
    
      4468
                b = svdup_n_f32(src_row[src_x + 1]);
    
      4468
                float *dst_row0 = dst_row + src_x * 8 + 4;
    
      4468
                float *dst_row1 = dst_row0 + dst_stride;
    
      4468
                float *dst_row2 = dst_row1 + dst_stride;
    
      4468
                float *dst_row3 = dst_row2 + dst_stride;
    
                // First four of the eight outputs for this source pair.
    
      8936
                svfloat32_t dst =
    
      4468
                    lerp1d_vector(svptrue_b32(), coeffs_ab[0], a, coeffs_ab[2], b);
    
      4468
                svst1(svptrue_b32(), dst_row0, dst);
    
      4468
                svst1(svptrue_b32(), dst_row1, dst);
    
      4468
                svst1(svptrue_b32(), dst_row2, dst);
    
      4468
                svst1(svptrue_b32(), dst_row3, dst);
    
                // Remaining four outputs, stored four columns further right.
    
      4468
                dst = lerp1d_vector(svptrue_b32(), coeffs_ab[1], a, coeffs_ab[3], b);
    
      4468
                svst1(svptrue_b32(), dst_row0 + 4, dst);
    
      4468
                svst1(svptrue_b32(), dst_row1 + 4, dst);
    
      4468
                svst1(svptrue_b32(), dst_row2 + 4, dst);
    
      4468
                svst1(svptrue_b32(), dst_row3 + 4, dst);
    
      4468
              }
    
              // Right elements
    
              // The 4x4 corner is filled with the last source sample.
    
      66
              dst = dst_row + dst_width - 4;
    
      66
              float right = src_row[src_width - 1];
    
        2/2✓ Branch 0 taken 66 times.
✓ Branch 1 taken 264 times.

      330
              for (size_t i = 0; i < 4; ++i) {
    
      264
                *dst++ = right;
    
      264
                *dst++ = right;
    
      264
                *dst++ = right;
    
      264
                *dst = right;
    
      264
                dst += dst_stride - 3;
    
      264
              }
    
      66
            };
    
        // 2D weights: the horizontal weights pre-multiplied by the vertical
    
        // weights 15/16 (p, q) and 1/16 (r, s).
    
      34
        svfloat32_t coeffs_p0 = svmul_n_f32_x(svptrue_b32(), coeffs_a0, 15.0 / 16);
    
      34
        svfloat32_t coeffs_q0 = svmul_n_f32_x(svptrue_b32(), coeffs_b0, 15.0 / 16);
    
      34
        svfloat32_t coeffs_r0 = svmul_n_f32_x(svptrue_b32(), coeffs_a0, 1.0 / 16);
    
      34
        svfloat32_t coeffs_s0 = svmul_n_f32_x(svptrue_b32(), coeffs_b0, 1.0 / 16);
    
      34
        svfloat32_t coeffs_p1 = svmul_n_f32_x(svptrue_b32(), coeffs_a1, 15.0 / 16);
    
      34
        svfloat32_t coeffs_q1 = svmul_n_f32_x(svptrue_b32(), coeffs_b1, 15.0 / 16);
    
      34
        svfloat32_t coeffs_r1 = svmul_n_f32_x(svptrue_b32(), coeffs_a1, 1.0 / 16);
    
      34
        svfloat32_t coeffs_s1 = svmul_n_f32_x(svptrue_b32(), coeffs_b1, 1.0 / 16);
    
      272
        std::reference_wrapper<svfloat32_t> coeffs_pqrs[8] = {
    
      136
            coeffs_p0, coeffs_p1, coeffs_q0, coeffs_q1,
    
      136
            coeffs_r0, coeffs_r1, coeffs_s0, coeffs_s1,
    
        };
    
        // Returns a * p + b * q + c * r + d * s, lane by lane.
    
        // NOTE(review): the callers below pass the weight vectors in the a/b/c/d
    
        // positions and the samples in p/q/r/s; the result is unaffected because
    
        // every term is a plain product, but the parameter names are misleading.
    
      36546
        auto lerp2d_vector = [](svbool_t pg, svfloat32_t a, svfloat32_t p,
    
                                svfloat32_t b, svfloat32_t q, svfloat32_t c,
    
                                svfloat32_t r, svfloat32_t d,
    
                                svfloat32_t s) KLEIDICV_STREAMING {
    
      36512
          return svmla_f32_x(
    
      36512
              pg, svmla_f32_x(pg, svmla_f32_x(pg, svmul_f32_x(pg, a, p), b, q), c, r),
    
      36512
              d, s);
    
        };
    
        // Produces eight destination rows, starting at dst_row0, from two
    
        // vertically adjacent source rows.
    
      162
        auto process_row = [src_width, lerp2d_vector, lerp1d_vector_n](
    
                               const float *src_row0, const float *src_row1,
    
                               float *dst_row0, size_t dst_stride,
    
                               std::reference_wrapper<svfloat32_t>
    
                                   coeffs_pqrs[8]) KLEIDICV_STREAMING {
    
          // Left elements
    
      128
          svbool_t pg1 = svptrue_pat_b32(SV_VL1);  // read 1 element
    
      128
          svbool_t pg4 = svptrue_pat_b32(SV_VL4);  // write 4 elements
    
      128
          float *dst_lr = dst_row0;
    
      128
          svfloat32_t s0l = svdup_lane(svld1(pg1, src_row0), 0);
    
      128
          svfloat32_t s1l = svdup_lane(svld1(pg1, src_row1), 0);
    
          // Vertical-only interpolation for the four leftmost columns: output
    
          // row i uses the weights (15 - 2i) / 16 and (2i + 1) / 16.
    
        2/2✓ Branch 0 taken 1024 times.
✓ Branch 1 taken 128 times.

      1152
          for (size_t i = 0; i < 8; ++i) {
    
      2048
            svst1(pg4, dst_lr,
    
      2048
                  lerp1d_vector_n(pg4, static_cast<float>(15 - i * 2) / 16.0F, s0l,
    
      1024
                                  static_cast<float>(i * 2 + 1) / 16.0F, s1l));
    
      1024
            dst_lr += dst_stride;
    
      1024
          }
    
          // Middle elements
    
      128
          dst_row0 += 4;
    
      128
          float *dst_row1 = dst_row0 + dst_stride;
    
      128
          float *dst_row2 = dst_row1 + dst_stride;
    
      128
          float *dst_row3 = dst_row2 + dst_stride;
    
      128
          float *dst_row4 = dst_row3 + dst_stride;
    
      128
          float *dst_row5 = dst_row4 + dst_stride;
    
      128
          float *dst_row6 = dst_row5 + dst_stride;
    
      128
          float *dst_row7 = dst_row6 + dst_stride;
    
          // a/b are the left/right samples taken from src_row0 and c/d those
    
          // taken from src_row1, each broadcast to every lane.
    
      128
          svfloat32_t a, b = s0l;
    
      128
          svfloat32_t c, d = s1l;
    
        2/2✓ Branch 0 taken 9128 times.
✓ Branch 1 taken 128 times.

      9256
          for (size_t src_x = 0; src_x + 1 < src_width; src_x++) {
    
      9128
            a = b;
    
      9128
            b = svdup_lane(svld1(pg1, src_row0 + src_x + 1), 0);
    
      9128
            c = d;
    
      9128
            d = svdup_lane(svld1(pg1, src_row1 + src_x + 1), 0);
    
            // Output row 0: src_row0 carries the vertical weight 15/16.
    
      18256
            svfloat32_t dst_0 =
    
      18256
                lerp2d_vector(svptrue_b32(), coeffs_pqrs[0], a, coeffs_pqrs[2], b,
    
      9128
                              coeffs_pqrs[4], c, coeffs_pqrs[6], d);
    
      9128
            svst1(svptrue_b32(), dst_row0, dst_0);
    
            // Output row 7: same as row 0 with the vertical weights swapped.
    
      18256
            svfloat32_t dst_7 =
    
      18256
                lerp2d_vector(svptrue_b32(), coeffs_pqrs[4], a, coeffs_pqrs[6], b,
    
      9128
                              coeffs_pqrs[0], c, coeffs_pqrs[2], d);
    
      9128
            svst1(svptrue_b32(), dst_row7, dst_7);
    
            // Output rows 1..6 are blended from rows 0 and 7 in steps of 1/7.
    
      18256
            svst1(svptrue_b32(), dst_row1,
    
      9128
                  lerp1d_vector_n(svptrue_b32(), 6.0 / 7, dst_0, 1.0 / 7, dst_7));
    
      18256
            svst1(svptrue_b32(), dst_row2,
    
      9128
                  lerp1d_vector_n(svptrue_b32(), 5.0 / 7, dst_0, 2.0 / 7, dst_7));
    
      18256
            svst1(svptrue_b32(), dst_row3,
    
      9128
                  lerp1d_vector_n(svptrue_b32(), 4.0 / 7, dst_0, 3.0 / 7, dst_7));
    
      18256
            svst1(svptrue_b32(), dst_row4,
    
      9128
                  lerp1d_vector_n(svptrue_b32(), 3.0 / 7, dst_0, 4.0 / 7, dst_7));
    
      18256
            svst1(svptrue_b32(), dst_row5,
    
      9128
                  lerp1d_vector_n(svptrue_b32(), 2.0 / 7, dst_0, 5.0 / 7, dst_7));
    
      18256
            svst1(svptrue_b32(), dst_row6,
    
      9128
                  lerp1d_vector_n(svptrue_b32(), 1.0 / 7, dst_0, 6.0 / 7, dst_7));
    
      9128
            dst_row0 += 4;
    
      9128
            dst_row1 += 4;
    
      9128
            dst_row2 += 4;
    
      9128
            dst_row3 += 4;
    
      9128
            dst_row4 += 4;
    
      9128
            dst_row5 += 4;
    
      9128
            dst_row6 += 4;
    
      9128
            dst_row7 += 4;
    
            // Same steps for the next four output columns, using the second
    
            // half of the weights.
    
      18256
            dst_0 = lerp2d_vector(svptrue_b32(), coeffs_pqrs[1], a, coeffs_pqrs[3], b,
    
      9128
                                  coeffs_pqrs[5], c, coeffs_pqrs[7], d);
    
      9128
            svst1(svptrue_b32(), dst_row0, dst_0);
    
      18256
            dst_7 = lerp2d_vector(svptrue_b32(), coeffs_pqrs[5], a, coeffs_pqrs[7], b,
    
      9128
                                  coeffs_pqrs[1], c, coeffs_pqrs[3], d);
    
      9128
            svst1(svptrue_b32(), dst_row7, dst_7);
    
      18256
            svst1(svptrue_b32(), dst_row1,
    
      9128
                  lerp1d_vector_n(svptrue_b32(), 6.0 / 7, dst_0, 1.0 / 7, dst_7));
    
      18256
            svst1(svptrue_b32(), dst_row2,
    
      9128
                  lerp1d_vector_n(svptrue_b32(), 5.0 / 7, dst_0, 2.0 / 7, dst_7));
    
      18256
            svst1(svptrue_b32(), dst_row3,
    
      9128
                  lerp1d_vector_n(svptrue_b32(), 4.0 / 7, dst_0, 3.0 / 7, dst_7));
    
      18256
            svst1(svptrue_b32(), dst_row4,
    
      9128
                  lerp1d_vector_n(svptrue_b32(), 3.0 / 7, dst_0, 4.0 / 7, dst_7));
    
      18256
            svst1(svptrue_b32(), dst_row5,
    
      9128
                  lerp1d_vector_n(svptrue_b32(), 2.0 / 7, dst_0, 5.0 / 7, dst_7));
    
      18256
            svst1(svptrue_b32(), dst_row6,
    
      9128
                  lerp1d_vector_n(svptrue_b32(), 1.0 / 7, dst_0, 6.0 / 7, dst_7));
    
      9128
            dst_row0 += 4;
    
      9128
            dst_row1 += 4;
    
      9128
            dst_row2 += 4;
    
      9128
            dst_row3 += 4;
    
      9128
            dst_row4 += 4;
    
      9128
            dst_row5 += 4;
    
      9128
            dst_row6 += 4;
    
      9128
            dst_row7 += 4;
    
      9128
          }
    
          // Right elements
    
          // b and d now hold the last sample of each source row, and dst_row0
    
          // has been advanced to the four rightmost columns.
    
      128
          dst_lr = dst_row0;
    
      128
          svfloat32_t s0r = b;
    
      128
          svfloat32_t s1r = d;
    
        2/2✓ Branch 0 taken 128 times.
✓ Branch 1 taken 1024 times.

      1152
          for (size_t i = 0; i < 8; ++i) {
    
      2048
            svst1(pg4, dst_lr,
    
      2048
                  lerp1d_vector_n(pg4, static_cast<float>(15 - i * 2) / 16.0F, s0r,
    
      1024
                                  static_cast<float>(i * 2 + 1) / 16.0F, s1r));
    
      1024
            dst_lr += dst_stride;
    
      1024
          }
    
      128
        };
    
        // Top rows
    
        // The first four destination rows come from the first source row.
    
        2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 33 times.

      34
        if (KLEIDICV_LIKELY(y_begin == 0)) {
    
      33
          process_edge_row(src, dst, dst_stride, coeffs_ab);
    
      33
        }
    
        // Middle rows
    
        // Source rows src_y and src_y + 1 yield destination rows dst_y to
    
        // dst_y + 7.
    
        2/2✓ Branch 0 taken 128 times.
✓ Branch 1 taken 34 times.

      162
        for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
    
      128
          size_t dst_y = src_y * 8 + 4;
    
      128
          const float *src_row0 = src + src_stride * src_y;
    
      128
          const float *src_row1 = src_row0 + src_stride;
    
      256
          process_row(src_row0, src_row1, dst + dst_stride * dst_y, dst_stride,
    
      128
                      coeffs_pqrs);
    
      128
        }
    
        // Bottom rows
    
        // The last four destination rows come from the last source row.
    
        2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 33 times.

      34
        if (KLEIDICV_LIKELY(y_end == src_height)) {
    
      66
          process_edge_row(src + src_stride * (src_height - 1),
    
      33
                           dst + dst_stride * (dst_height - 4), dst_stride,
    
      33
                           coeffs_ab);
    
      33
        }
    
      34
        return KLEIDICV_OK;
    
      34
      }
    
      68
      KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_8x8_f32_sve256plus_sc(
    
          const float *src, size_t src_stride, size_t src_width, size_t src_height,
    
          size_t y_begin, size_t y_end, float *dst,
    
          size_t dst_stride) KLEIDICV_STREAMING {
    
      68
        size_t dst_width = src_width * 8;
    
      68
        size_t dst_height = src_height * 8;
    
      68
        src_stride /= sizeof(float);
    
      68
        dst_stride /= sizeof(float);
    
      68
        svuint32_t indices_0a, indices_0b, indices_1a, indices_1b, indices_2a,
    
            indices_2b, indices_3a, indices_3b;
    
        {
    
          // indices for row 0
    
      68
          svuint32_t tmp_2x = svreinterpret_u32_u64(svindex_u64(0, 0x100000001UL));
    
      68
          svuint32_t tmp_4x = svzip1(tmp_2x, tmp_2x);  // 0, 0, 0, 0, 1, 1, 1, 1, ...
    
      68
          indices_0a = svzip1(tmp_4x, tmp_4x);  // 8 times 0, then 8 times 1, ...
    
      68
          indices_1a = svzip2(tmp_4x, tmp_4x);
    
          // next section, e.g. in case of 512-bit regs (=16 x F32), it is 4, 4, 4, 4,
    
          // 5, 5, 5, 5, ...
    
      68
          tmp_4x = svzip2(tmp_2x, tmp_2x);
    
      68
          indices_2a = svzip1(tmp_4x, tmp_4x);
    
      68
          indices_3a = svzip2(tmp_4x, tmp_4x);
    
          // same as above, just all numbers are bigger by one (for row 1)
    
      68
          tmp_2x = svreinterpret_u32_u64(svindex_u64(0x100000001UL, 0x100000001UL));
    
      68
          tmp_4x = svzip1(tmp_2x, tmp_2x);      // 1, 1, 1, 1, ...
    
      68
          indices_0b = svzip1(tmp_4x, tmp_4x);  // 8 times 1, then 8 times 2, ...
    
      68
          indices_1b = svzip2(tmp_4x, tmp_4x);
    
          // next section, e.g. in case of 512-bit regs (=16 x F32), it is 5, 5, 5, 5,
    
          // 6, 6, 6, 6, ...
    
      68
          tmp_4x = svzip2(tmp_2x, tmp_2x);
    
      68
          indices_2b = svzip1(tmp_4x, tmp_4x);
    
      68
          indices_3b = svzip2(tmp_4x, tmp_4x);
    
      68
        }
    
      544
        std::reference_wrapper<svuint32_t> indices[8] = {
    
      272
            indices_0a, indices_0b, indices_1a, indices_1b,
    
      272
            indices_2a, indices_2b, indices_3a, indices_3b};
    
      68
        svfloat32_t coeffs_a, coeffs_b;
    
        {
    
          // Prepare 1/16, 3/16, 5/16, ..., 15/16, repeated
    
      68
          svuint32_t linear = svindex_u32(1, 2);
    
      136
          svfloat32_t repetitive_float =  // mod 16
    
      68
              svcvt_f32_x(svptrue_b32(), svand_n_u32_m(svptrue_b32(), linear, 0x0F));
    
      68
          coeffs_b = svdiv_n_f32_x(svptrue_b32(), repetitive_float, 16.0F);
    
      68
          coeffs_a = svsub_x(svptrue_b32(), svdup_f32(1.0F), coeffs_b);
    
      68
        }
    
      68
        std::reference_wrapper<svfloat32_t> coeffs_ab[2] = {coeffs_a, coeffs_b};
    
      41940
        auto lerp1d_vector = [](svbool_t pg, float p, svfloat32_t a, float q,
    
                                svfloat32_t b) KLEIDICV_STREAMING {
    
      41872
          return svmla_n_f32_x(pg, svmul_n_f32_x(pg, a, p), b, q);
    
        };
    
      3156
        auto index_and_lerp1d = [](svbool_t pg, svuint32_t indices_a,
    
                                   svuint32_t indices_b,
    
                                   std::reference_wrapper<svfloat32_t> coeffs_ab[2],
    
                                   svfloat32_t src) KLEIDICV_STREAMING {
    
      6176
          return svmla_f32_x(pg, svmul_f32_x(pg, svtbl(src, indices_a), coeffs_ab[0]),
    
      3088
                             svtbl(src, indices_b), coeffs_ab[1]);
    
        };
    
        // Handle top or bottom edge
    
      68
        auto process_edge_row =
    
      200
            [src_width, dst_width, index_and_lerp1d](
    
                const float *src_row, float *dst_row, size_t dst_stride,
    
                std::reference_wrapper<svuint32_t> indices[8],
    
                std::reference_wrapper<svfloat32_t> coeffs_ab[2]) KLEIDICV_STREAMING {
    
              // Left elements
    
      132
              float left = src_row[0];
    
      132
              float *dst = dst_row;
    
        2/2✓ Branch 0 taken 528 times.
✓ Branch 1 taken 132 times.

      660
              for (size_t i = 0; i < 4; ++i) {
    
      528
                *dst++ = left;
    
      528
                *dst++ = left;
    
      528
                *dst++ = left;
    
      528
                *dst = left;
    
      528
                dst += dst_stride - 3;
    
      528
              }
    
              // Middle elements
    
        2/2✓ Branch 0 taken 772 times.
✓ Branch 1 taken 132 times.

      904
              for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw() / 2) {
    
      772
                svbool_t pg = svwhilelt_b32_u64(src_x, src_width);
    
      772
                svfloat32_t svsrc = svld1_f32(pg, src_row + src_x);
    
      772
                size_t dst_length = 8 * (src_width - src_x - 1);
    
      772
                svbool_t pg_1 = svwhilelt_b32_u64(0UL, dst_length);
    
      772
                svbool_t pg_2 = svwhilelt_b32_u64(svcntw(), dst_length);
    
      772
                svbool_t pg_3 = svwhilelt_b32_u64(2 * svcntw(), dst_length);
    
      772
                svbool_t pg_4 = svwhilelt_b32_u64(3 * svcntw(), dst_length);
    
      772
                float *dst_row0 = dst_row + src_x * 8 + 4;
    
      772
                float *dst_row1 = dst_row0 + dst_stride;
    
      772
                float *dst_row2 = dst_row1 + dst_stride;
    
      772
                float *dst_row3 = dst_row2 + dst_stride;
    
      1544
                svfloat32_t dst =
    
      772
                    index_and_lerp1d(pg_1, indices[0], indices[1], coeffs_ab, svsrc);
    
      772
                svst1(pg_1, dst_row0, dst);
    
      772
                svst1(pg_1, dst_row1, dst);
    
      772
                svst1(pg_1, dst_row2, dst);
    
      772
                svst1(pg_1, dst_row3, dst);
    
      772
                dst =
    
      772
                    index_and_lerp1d(pg_2, indices[2], indices[3], coeffs_ab, svsrc);
    
      772
                svst1_vnum(pg_2, dst_row0, 1, dst);
    
      772
                svst1_vnum(pg_2, dst_row1, 1, dst);
    
      772
                svst1_vnum(pg_2, dst_row2, 1, dst);
    
      772
                svst1_vnum(pg_2, dst_row3, 1, dst);
    
      772
                dst =
    
      772
                    index_and_lerp1d(pg_3, indices[4], indices[5], coeffs_ab, svsrc);
    
      772
                svst1_vnum(pg_3, dst_row0, 2, dst);
    
      772
                svst1_vnum(pg_3, dst_row1, 2, dst);
    
      772
                svst1_vnum(pg_3, dst_row2, 2, dst);
    
      772
                svst1_vnum(pg_3, dst_row3, 2, dst);
    
      772
                dst =
    
      772
                    index_and_lerp1d(pg_4, indices[6], indices[7], coeffs_ab, svsrc);
    
      772
                svst1_vnum(pg_4, dst_row0, 3, dst);
    
      772
                svst1_vnum(pg_4, dst_row1, 3, dst);
    
      772
                svst1_vnum(pg_4, dst_row2, 3, dst);
    
      772
                svst1_vnum(pg_4, dst_row3, 3, dst);
    
      772
              }
    
              // Right elements
    
      132
              dst = dst_row + dst_width - 4;
    
      132
              float right = src_row[src_width - 1];
    
        2/2✓ Branch 0 taken 132 times.
✓ Branch 1 taken 528 times.

      660
              for (size_t i = 0; i < 4; ++i) {
    
      528
                *dst++ = right;
    
      528
                *dst++ = right;
    
      528
                *dst++ = right;
    
      528
                *dst = right;
    
      528
                dst += dst_stride - 3;
    
      528
              }
    
      132
            };
    
      68
        svfloat32_t coeffs_p = svmul_n_f32_x(svptrue_b32(), coeffs_a, 15.0 / 16);
    
      68
        svfloat32_t coeffs_q = svmul_n_f32_x(svptrue_b32(), coeffs_b, 15.0 / 16);
    
      68
        svfloat32_t coeffs_r = svmul_n_f32_x(svptrue_b32(), coeffs_a, 1.0 / 16);
    
      68
        svfloat32_t coeffs_s = svmul_n_f32_x(svptrue_b32(), coeffs_b, 1.0 / 16);
    
      136
        std::reference_wrapper<svfloat32_t> coeffs_pqrs[4] = {coeffs_p, coeffs_q,
    
      136
                                                              coeffs_r, coeffs_s};
    
      12660
        auto index_and_lerp2d = [](svbool_t pg, svuint32_t indices_a,
    
                                   svuint32_t indices_b,
    
                                   std::reference_wrapper<svfloat32_t> coeffs_pqrs[4],
    
                                   svfloat32_t src0,
    
                                   svfloat32_t src1) KLEIDICV_STREAMING {
    
      12592
          return svmla_f32_x(
    
      12592
              pg,
    
      12592
              svmla_f32_x(
    
      12592
                  pg,
    
      25184
                  svmla_f32_x(pg,
    
      12592
                              svmul_f32_x(pg, svtbl(src0, indices_a), coeffs_pqrs[0]),
    
      12592
                              svtbl(src0, indices_b), coeffs_pqrs[1]),
    
      12592
                  svtbl(src1, indices_a), coeffs_pqrs[2]),
    
      12592
              svtbl(src1, indices_b), coeffs_pqrs[3]);
    
        };
    
      324
        auto process_row = [src_width, dst_width, index_and_lerp2d, lerp1d_vector](
    
                               const float *src_row0, const float *src_row1,
    
                               float *dst_row, size_t dst_stride,
    
                               std::reference_wrapper<svuint32_t> indices[8],
    
                               std::reference_wrapper<svfloat32_t>
    
                                   coeffs_pqrs[4]) KLEIDICV_STREAMING {
    
          // Left edge
    
      256
          svbool_t pg1 = svptrue_pat_b32(SV_VL1);  // read 1 element
    
      256
          svbool_t pg4 = svptrue_pat_b32(SV_VL4);  // write 4 elements
    
      256
          float *dst_lr = dst_row;
    
      256
          svfloat32_t s0l = svdup_lane(svld1(pg1, src_row0), 0);
    
      256
          svfloat32_t s1l = svdup_lane(svld1(pg1, src_row1), 0);
    
        2/2✓ Branch 0 taken 2048 times.
✓ Branch 1 taken 256 times.

      2304
          for (size_t i = 0; i < 8; ++i) {
    
      4096
            svst1(pg4, dst_lr,
    
      4096
                  lerp1d_vector(pg4, static_cast<float>(15 - i * 2) / 16.0F, s0l,
    
      2048
                                static_cast<float>(i * 2 + 1) / 16.0F, s1l));
    
      2048
            dst_lr += dst_stride;
    
      2048
          }
    
          // Middle elements
    
        2/2✓ Branch 0 taken 1574 times.
✓ Branch 1 taken 256 times.

      1830
          for (size_t src_x = 0; src_x + 1 < src_width; src_x += svcntw() / 2) {
    
      1574
            size_t dst_x = src_x * 8 + 4;
    
      1574
            svbool_t pg = svwhilelt_b32_u64(src_x, src_width);
    
      1574
            svfloat32_t src_0 = svld1_f32(pg, src_row0 + src_x);
    
      1574
            svfloat32_t src_1 = svld1_f32(pg, src_row1 + src_x);
    
      1574
            size_t dst_length = 8 * (src_width - src_x - 1);
    
      1574
            svbool_t pg_1 = svwhilelt_b32_u64(0UL, dst_length);
    
      1574
            svbool_t pg_2 = svwhilelt_b32_u64(svcntw(), dst_length);
    
      1574
            svbool_t pg_3 = svwhilelt_b32_u64(2 * svcntw(), dst_length);
    
      1574
            svbool_t pg_4 = svwhilelt_b32_u64(3 * svcntw(), dst_length);
    
      1574
            float *dst_row0 = dst_row + dst_x;
    
      1574
            float *dst_row1 = dst_row0 + dst_stride;
    
      1574
            float *dst_row2 = dst_row1 + dst_stride;
    
      1574
            float *dst_row3 = dst_row2 + dst_stride;
    
      1574
            float *dst_row4 = dst_row3 + dst_stride;
    
      1574
            float *dst_row5 = dst_row4 + dst_stride;
    
      1574
            float *dst_row6 = dst_row5 + dst_stride;
    
      1574
            float *dst_row7 = dst_row6 + dst_stride;
    
      3148
            svfloat32_t dst_0 = index_and_lerp2d(pg_1, indices[0], indices[1],
    
      1574
                                                 coeffs_pqrs, src_0, src_1);
    
      1574
            svst1(pg_1, dst_row0, dst_0);
    
      3148
            svfloat32_t dst_7 = index_and_lerp2d(pg_1, indices[0], indices[1],
    
      1574
                                                 coeffs_pqrs, src_1, src_0);
    
      1574
            svst1(pg_1, dst_row7, dst_7);
    
      3148
            svst1(pg_1, dst_row1,
    
      1574
                  lerp1d_vector(pg_1, 6.0 / 7, dst_0, 1.0 / 7, dst_7));
    
      3148
            svst1(pg_1, dst_row2,
    
      1574
                  lerp1d_vector(pg_1, 5.0 / 7, dst_0, 2.0 / 7, dst_7));
    
      3148
            svst1(pg_1, dst_row3,
    
      1574
                  lerp1d_vector(pg_1, 4.0 / 7, dst_0, 3.0 / 7, dst_7));
    
      3148
            svst1(pg_1, dst_row4,
    
      1574
                  lerp1d_vector(pg_1, 3.0 / 7, dst_0, 4.0 / 7, dst_7));
    
      3148
            svst1(pg_1, dst_row5,
    
      1574
                  lerp1d_vector(pg_1, 2.0 / 7, dst_0, 5.0 / 7, dst_7));
    
      3148
            svst1(pg_1, dst_row6,
    
      1574
                  lerp1d_vector(pg_1, 1.0 / 7, dst_0, 6.0 / 7, dst_7));
    
      3148
            dst_0 = index_and_lerp2d(pg_2, indices[2], indices[3], coeffs_pqrs, src_0,
    
      1574
                                     src_1);
    
      1574
            svst1_vnum(pg_2, dst_row0, 1, dst_0);
    
      3148
            dst_7 = index_and_lerp2d(pg_2, indices[2], indices[3], coeffs_pqrs, src_1,
    
      1574
                                     src_0);
    
      1574
            svst1_vnum(pg_2, dst_row7, 1, dst_7);
    
      3148
            svst1_vnum(pg_2, dst_row1, 1,
    
      1574
                       lerp1d_vector(pg_2, 6.0 / 7, dst_0, 1.0 / 7, dst_7));
    
      3148
            svst1_vnum(pg_2, dst_row2, 1,
    
      1574
                       lerp1d_vector(pg_2, 5.0 / 7, dst_0, 2.0 / 7, dst_7));
    
      3148
            svst1_vnum(pg_2, dst_row3, 1,
    
      1574
                       lerp1d_vector(pg_2, 4.0 / 7, dst_0, 3.0 / 7, dst_7));
    
      3148
            svst1_vnum(pg_2, dst_row4, 1,
    
      1574
                       lerp1d_vector(pg_2, 3.0 / 7, dst_0, 4.0 / 7, dst_7));
    
      3148
            svst1_vnum(pg_2, dst_row5, 1,
    
      1574
                       lerp1d_vector(pg_2, 2.0 / 7, dst_0, 5.0 / 7, dst_7));
    
      3148
            svst1_vnum(pg_2, dst_row6, 1,
    
      1574
                       lerp1d_vector(pg_2, 1.0 / 7, dst_0, 6.0 / 7, dst_7));
    
      3148
            dst_0 = index_and_lerp2d(pg_3, indices[4], indices[5], coeffs_pqrs, src_0,
    
      1574
                                     src_1);
    
      1574
            svst1_vnum(pg_3, dst_row0, 2, dst_0);
    
      3148
            dst_7 = index_and_lerp2d(pg_3, indices[4], indices[5], coeffs_pqrs, src_1,
    
      1574
                                     src_0);
    
      1574
            svst1_vnum(pg_3, dst_row7, 2, dst_7);
    
      3148
            svst1_vnum(pg_3, dst_row1, 2,
    
      1574
                       lerp1d_vector(pg_3, 6.0 / 7, dst_0, 1.0 / 7, dst_7));
    
      3148
            svst1_vnum(pg_3, dst_row2, 2,
    
      1574
                       lerp1d_vector(pg_3, 5.0 / 7, dst_0, 2.0 / 7, dst_7));
    
      3148
            svst1_vnum(pg_3, dst_row3, 2,
    
      1574
                       lerp1d_vector(pg_3, 4.0 / 7, dst_0, 3.0 / 7, dst_7));
    
      3148
            svst1_vnum(pg_3, dst_row4, 2,
    
      1574
                       lerp1d_vector(pg_3, 3.0 / 7, dst_0, 4.0 / 7, dst_7));
    
      3148
            svst1_vnum(pg_3, dst_row5, 2,
    
      1574
                       lerp1d_vector(pg_3, 2.0 / 7, dst_0, 5.0 / 7, dst_7));
    
      3148
            svst1_vnum(pg_3, dst_row6, 2,
    
      1574
                       lerp1d_vector(pg_3, 1.0 / 7, dst_0, 6.0 / 7, dst_7));
    
      3148
            dst_0 = index_and_lerp2d(pg_4, indices[6], indices[7], coeffs_pqrs, src_0,
    
      1574
                                     src_1);
    
      1574
            svst1_vnum(pg_4, dst_row0, 3, dst_0);
    
      3148
            dst_7 = index_and_lerp2d(pg_4, indices[6], indices[7], coeffs_pqrs, src_1,
    
      1574
                                     src_0);
    
      1574
            svst1_vnum(pg_4, dst_row7, 3, dst_7);
    
      3148
            svst1_vnum(pg_4, dst_row1, 3,
    
      1574
                       lerp1d_vector(pg_4, 6.0 / 7, dst_0, 1.0 / 7, dst_7));
    
      3148
            svst1_vnum(pg_4, dst_row2, 3,
    
      1574
                       lerp1d_vector(pg_4, 5.0 / 7, dst_0, 2.0 / 7, dst_7));
    
      3148
            svst1_vnum(pg_4, dst_row3, 3,
    
      1574
                       lerp1d_vector(pg_4, 4.0 / 7, dst_0, 3.0 / 7, dst_7));
    
      3148
            svst1_vnum(pg_4, dst_row4, 3,
    
      1574
                       lerp1d_vector(pg_4, 3.0 / 7, dst_0, 4.0 / 7, dst_7));
    
      3148
            svst1_vnum(pg_4, dst_row5, 3,
    
      1574
                       lerp1d_vector(pg_4, 2.0 / 7, dst_0, 5.0 / 7, dst_7));
    
      3148
            svst1_vnum(pg_4, dst_row6, 3,
    
      1574
                       lerp1d_vector(pg_4, 1.0 / 7, dst_0, 6.0 / 7, dst_7));
    
      1574
          }
    
          // Right edge
    
      256
          dst_lr = dst_row;
    
      256
          svfloat32_t s0r = svdup_lane(svld1(pg1, src_row0 + src_width - 1), 0);
    
      256
          svfloat32_t s1r = svdup_lane(svld1(pg1, src_row1 + src_width - 1), 0);
    
        2/2✓ Branch 0 taken 256 times.
✓ Branch 1 taken 2048 times.

      2304
          for (size_t i = 0; i < 8; ++i) {
    
      4096
            svst1(pg4, dst_lr + dst_width - 4,
    
      4096
                  lerp1d_vector(pg4, static_cast<float>(15 - i * 2) / 16.0F, s0r,
    
      2048
                                static_cast<float>(i * 2 + 1) / 16.0F, s1r));
    
      2048
            dst_lr += dst_stride;
    
      2048
          }
    
      256
        };
    
        // Top rows
    
        2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 66 times.

      68
        if (KLEIDICV_LIKELY(y_begin == 0)) {
    
      66
          process_edge_row(src, dst, dst_stride, indices, coeffs_ab);
    
      66
        }
    
        // Middle rows
    
        2/2✓ Branch 0 taken 256 times.
✓ Branch 1 taken 68 times.

      324
        for (size_t src_y = y_begin; src_y + 1 < y_end; ++src_y) {
    
      256
          size_t dst_y = src_y * 8 + 4;
    
      256
          const float *src_row0 = src + src_stride * src_y;
    
      256
          const float *src_row1 = src_row0 + src_stride;
    
      512
          process_row(src_row0, src_row1, dst + dst_stride * dst_y, dst_stride,
    
      256
                      indices, coeffs_pqrs);
    
      256
        }
    
        // Bottom rows
    
        2/2✓ Branch 0 taken 2 times.
✓ Branch 1 taken 66 times.

      68
        if (KLEIDICV_LIKELY(y_end == src_height)) {
    
      132
          process_edge_row(src + src_stride * (src_height - 1),
    
      66
                           dst + dst_stride * (dst_height - 4), dst_stride, indices,
    
      66
                           coeffs_ab);
    
      66
        }
    
      68
        return KLEIDICV_OK;
    
      68
      }
    
      // Stripe entry point for u8 linear upscaling: validates the arguments, then
    
      // forwards to the fixed-ratio kernel that matches dst/src (exactly 2x or 4x in
    
      // both dimensions).
    
      //
    
      // y_begin and y_end are handed to the kernel untouched; presumably they bound
    
      // the source rows of this stripe - TODO confirm against the kernels.
    
      //
    
      // Returns KLEIDICV_OK for an empty source, the kernel's result for a supported
    
      // ratio, and KLEIDICV_ERROR_NOT_IMPLEMENTED otherwise. The CHECK_* macros are
    
      // defined elsewhere; presumably they return early with an error on failure.
    
      285
      KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_linear_stripe_u8_sc(
    
          const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
    
          size_t y_begin, size_t y_end, uint8_t *dst, size_t dst_stride,
    
          size_t dst_width, size_t dst_height) KLEIDICV_STREAMING {
    
        // Argument validation: source buffer, destination buffer, destination size.
    
        4/4✓ Branch 0 taken 3 times.
✓ Branch 1 taken 282 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 282 times.

      285
        CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
    
        4/4✓ Branch 0 taken 3 times.
✓ Branch 1 taken 279 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 279 times.

      282
        CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
    
        6/6✓ Branch 0 taken 3 times.
✓ Branch 1 taken 276 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 273 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 273 times.

      279
        CHECK_IMAGE_SIZE(dst_width, dst_height);
    
        // An empty source leaves nothing to resize and is reported as success.
    
        4/4✓ Branch 0 taken 267 times.
✓ Branch 1 taken 6 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 264 times.

      273
        if (src_width == 0 || src_height == 0) {
    
      9
          return KLEIDICV_OK;
    
        }
    
        // Exact 2x upscale in both dimensions.
    
        3/4✓ Branch 0 taken 162 times.
✓ Branch 1 taken 102 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 162 times.

      264
        if (src_width * 2 == dst_width && src_height * 2 == dst_height) {
    
      324
          return resize_2x2_u8_sc(src, src_stride, src_width, src_height, y_begin,
    
      162
                                  y_end, dst, dst_stride);
    
        }
    
        // Exact 4x upscale in both dimensions.
    
        2/4✓ Branch 0 taken 102 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 102 times.

      102
        if (src_width * 4 == dst_width && src_height * 4 == dst_height) {
    
      204
          return resize_4x4_u8_sc(src, src_stride, src_width, src_height, y_begin,
    
      102
                                  y_end, dst, dst_stride);
    
        }
    
        // Unsupported ratios should have been rejected before reaching this point
    
        // by the matching *_is_implemented check. NOTE(review): this comment used to
    
        // name resize_linear_f32_is_implemented, which looks copied from the f32
    
        // variant - confirm the name of the u8 checker.
    
        // GCOVR_EXCL_START
    
        assert(!"resize ratio not implemented");
    
      −
        return KLEIDICV_ERROR_NOT_IMPLEMENTED;
    
        // GCOVR_EXCL_STOP
    
      285
      }
    
      // Stripe entry point for f32 linear upscaling: validates the arguments, then
    
      // forwards to the fixed-ratio kernel that matches dst/src (exactly 2x, 4x or
    
      // 8x in both dimensions).
    
      //
    
      // y_begin and y_end are handed to the kernel untouched; presumably they bound
    
      // the source rows of this stripe - TODO confirm against the kernels.
    
      //
    
      // Returns KLEIDICV_OK for an empty source, the kernel's result for a supported
    
      // ratio, and KLEIDICV_ERROR_NOT_IMPLEMENTED otherwise. The CHECK_* macros are
    
      // defined elsewhere; presumably they return early with an error on failure.
    
      387
      KLEIDICV_TARGET_FN_ATTRS static kleidicv_error_t resize_linear_stripe_f32_sc(
    
          const float *src, size_t src_stride, size_t src_width, size_t src_height,
    
          size_t y_begin, size_t y_end, float *dst, size_t dst_stride,
    
          size_t dst_width, size_t dst_height) KLEIDICV_STREAMING {
    
        // Argument validation: source buffer, destination buffer, destination size.
    
        4/4✓ Branch 0 taken 3 times.
✓ Branch 1 taken 384 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 384 times.

      387
        CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
    
        4/4✓ Branch 0 taken 3 times.
✓ Branch 1 taken 381 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 381 times.

      384
        CHECK_POINTER_AND_STRIDE(dst, dst_stride, dst_height);
    
        6/6✓ Branch 0 taken 3 times.
✓ Branch 1 taken 378 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 375 times.
✓ Branch 4 taken 6 times.
✓ Branch 5 taken 375 times.

      381
        CHECK_IMAGE_SIZE(dst_width, dst_height);
    
        // An empty source leaves nothing to resize and is reported as success.
    
        4/4✓ Branch 0 taken 369 times.
✓ Branch 1 taken 6 times.
✓ Branch 2 taken 3 times.
✓ Branch 3 taken 366 times.

      375
        if (src_width == 0 || src_height == 0) {
    
      9
          return KLEIDICV_OK;
    
        }
    
        // Exact 2x upscale in both dimensions.
    
        3/4✓ Branch 0 taken 150 times.
✓ Branch 1 taken 216 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 150 times.

      366
        if (src_width * 2 == dst_width && src_height * 2 == dst_height) {
    
      300
          return resize_2x2_f32_sc(src, src_stride, src_width, src_height, y_begin,
    
      150
                                   y_end, dst, dst_stride);
    
        }
    
        // Exact 4x upscale in both dimensions.
    
        3/4✓ Branch 0 taken 114 times.
✓ Branch 1 taken 102 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 114 times.

      216
        if (src_width * 4 == dst_width && src_height * 4 == dst_height) {
    
      228
          return resize_4x4_f32_sc(src, src_stride, src_width, src_height, y_begin,
    
      114
                                   y_end, dst, dst_stride);
    
        }
    
        // Exact 8x upscale in both dimensions; two kernel variants exist.
    
        2/4✓ Branch 0 taken 102 times.
✗ Branch 1 not taken.
✗ Branch 2 not taken.
✓ Branch 3 taken 102 times.

      102
        if (src_width * 8 == dst_width && src_height * 8 == dst_height) {
    
          // svcntw() is the number of 32-bit lanes in one vector, so 8 or more lanes
    
          // means vectors of at least 256 bits; shorter vectors use the 128-bit path.
    
        2/2✓ Branch 0 taken 68 times.
✓ Branch 1 taken 34 times.

      102
          if (svcntw() >= 8) {
    
      136
            return resize_8x8_f32_sve256plus_sc(src, src_stride, src_width,
    
      68
                                                src_height, y_begin, y_end, dst,
    
      68
                                                dst_stride);
    
          }
    
      68
          return resize_8x8_f32_sve128_sc(src, src_stride, src_width, src_height,
    
      34
                                          y_begin, y_end, dst, dst_stride);
    
        }
    
        // resize_linear_f32_is_implemented checked the kernel size already.
    
        // GCOVR_EXCL_START
    
        assert(!"resize ratio not implemented");
    
      −
        return KLEIDICV_ERROR_NOT_IMPLEMENTED;
    
        // GCOVR_EXCL_STOP
    
      387
      }
    
      }  // namespace KLEIDICV_TARGET_NAMESPACE
    
      #endif  // KLEIDICV_RESIZE_LINEAR_SC_H