KleidiCV Coverage Report

Directory:	./
File:	kleidicv/src/filters/scharr_neon.cpp
Date:	2025-09-25 14:13:34

	Exec	Total	Coverage
Lines:	106	106	100.0%
Functions:	13	13	100.0%
Branches:	18	18	100.0%

  
      Line
      Branch
      Exec
      Source
    
      // SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
    
      //
    
      // SPDX-License-Identifier: Apache-2.0
    
      #include <cstddef>
    
      #include <cstdint>
    
      #include <cstdlib>
    
      #include <memory>
    
      #include "kleidicv/config.h"
    
      #include "kleidicv/ctypes.h"
    
      #include "kleidicv/filters/scharr.h"
    
      #include "kleidicv/neon.h"
    
      #include "kleidicv/types.h"
    
      #include "kleidicv/utils.h"
    
      namespace kleidicv::neon {
    
      // Scharr filtering in both horizontal and vertical directions, horizontal and
    
      // vertical derivative approximations are stored interleaved.
    
      //
    
      // The applied weights for the horizontal approximation, as the kernel is
    
      // mirrored both vertically and horizontally during the convolution:
    
      //      [  -3   0   3 ]   [  3 ]
    
      //  F = [ -10   0  10 ] = [ 10 ] * [ -1, 0, 1 ]
    
      //      [  -3   0   3 ]   [  3 ]
    
      //
    
      // The applied weights for the vertical approximation, as the kernel is mirrored
    
      // both vertically and horizontally during the convolution:
    
      //      [ -3 -10  -3 ]   [ -1 ]
    
      //  F = [  0,  0,  0 ] = [  0 ] * [ 3, 10, 3 ]
    
      //      [  3  10   3 ]   [  1 ]
    
      //
    
      class ScharrInterleaved {
    
        using SourceType = uint8_t;
    
        using SourceVecTraits = VecTraits<SourceType>;
    
        using SourceVectorType = typename SourceVecTraits::VectorType;
    
        using BufferType = int16_t;
    
        using BufferVecTraits = VecTraits<BufferType>;
    
        using BufferVectorType = typename BufferVecTraits::VectorType;
    
        using BufferVector4Type = typename BufferVecTraits::Vector4Type;
    
        using DestinationType = int16_t;
    
        using DestinationVecTraits = VecTraits<DestinationType>;
    
        using DestinationVectorType = typename DestinationVecTraits::VectorType;
    
       public:
    
      67
        ScharrInterleaved(Rows<int16_t> hori_deriv_buffer,
    
                          Rows<int16_t> vert_deriv_buffer, size_t width)
    
      67
            : hori_deriv_buffer_(hori_deriv_buffer),
    
      67
              vert_deriv_buffer_(vert_deriv_buffer),
    
      67
              width_(width),
    
      67
              const_3_s16_(vdupq_n_s16(3)),
    
      67
              const_10_u8_(vdupq_n_u8(10)),
    
      67
              const_10_s16_(vdupq_n_s16(10)) {}
    
      67
        void process(Rows<const uint8_t> src_rows, Rows<int16_t> dst_rows,
    
                     size_t y_begin, size_t y_end) {
    
        2/2✓ Branch 0 taken 67 times.
✓ Branch 1 taken 1462 times.

      1529
          for (size_t i = y_begin; i < y_end; ++i) {
    
      1462
            process_vertical(src_rows.at(static_cast<ptrdiff_t>(i)));
    
      1462
            process_horizontal(dst_rows.at(static_cast<ptrdiff_t>(i)));
    
      1462
          }
    
      67
        }
    
       private:
    
      151
        BufferVector4Type vertical_vector_path(SourceVectorType src[3]) {
    
          // Horizontal derivative approximation
    
      151
          uint16x8_t hori_acc_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[2]));
    
      302
          uint16x8_t hori_acc_h =
    
      151
              vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[2]));
    
      151
          hori_acc_l = vmulq_u16(hori_acc_l, const_3_s16_);
    
      151
          hori_acc_h = vmulq_u16(hori_acc_h, const_3_s16_);
    
      151
          hori_acc_l =
    
      151
              vmlal_u8(hori_acc_l, vget_low_u8(src[1]), vget_low_u8(const_10_u8_));
    
      151
          hori_acc_h = vmlal_high_u8(hori_acc_h, src[1], const_10_u8_);
    
          // Vertical derivative approximation
    
      151
          uint16x8_t vert_acc_l = vsubl_u8(vget_low_u8(src[2]), vget_low_u8(src[0]));
    
      302
          uint16x8_t vert_acc_h =
    
      151
              vsubl_u8(vget_high_u8(src[2]), vget_high_u8(src[0]));
    
      151
          return {
    
      453
              vreinterpretq_s16_u16(hori_acc_l), vreinterpretq_s16_u16(hori_acc_h),
    
      302
              vreinterpretq_s16_u16(vert_acc_l), vreinterpretq_s16_u16(vert_acc_h)};
    
      151
        }
    
      1462
        void process_vertical(Rows<const uint8_t> src_rows) {
    
      1462
          LoopUnroll2 loop{width_ * src_rows.channels(), kSourceVecNumLanes};
    
      1613
          loop.unroll_once([&](ptrdiff_t index) {
    
      151
            SourceVectorType src[3];
    
      151
            src[0] = vld1q(&src_rows.at(0)[index]);
    
      151
            src[1] = vld1q(&src_rows.at(1)[index]);
    
      151
            src[2] = vld1q(&src_rows.at(2)[index]);
    
      151
            BufferVector4Type res = vertical_vector_path(src);
    
      151
            vst1q(&hori_deriv_buffer_[index], res.val[0]);
    
      151
            vst1q(&hori_deriv_buffer_[index + kBufferVecNumLanes], res.val[1]);
    
      151
            vst1q(&vert_deriv_buffer_[index], res.val[2]);
    
      151
            vst1q(&vert_deriv_buffer_[index + kBufferVecNumLanes], res.val[3]);
    
      151
          });
    
      9405
          loop.tail([&](ptrdiff_t index) {
    
      7943
            hori_deriv_buffer_[index] = static_cast<BufferType>(
    
      15886
                (src_rows.at(0)[index] + src_rows.at(2)[index]) * 3 +
    
      7943
                src_rows.at(1)[index] * 10);
    
      7943
            vert_deriv_buffer_[index] = static_cast<BufferType>(
    
      7943
                src_rows.at(2)[index] - src_rows.at(0)[index]);
    
      7943
          });
    
      1462
        }
    
      500
        DestinationVectorType horizontal_vector_path_hori_approx(
    
            BufferVectorType buff[2]) {
    
      500
          return vsubq_s16(buff[1], buff[0]);
    
        }
    
      500
        DestinationVectorType horizontal_vector_path_vert_approx(
    
            BufferVectorType buff[3]) {
    
      500
          BufferVectorType a = vaddq_u16(buff[0], buff[2]);
    
      500
          a = vaddq_u16(a, vaddq_u16(a, a));
    
      1000
          return vmlaq_u16(a, buff[1], const_10_s16_);
    
      500
        }
    
      1462
        void process_horizontal(Rows<int16_t> dst_rows) {
    
          // width is decremented by 2 as the result has less columns.
    
      1462
          LoopUnroll2 loop{(width_ - 2) * hori_deriv_buffer_.channels(),
    
                           kBufferVecNumLanes};
    
      1962
          loop.unroll_once([&](ptrdiff_t index) {
    
      500
            BufferVectorType hori_buff[2];
    
      500
            hori_buff[0] = vld1q(&hori_deriv_buffer_[index]);
    
      500
            hori_buff[1] = vld1q(&hori_deriv_buffer_[index + 2]);
    
      1000
            DestinationVectorType hori_approx_res =
    
      500
                horizontal_vector_path_hori_approx(hori_buff);
    
      500
            BufferVectorType vert_buff[3];
    
      500
            vert_buff[0] = vld1q(&vert_deriv_buffer_[index]);
    
      500
            vert_buff[1] = vld1q(&vert_deriv_buffer_[index + 1]);
    
      500
            vert_buff[2] = vld1q(&vert_deriv_buffer_[index + 2]);
    
      1000
            DestinationVectorType vert_approx_res =
    
      500
                horizontal_vector_path_vert_approx(vert_buff);
    
      1000
            vst1q(&dst_rows.at(0, index)[0],
    
      500
                  vzip1q_s16(hori_approx_res, vert_approx_res));
    
      1000
            vst1q(&dst_rows.at(0, index)[DestinationVecTraits::num_lanes()],
    
      500
                  vzip2q_s16(hori_approx_res, vert_approx_res));
    
      500
          });
    
      4897
          loop.tail([&](ptrdiff_t index) {
    
      3435
            dst_rows.at(0, index)[0] = static_cast<DestinationType>(
    
                // For some reason clang-tidy thinks these accesses are invalid
    
                // NOLINTBEGIN(clang-analyzer-core.uninitialized.Assign,
    
                // clang-analyzer-core.UndefinedBinaryOperatorResult)
    
      3435
                hori_deriv_buffer_[index + 2] - hori_deriv_buffer_[index]);
    
            // NOLINTEND(clang-analyzer-core.uninitialized.Assign,
    
            // clang-analyzer-core.UndefinedBinaryOperatorResult)
    
      3435
            dst_rows.at(0, index)[1] = static_cast<DestinationType>(
    
      6870
                (vert_deriv_buffer_[index] + vert_deriv_buffer_[index + 2]) * 3 +
    
      3435
                vert_deriv_buffer_[index + 1] * 10);
    
      3435
          });
    
      1462
        }
    
        Rows<int16_t> hori_deriv_buffer_;
    
        Rows<int16_t> vert_deriv_buffer_;
    
        size_t width_;
    
        int16x8_t const_3_s16_;
    
        uint8x16_t const_10_u8_;
    
        int16x8_t const_10_s16_;
    
        static constexpr ptrdiff_t kSourceVecNumLanes =
    
            static_cast<ptrdiff_t>(SourceVecTraits::num_lanes());
    
        static constexpr ptrdiff_t kBufferVecNumLanes =
    
            static_cast<ptrdiff_t>(BufferVecTraits::num_lanes());
    
      };  // end of class ScharrInterleaved
    
      class ScharrBufferDeleter {
    
       public:
    
      67
        void operator()(void *ptr) const { std::free(ptr); }
    
      };
    
      KLEIDICV_TARGET_FN_ATTRS
    
      73
      kleidicv_error_t kleidicv_scharr_interleaved_stripe_s16_u8(
    
          const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
    
          size_t src_channels, int16_t *dst, size_t dst_stride, size_t y_begin,
    
          size_t y_end) {
    
        // Does not include checks for whether the operation is implemented.
    
        // This must be done earlier, by scharr_interleaved_is_implemented.
    
        4/4✓ Branch 0 taken 1 times.
✓ Branch 1 taken 72 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 72 times.

      73
        CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
    
        4/4✓ Branch 0 taken 2 times.
✓ Branch 1 taken 70 times.
✓ Branch 2 taken 2 times.
✓ Branch 3 taken 70 times.

      72
        CHECK_POINTER_AND_STRIDE(dst, dst_stride, src_height);
    
        6/6✓ Branch 0 taken 1 times.
✓ Branch 1 taken 69 times.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 68 times.
✓ Branch 4 taken 2 times.
✓ Branch 5 taken 68 times.

      70
        CHECK_IMAGE_SIZE(src_width, src_height);
    
      68
        size_t buffer_stride = src_width * src_channels * sizeof(int16_t);
    
        // Buffer has two rows, one for the horizontal derivative approximation, one
    
        // for the vertical one.
    
      68
        size_t buffer_height = 2;
    
        // Memory is allocated with malloc to avoid its initialization.
    
      68
        void *allocation = std::malloc(buffer_stride * buffer_height);
    
        2/2✓ Branch 0 taken 67 times.
✓ Branch 1 taken 1 times.

      68
        if (!allocation) {
    
      1
          return KLEIDICV_ERROR_ALLOCATION;
    
        }
    
      134
        std::unique_ptr<int16_t, ScharrBufferDeleter> buffer(
    
      67
            reinterpret_cast<int16_t *>(allocation));
    
      67
        Rows<const uint8_t> src_rows{src, src_stride, src_channels};
    
        // Result is treated as it has double the channel number compared to the
    
        // input.
    
      67
        Rows<int16_t> dst_rows{dst, dst_stride, src_channels * 2};
    
      67
        Rows<int16_t> hori_deriv_buffer{buffer.get(), buffer_stride, src_channels};
    
      134
        int16_t *vert_deriv_ptr = reinterpret_cast<int16_t *>(
    
      67
            reinterpret_cast<uint8_t *>(buffer.get()) + buffer_stride);
    
      67
        Rows<int16_t> vert_deriv_buffer{vert_deriv_ptr, buffer_stride, src_channels};
    
      134
        ScharrInterleaved(hori_deriv_buffer, vert_deriv_buffer, src_width)
    
      67
            .process(src_rows, dst_rows, y_begin, y_end);
    
      67
        return KLEIDICV_OK;
    
      73
      }
    
      }  // namespace kleidicv::neon

Line	Branch	Exec	Source
1			// SPDX-FileCopyrightText: 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
2			//
3			// SPDX-License-Identifier: Apache-2.0
4
5			#include <cstddef>
6			#include <cstdint>
7			#include <cstdlib>
8			#include <memory>
9
10			#include "kleidicv/config.h"
11			#include "kleidicv/ctypes.h"
12			#include "kleidicv/filters/scharr.h"
13			#include "kleidicv/neon.h"
14			#include "kleidicv/types.h"
15			#include "kleidicv/utils.h"
16
17			namespace kleidicv::neon {
18
19			// Scharr filtering in both horizontal and vertical directions, horizontal and
20			// vertical derivative approximations are stored interleaved.
21			//
22			// The applied weights for the horizontal approximation, as the kernel is
23			// mirrored both vertically and horizontally during the convolution:
24			//      [  -3   0   3 ]   [  3 ]
25			//  F = [ -10   0  10 ] = [ 10 ] * [ -1, 0, 1 ]
26			//      [  -3   0   3 ]   [  3 ]
27			//
28			// The applied weights for the vertical approximation, as the kernel is mirrored
29			// both vertically and horizontally during the convolution:
30			//      [ -3 -10  -3 ]   [ -1 ]
31			//  F = [  0,  0,  0 ] = [  0 ] * [ 3, 10, 3 ]
32			//      [  3  10   3 ]   [  1 ]
33			//
34			class ScharrInterleaved {
35			using SourceType = uint8_t;
36			using SourceVecTraits = VecTraits<SourceType>;
37			using SourceVectorType = typename SourceVecTraits::VectorType;
38			using BufferType = int16_t;
39			using BufferVecTraits = VecTraits<BufferType>;
40			using BufferVectorType = typename BufferVecTraits::VectorType;
41			using BufferVector4Type = typename BufferVecTraits::Vector4Type;
42			using DestinationType = int16_t;
43			using DestinationVecTraits = VecTraits<DestinationType>;
44			using DestinationVectorType = typename DestinationVecTraits::VectorType;
45
46			public:
47		67	ScharrInterleaved(Rows<int16_t> hori_deriv_buffer,
48			Rows<int16_t> vert_deriv_buffer, size_t width)
49		67	: hori_deriv_buffer_(hori_deriv_buffer),
50		67	vert_deriv_buffer_(vert_deriv_buffer),
51		67	width_(width),
52		67	const_3_s16_(vdupq_n_s16(3)),
53		67	const_10_u8_(vdupq_n_u8(10)),
54		67	const_10_s16_(vdupq_n_s16(10)) {}
55
56		67	void process(Rows<const uint8_t> src_rows, Rows<int16_t> dst_rows,
57			size_t y_begin, size_t y_end) {
58	2/2 ✓ Branch 0 taken 67 times. ✓ Branch 1 taken 1462 times.	1529	for (size_t i = y_begin; i < y_end; ++i) {
59		1462	process_vertical(src_rows.at(static_cast<ptrdiff_t>(i)));
60		1462	process_horizontal(dst_rows.at(static_cast<ptrdiff_t>(i)));
61		1462	}
62		67	}
63
64			private:
65		151	BufferVector4Type vertical_vector_path(SourceVectorType src[3]) {
66			// Horizontal derivative approximation
67		151	uint16x8_t hori_acc_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[2]));
68		302	uint16x8_t hori_acc_h =
69		151	vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[2]));
70
71		151	hori_acc_l = vmulq_u16(hori_acc_l, const_3_s16_);
72		151	hori_acc_h = vmulq_u16(hori_acc_h, const_3_s16_);
73
74		151	hori_acc_l =
75		151	vmlal_u8(hori_acc_l, vget_low_u8(src[1]), vget_low_u8(const_10_u8_));
76		151	hori_acc_h = vmlal_high_u8(hori_acc_h, src[1], const_10_u8_);
77
78			// Vertical derivative approximation
79		151	uint16x8_t vert_acc_l = vsubl_u8(vget_low_u8(src[2]), vget_low_u8(src[0]));
80		302	uint16x8_t vert_acc_h =
81		151	vsubl_u8(vget_high_u8(src[2]), vget_high_u8(src[0]));
82
83		151	return {
84		453	vreinterpretq_s16_u16(hori_acc_l), vreinterpretq_s16_u16(hori_acc_h),
85		302	vreinterpretq_s16_u16(vert_acc_l), vreinterpretq_s16_u16(vert_acc_h)};
86		151	}
87
88		1462	void process_vertical(Rows<const uint8_t> src_rows) {
89		1462	LoopUnroll2 loop{width_ * src_rows.channels(), kSourceVecNumLanes};
90
91		1613	loop.unroll_once([&](ptrdiff_t index) {
92		151	SourceVectorType src[3];
93		151	src[0] = vld1q(&src_rows.at(0)[index]);
94		151	src[1] = vld1q(&src_rows.at(1)[index]);
95		151	src[2] = vld1q(&src_rows.at(2)[index]);
96
97		151	BufferVector4Type res = vertical_vector_path(src);
98
99		151	vst1q(&hori_deriv_buffer_[index], res.val[0]);
100		151	vst1q(&hori_deriv_buffer_[index + kBufferVecNumLanes], res.val[1]);
101		151	vst1q(&vert_deriv_buffer_[index], res.val[2]);
102		151	vst1q(&vert_deriv_buffer_[index + kBufferVecNumLanes], res.val[3]);
103		151	});
104
105		9405	loop.tail([&](ptrdiff_t index) {
106		7943	hori_deriv_buffer_[index] = static_cast<BufferType>(
107		15886	(src_rows.at(0)[index] + src_rows.at(2)[index]) * 3 +
108		7943	src_rows.at(1)[index] * 10);
109
110		7943	vert_deriv_buffer_[index] = static_cast<BufferType>(
111		7943	src_rows.at(2)[index] - src_rows.at(0)[index]);
112		7943	});
113		1462	}
114
115		500	DestinationVectorType horizontal_vector_path_hori_approx(
116			BufferVectorType buff[2]) {
117		500	return vsubq_s16(buff[1], buff[0]);
118			}
119
120		500	DestinationVectorType horizontal_vector_path_vert_approx(
121			BufferVectorType buff[3]) {
122		500	BufferVectorType a = vaddq_u16(buff[0], buff[2]);
123		500	a = vaddq_u16(a, vaddq_u16(a, a));
124		1000	return vmlaq_u16(a, buff[1], const_10_s16_);
125		500	}
126
127		1462	void process_horizontal(Rows<int16_t> dst_rows) {
128			// width is decremented by 2 as the result has less columns.
129		1462	LoopUnroll2 loop{(width_ - 2) * hori_deriv_buffer_.channels(),
130			kBufferVecNumLanes};
131
132		1962	loop.unroll_once([&](ptrdiff_t index) {
133		500	BufferVectorType hori_buff[2];
134		500	hori_buff[0] = vld1q(&hori_deriv_buffer_[index]);
135		500	hori_buff[1] = vld1q(&hori_deriv_buffer_[index + 2]);
136		1000	DestinationVectorType hori_approx_res =
137		500	horizontal_vector_path_hori_approx(hori_buff);
138
139		500	BufferVectorType vert_buff[3];
140		500	vert_buff[0] = vld1q(&vert_deriv_buffer_[index]);
141		500	vert_buff[1] = vld1q(&vert_deriv_buffer_[index + 1]);
142		500	vert_buff[2] = vld1q(&vert_deriv_buffer_[index + 2]);
143		1000	DestinationVectorType vert_approx_res =
144		500	horizontal_vector_path_vert_approx(vert_buff);
145
146		1000	vst1q(&dst_rows.at(0, index)[0],
147		500	vzip1q_s16(hori_approx_res, vert_approx_res));
148		1000	vst1q(&dst_rows.at(0, index)[DestinationVecTraits::num_lanes()],
149		500	vzip2q_s16(hori_approx_res, vert_approx_res));
150		500	});
151
152		4897	loop.tail([&](ptrdiff_t index) {
153		3435	dst_rows.at(0, index)[0] = static_cast<DestinationType>(
154			// For some reason clang-tidy thinks these accesses are invalid
155			// NOLINTBEGIN(clang-analyzer-core.uninitialized.Assign,
156			// clang-analyzer-core.UndefinedBinaryOperatorResult)
157		3435	hori_deriv_buffer_[index + 2] - hori_deriv_buffer_[index]);
158			// NOLINTEND(clang-analyzer-core.uninitialized.Assign,
159			// clang-analyzer-core.UndefinedBinaryOperatorResult)
160
161		3435	dst_rows.at(0, index)[1] = static_cast<DestinationType>(
162		6870	(vert_deriv_buffer_[index] + vert_deriv_buffer_[index + 2]) * 3 +
163		3435	vert_deriv_buffer_[index + 1] * 10);
164		3435	});
165		1462	}
166
167			Rows<int16_t> hori_deriv_buffer_;
168			Rows<int16_t> vert_deriv_buffer_;
169			size_t width_;
170			int16x8_t const_3_s16_;
171			uint8x16_t const_10_u8_;
172			int16x8_t const_10_s16_;
173
174			static constexpr ptrdiff_t kSourceVecNumLanes =
175			static_cast<ptrdiff_t>(SourceVecTraits::num_lanes());
176			static constexpr ptrdiff_t kBufferVecNumLanes =
177			static_cast<ptrdiff_t>(BufferVecTraits::num_lanes());
178			}; // end of class ScharrInterleaved
179
180			class ScharrBufferDeleter {
181			public:
182		67	void operator()(void *ptr) const { std::free(ptr); }
183			};
184
185			KLEIDICV_TARGET_FN_ATTRS
186		73	kleidicv_error_t kleidicv_scharr_interleaved_stripe_s16_u8(
187			const uint8_t *src, size_t src_stride, size_t src_width, size_t src_height,
188			size_t src_channels, int16_t *dst, size_t dst_stride, size_t y_begin,
189			size_t y_end) {
190			// Does not include checks for whether the operation is implemented.
191			// This must be done earlier, by scharr_interleaved_is_implemented.
192	4/4 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 72 times. ✓ Branch 2 taken 1 times. ✓ Branch 3 taken 72 times.	73	CHECK_POINTER_AND_STRIDE(src, src_stride, src_height);
193	4/4 ✓ Branch 0 taken 2 times. ✓ Branch 1 taken 70 times. ✓ Branch 2 taken 2 times. ✓ Branch 3 taken 70 times.	72	CHECK_POINTER_AND_STRIDE(dst, dst_stride, src_height);
194	6/6 ✓ Branch 0 taken 1 times. ✓ Branch 1 taken 69 times. ✓ Branch 2 taken 1 times. ✓ Branch 3 taken 68 times. ✓ Branch 4 taken 2 times. ✓ Branch 5 taken 68 times.	70	CHECK_IMAGE_SIZE(src_width, src_height);
195
196		68	size_t buffer_stride = src_width * src_channels * sizeof(int16_t);
197			// Buffer has two rows, one for the horizontal derivative approximation, one
198			// for the vertical one.
199		68	size_t buffer_height = 2;
200			// Memory is allocated with malloc to avoid its initialization.
201		68	void *allocation = std::malloc(buffer_stride * buffer_height);
202
203	2/2 ✓ Branch 0 taken 67 times. ✓ Branch 1 taken 1 times.	68	if (!allocation) {
204		1	return KLEIDICV_ERROR_ALLOCATION;
205			}
206
207		134	std::unique_ptr<int16_t, ScharrBufferDeleter> buffer(
208		67	reinterpret_cast<int16_t *>(allocation));
209
210		67	Rows<const uint8_t> src_rows{src, src_stride, src_channels};
211
212			// Result is treated as it has double the channel number compared to the
213			// input.
214		67	Rows<int16_t> dst_rows{dst, dst_stride, src_channels * 2};
215
216		67	Rows<int16_t> hori_deriv_buffer{buffer.get(), buffer_stride, src_channels};
217
218		134	int16_t *vert_deriv_ptr = reinterpret_cast<int16_t *>(
219		67	reinterpret_cast<uint8_t *>(buffer.get()) + buffer_stride);
220		67	Rows<int16_t> vert_deriv_buffer{vert_deriv_ptr, buffer_stride, src_channels};
221
222		134	ScharrInterleaved(hori_deriv_buffer, vert_deriv_buffer, src_width)
223		67	.process(src_rows, dst_rows, y_begin, y_end);
224
225		67	return KLEIDICV_OK;
226		73	}
227
228			} // namespace kleidicv::neon
229