KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/gaussian_blur_arbitrary_neon.cpp
Date: 2025-11-25 17:23:32
Exec Total Coverage
Lines: 288 288 100.0%
Functions: 21 21 100.0%
Branches: 34 34 100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cassert>
6 #include <cstddef>
7
8 #include "border_generic_neon.h"
9 #include "kleidicv/config.h"
10 #include "kleidicv/ctypes.h"
11 #include "kleidicv/filters/gaussian_blur.h"
12 #include "kleidicv/filters/sigma.h"
13 #include "kleidicv/neon.h"
14 #include "kleidicv/workspace/border_types.h"
15 #include "kleidicv/workspace/separable.h"
16
17 namespace kleidicv::neon {
18
19 // Template for arbitrary kernel size Gaussian Blur filters.
20 template <typename ScalarType, FixedBorderType>
21 class GaussianBlurArbitrary;
22
23 template <FixedBorderType BorderT>
24 class GaussianBlurArbitrary<uint8_t, BorderT> {
25 public:
26 using SourceType = uint8_t;
27 using BufferType = uint8_t;
28 using DestinationType = uint8_t;
29 using SourceVecTraits = typename neon::VecTraits<SourceType>;
30 using SourceVectorType = typename SourceVecTraits::VectorType;
31 using BufferVecTraits = typename neon::VecTraits<BufferType>;
32 using BufferVectorType = typename BufferVecTraits::VectorType;
33 using BorderType = FixedBorderType;
34
35 52 GaussianBlurArbitrary(const uint16_t *half_kernel, ptrdiff_t half_kernel_size,
36 Rectangle &rect, size_t channels)
37 52 : half_kernel_size_(half_kernel_size),
38 52 half_kernel_u16_(half_kernel),
39 52 width_(static_cast<ptrdiff_t>(rect.width())),
40 52 vertical_border_(rect.height()),
41 52 horizontal_border_(rect.width(), channels) {}
42
43 // Not border-affected parts
44 208 void process_arbitrary_vertical(size_t width, Rows<const SourceType> src_rows,
45 Rows<BufferType> buffer_rows) const {
46 416 LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
47 208 SourceVecTraits::num_lanes()};
48
49 824 loop.unroll_once([&](size_t index) {
50 616 vertical_vector_path(src_rows, buffer_rows, index);
51 616 });
52
53 260 loop.tail([&](size_t index) {
54 52 vertical_scalar_path(src_rows, buffer_rows, index);
55 52 });
56 208 }
57
58 // Border-affected parts
59 480 void process_arbitrary_border_vertical(size_t width,
60 Rows<const SourceType> src_rows,
61 ptrdiff_t row_index,
62 Rows<BufferType> buffer_rows) const {
63 960 LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
64 480 SourceVecTraits::num_lanes()};
65
66 2680 loop.unroll_once([&](size_t column_index) {
67 4400 vertical_border_vector_path(src_rows, buffer_rows, row_index,
68 2200 column_index);
69 2200 });
70
71 1000 loop.tail([&](size_t column_index) {
72 1040 vertical_border_scalar_path(src_rows, buffer_rows, row_index,
73 520 column_index);
74 520 });
75 480 }
76
77 688 void process_arbitrary_horizontal(
78 size_t width, size_t kernel_size, Rows<BufferType> buffer_rows,
79 Rows<DestinationType> dst_rows) KLEIDICV_STREAMING {
80 688 size_t x = 0;
81 // Assume that there is always a widening when calculating, so the
82 // horizontal vector path processes double-width vectors
83 688 const size_t num_lanes = BufferVecTraits::num_lanes() / 2;
84 688 const size_t block_len = num_lanes;
85 688 const size_t margin = kernel_size / 2;
86 688 const size_t border_len = buffer_rows.channels() * margin;
87 1376 const size_t border_process_len =
88 688 ((border_len + block_len - 1) / block_len) * block_len;
89
90
2/2
✓ Branch 0 taken 848 times.
✓ Branch 1 taken 688 times.
1536 for (; x < border_process_len; x += num_lanes) {
91 848 horizontal_left_border_vector_path(buffer_rows, dst_rows, x);
92 848 }
93
94 // Process data which is not affected by any borders in bulk.
95
2/2
✓ Branch 0 taken 604 times.
✓ Branch 1 taken 84 times.
688 if (width * buffer_rows.channels() > 2 * border_process_len) {
96 1208 size_t total_width_without_borders =
97 604 width * buffer_rows.channels() - 2 * border_process_len;
98
99 1208 LoopUnroll2<TryToAvoidTailLoop> loop{total_width_without_borders,
100 604 BufferVecTraits::num_lanes()};
101
102 1088 loop.unroll_twice([&](size_t index) {
103 484 horizontal_vector_path(buffer_rows, dst_rows, x + index);
104 968 horizontal_vector_path(buffer_rows, dst_rows,
105 484 x + index + BufferVecTraits::num_lanes());
106 484 });
107
108 1564 loop.unroll_once([&](size_t index) {
109 960 horizontal_vector_path(buffer_rows, dst_rows, x + index);
110 960 });
111
112 928 loop.tail([&](size_t index) {
113 324 horizontal_scalar_path(buffer_rows, dst_rows, x + index);
114 324 });
115
116 604 x += total_width_without_borders;
117 604 } else {
118 // rewind if needed, so we'll have exact vector paths at the right side
119 84 x = width * buffer_rows.channels() - border_process_len;
120 }
121
122
2/2
✓ Branch 0 taken 848 times.
✓ Branch 1 taken 688 times.
1536 for (; x < width * buffer_rows.channels(); x += num_lanes) {
123 848 horizontal_right_border_vector_path(buffer_rows, dst_rows, x);
124 848 }
125 688 }
126
127 private:
128 616 void vertical_vector_path(Rows<const SourceType> src_rows,
129 Rows<BufferType> dst_rows, ptrdiff_t x) const {
130 616 uint8x16_t src_mid = vld1q_u8(&src_rows[x]);
131 1232 uint8x8_t half_kernel_mid = vdup_n_u8(
132 616 static_cast<uint8_t>(half_kernel_u16_[half_kernel_size_ - 1]));
133 616 uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid);
134 616 uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid);
135
136 616 ptrdiff_t i = 0;
137 // Unroll 4 times
138
2/2
✓ Branch 0 taken 616 times.
✓ Branch 1 taken 616 times.
1232 for (; i < half_kernel_size_ - 4; i += 4) {
139 616 uint8x16_t src_i = vld1q_u8(&src_rows.at(i - half_kernel_size_ + 1)[x]);
140 616 uint8x16_t src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 1)[x]);
141 616 uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
142 616 uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
143 616 uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
144 616 uint16x8_t prod0_l = vmulq_u16(vec_l, coeff);
145 616 uint16x8_t prod0_h = vmulq_u16(vec_h, coeff);
146
147 616 src_i = vld1q_u8(&src_rows.at(i + 2 - half_kernel_size_)[x]);
148 616 src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 2)[x]);
149 616 vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
150 616 vec_h = vaddl_high_u8(src_i, src_j);
151 616 coeff = vdupq_n_u16(half_kernel_u16_[i + 1]);
152 616 uint16x8_t prod1_l = vmulq_u16(vec_l, coeff);
153 616 uint16x8_t prod1_h = vmulq_u16(vec_h, coeff);
154
155 616 src_i = vld1q_u8(&src_rows.at(i + 3 - half_kernel_size_)[x]);
156 616 src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 3)[x]);
157 616 vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
158 616 vec_h = vaddl_high_u8(src_i, src_j);
159 616 coeff = vdupq_n_u16(half_kernel_u16_[i + 2]);
160 616 uint16x8_t prod2_l = vmulq_u16(vec_l, coeff);
161 616 uint16x8_t prod2_h = vmulq_u16(vec_h, coeff);
162
163 616 src_i = vld1q_u8(&src_rows.at(i + 4 - half_kernel_size_)[x]);
164 616 src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 4)[x]);
165 616 vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
166 616 vec_h = vaddl_high_u8(src_i, src_j);
167 616 coeff = vdupq_n_u16(half_kernel_u16_[i + 3]);
168 616 uint16x8_t prod3_l = vmulq_u16(vec_l, coeff);
169 616 uint16x8_t prod3_h = vmulq_u16(vec_h, coeff);
170
171 616 uint16x8_t acc0_l = vaddq_u16(prod0_l, prod1_l);
172 616 uint16x8_t acc0_h = vaddq_u16(prod0_h, prod1_h);
173 616 uint16x8_t acc1_l = vaddq_u16(prod2_l, prod3_l);
174 616 uint16x8_t acc1_h = vaddq_u16(prod2_h, prod3_h);
175
176 616 uint16x8_t acc_new_l = vaddq_u16(acc0_l, acc1_l);
177 616 uint16x8_t acc_new_h = vaddq_u16(acc0_h, acc1_h);
178
179 616 acc_l = vaddq_u16(acc_l, acc_new_l);
180 616 acc_h = vaddq_u16(acc_h, acc_new_h);
181 616 }
182
183
2/2
✓ Branch 0 taken 616 times.
✓ Branch 1 taken 616 times.
1232 for (; i < half_kernel_size_ - 1; ++i) {
184 616 uint8x16_t src_i = vld1q_u8(&src_rows.at(i - half_kernel_size_ + 1)[x]);
185 616 uint8x16_t src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 1)[x]);
186 616 uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
187 616 uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
188 616 uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
189 616 acc_l = vmlaq_u16(acc_l, vec_l, coeff);
190 616 acc_h = vmlaq_u16(acc_h, vec_h, coeff);
191 616 }
192
193 // Rounding
194 616 acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128));
195 616 acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128));
196 // Keep only the highest 8 bits
197 1232 uint8x16_t result =
198 616 vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
199 616 neon::VecTraits<uint8_t>::store(result, &dst_rows[x]);
200 616 }
201
202 // Where y is affected by border
203 2200 void vertical_border_vector_path(Rows<const SourceType> src_rows,
204 Rows<BufferType> dst_rows, ptrdiff_t y,
205 ptrdiff_t x) const {
206 2200 uint8x16_t src_mid = vld1q_u8(&src_rows.at(y)[x]);
207 4400 uint8x8_t half_kernel_mid = vdup_n_u8(
208 2200 static_cast<uint8_t>(half_kernel_u16_[half_kernel_size_ - 1]));
209 2200 uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid);
210 2200 uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid);
211
212 2200 ptrdiff_t i = 0;
213
2/2
✓ Branch 0 taken 11000 times.
✓ Branch 1 taken 2200 times.
13200 for (; i < half_kernel_size_ - 1; ++i) {
214 11000 uint8x16_t src_i = vld1q_u8(&src_rows.at(
215 vertical_border_.get_row(y - half_kernel_size_ + 1 + i))[x]);
216 11000 uint8x16_t src_j = vld1q_u8(&src_rows.at(
217 vertical_border_.get_row(y + half_kernel_size_ - 1 - i))[x]);
218 11000 uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
219 11000 uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
220 11000 uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
221 11000 acc_l = vmlaq_u16(acc_l, vec_l, coeff);
222 11000 acc_h = vmlaq_u16(acc_h, vec_h, coeff);
223 11000 }
224
225 // Rounding
226 2200 acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128));
227 2200 acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128));
228 // Keep only the highest 8 bits
229 4400 uint8x16_t result =
230 2200 vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
231 2200 neon::VecTraits<uint8_t>::store(result, &dst_rows[x]);
232 2200 }
233
234 52 void vertical_scalar_path(Rows<const SourceType> src_rows,
235 Rows<BufferType> dst_rows, ptrdiff_t x) const {
236 104 uint32_t acc = static_cast<uint32_t>(src_rows[x]) *
237 52 half_kernel_u16_[half_kernel_size_ - 1];
238
239
2/2
✓ Branch 0 taken 52 times.
✓ Branch 1 taken 260 times.
312 for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) {
240 260 acc +=
241 520 (static_cast<uint32_t>(src_rows.at(i + 1 - half_kernel_size_)[x]) +
242 520 static_cast<uint32_t>(src_rows.at(half_kernel_size_ - i - 1)[x])) *
243 260 half_kernel_u16_[i];
244 260 }
245
246 52 dst_rows[x] = static_cast<BufferType>(rounding_shift_right(acc, 8));
247 52 }
248
249 520 void vertical_border_scalar_path(Rows<const SourceType> src_rows,
250 Rows<BufferType> dst_rows, ptrdiff_t y,
251 ptrdiff_t x) const {
252 1040 uint32_t acc = static_cast<uint32_t>(src_rows.at(y)[x]) *
253 520 half_kernel_u16_[half_kernel_size_ - 1];
254
255
2/2
✓ Branch 0 taken 520 times.
✓ Branch 1 taken 2600 times.
3120 for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) {
256 10400 acc += (static_cast<uint32_t>(src_rows.at(
257 7800 vertical_border_.get_row(y + i + 1 - half_kernel_size_))[x]) +
258 7800 static_cast<uint32_t>(src_rows.at(vertical_border_.get_row(
259 7800 y + half_kernel_size_ - i - 1))[x])) *
260 2600 half_kernel_u16_[i];
261 2600 }
262
263 520 dst_rows[x] = static_cast<BufferType>(rounding_shift_right(acc, 8));
264 520 }
265
266 1928 void horizontal_vector_path(Rows<BufferType> src_rows,
267 Rows<DestinationType> dst_rows,
268 ptrdiff_t x) const {
269 // very similar to the vertical path, the difference is only the loading
270 // pattern
271 1928 uint8x16_t src_mid = vld1q_u8(&src_rows[x]);
272 3856 uint8x8_t half_kernel_mid = vdup_n_u8(
273 1928 static_cast<uint8_t>(half_kernel_u16_[half_kernel_size_ - 1]));
274 1928 uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid);
275 1928 uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid);
276
277 3856 ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels()),
278 1928 left = x - ch * (half_kernel_size_ - 1),
279 1928 right = x + ch * (half_kernel_size_ - 1);
280
2/2
✓ Branch 0 taken 1928 times.
✓ Branch 1 taken 9640 times.
11568 for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; ++i) {
281 9640 uint8x16_t src_i = vld1q_u8(&src_rows[left + i * ch]);
282 9640 uint8x16_t src_j = vld1q_u8(&src_rows[right - i * ch]);
283 9640 uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
284 9640 uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
285 9640 uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
286 9640 acc_l = vmlaq_u16(acc_l, vec_l, coeff);
287 9640 acc_h = vmlaq_u16(acc_h, vec_h, coeff);
288 9640 }
289
290 // Rounding
291 1928 acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128));
292 1928 acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128));
293 // Keep only the highest 8 bits
294 3856 uint8x16_t result =
295 1928 vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
296 1928 neon::VecTraits<uint8_t>::store(result, &dst_rows[x]);
297 1928 }
298
299 848 void horizontal_left_border_vector_path(Rows<BufferType> src_rows,
300 Rows<DestinationType> dst_rows,
301 ptrdiff_t x) const {
302 // similar to the simple horizontal path, except the loading pattern:
303 // - this is loading indirect columns, and half of that data
304 848 uint16x8_t src_mid = vmovl_u8(vld1_u8(&src_rows[x]));
305 1696 uint16x8_t acc =
306 848 vmulq_n_u16(src_mid, half_kernel_u16_[half_kernel_size_ - 1]);
307
308 848 ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels());
309 848 ptrdiff_t i = 0, left = x - ch * (half_kernel_size_ - 1),
310 848 right = x + ch * (half_kernel_size_ - 1);
311
2/2
✓ Branch 0 taken 3760 times.
✓ Branch 1 taken 848 times.
4608 for (; i * ch + left < 0; ++i) {
312 3760 uint16x8_t src_i = horizontal_border_.load_left(src_rows, left + i * ch);
313 3760 uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch]));
314 3760 uint16x8_t vec = vaddq_u16(src_i, src_j);
315 3760 uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
316 3760 acc = vmlaq_u16(acc, vec, coeff);
317 3760 }
318
319
2/2
✓ Branch 0 taken 480 times.
✓ Branch 1 taken 848 times.
1328 for (; i < half_kernel_size_ - 1; ++i) {
320 480 uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch]));
321 480 uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch]));
322 480 uint16x8_t vec = vaddq_u16(src_i, src_j);
323 480 uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
324 480 acc = vmlaq_u16(acc, vec, coeff);
325 480 }
326
327 // Store only the highest 8 bits
328 848 uint8x8_t result = vrshrn_n_u16(acc, 8);
329 848 vst1_u8(&dst_rows[x], result);
330 848 }
331
332 848 void horizontal_right_border_vector_path(Rows<BufferType> src_rows,
333 Rows<DestinationType> dst_rows,
334 ptrdiff_t x) const {
335 // similar to the simple horizontal path, except the loading pattern:
336 // - this is loading indirect columns, and half of that data
337 848 uint16x8_t src_mid = vmovl_u8(vld1_u8(&src_rows[x]));
338 1696 uint16x8_t acc =
339 848 vmulq_n_u16(src_mid, half_kernel_u16_[half_kernel_size_ - 1]);
340
341 848 ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels());
342 848 ptrdiff_t i = 0, left = x - ch * (half_kernel_size_ - 1),
343 848 right = x + ch * (half_kernel_size_ - 1);
344
2/2
✓ Branch 0 taken 3760 times.
✓ Branch 1 taken 848 times.
4608 for (; right - i * ch > width_ * ch - 8; ++i) {
345 3760 uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch]));
346 7520 uint16x8_t src_j =
347 3760 horizontal_border_.load_right(src_rows, right - i * ch);
348 3760 uint16x8_t vec = vaddq_u16(src_i, src_j);
349 3760 uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
350 3760 acc = vmlaq_u16(acc, vec, coeff);
351 3760 }
352
353
2/2
✓ Branch 0 taken 480 times.
✓ Branch 1 taken 848 times.
1328 for (; i < half_kernel_size_ - 1; ++i) {
354 480 uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch]));
355 480 uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch]));
356 480 uint16x8_t vec = vaddq_u16(src_i, src_j);
357 480 uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
358 480 acc = vmlaq_u16(acc, vec, coeff);
359 480 }
360
361 // Store only the highest 8 bits
362 848 uint8x8_t result = vrshrn_n_u16(acc, 8);
363 848 vst1_u8(&dst_rows[x], result);
364 848 }
365
366 324 void horizontal_scalar_path(Rows<BufferType> src_rows,
367 Rows<DestinationType> dst_rows,
368 ptrdiff_t x) const {
369 648 uint32_t acc = static_cast<uint32_t>(src_rows[x]) *
370 324 half_kernel_u16_[half_kernel_size_ - 1];
371 324 ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels());
372 324 ptrdiff_t channel_offset = x % ch;
373 324 ptrdiff_t left_col = x / ch - (half_kernel_size_ - 1),
374 324 right_col = x / ch + (half_kernel_size_ - 1);
375
376
2/2
✓ Branch 0 taken 324 times.
✓ Branch 1 taken 1620 times.
1944 for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) {
377 1620 acc += (static_cast<uint32_t>(
378 3240 src_rows[horizontal_border_.get_column(left_col + i) * ch +
379 3240 channel_offset]) +
380 static_cast<uint32_t>(
381 3240 src_rows[horizontal_border_.get_column(right_col - i) * ch +
382 3240 channel_offset])) *
383 1620 half_kernel_u16_[i];
384 1620 }
385
386 324 dst_rows[x] = static_cast<DestinationType>(rounding_shift_right(acc, 8));
387 324 }
388
389 const ptrdiff_t half_kernel_size_;
390 const uint16_t *half_kernel_u16_;
391 const ptrdiff_t width_;
392 KLEIDICV_TARGET_NAMESPACE::GenericBorderVertical<BorderT> vertical_border_;
393 KLEIDICV_TARGET_NAMESPACE::GenericBorderHorizontal<BorderT>
394 horizontal_border_;
395 }; // end of class GaussianBlurArbitrary<uint8_t>
396
397 template <typename ScalarType>
398 72 static kleidicv_error_t gaussian_blur_arbitrary_kernel_size(
399 const ScalarType *src, size_t src_stride, ScalarType *dst,
400 size_t dst_stride, Rectangle &rect, size_t kernel_size, size_t y_begin,
401 size_t y_end, size_t channels, float sigma, FixedBorderType border_type,
402 SeparableFilterWorkspace *workspace) {
403 72 Rows<const ScalarType> src_rows{src, src_stride, channels};
404 72 Rows<ScalarType> dst_rows{dst, dst_stride, channels};
405
406 144 const ptrdiff_t kHalfKernelSize =
407 72 static_cast<ptrdiff_t>(get_half_kernel_size(kernel_size));
408 72 uint16_t half_kernel[128];
409 72 generate_gaussian_half_kernel(half_kernel, kHalfKernelSize, sigma);
410 // If sigma is so small that the middle point gets all the weights, it's
411 // just a copy
412
2/2
✓ Branch 0 taken 52 times.
✓ Branch 1 taken 20 times.
72 if (half_kernel[kHalfKernelSize - 1] < 256) {
413 // Only replicated border is implemented so far.
414 104 GaussianBlurArbitrary<ScalarType, FixedBorderType::REPLICATE> filter{
415 52 half_kernel, kHalfKernelSize, rect, src_rows.channels()};
416 104 workspace->process_arbitrary(rect, kernel_size, y_begin, y_end, src_rows,
417 52 dst_rows, channels, border_type, filter);
418 52 } else {
419
2/2
✓ Branch 0 taken 208 times.
✓ Branch 1 taken 20 times.
228 for (size_t row = y_begin; row < y_end; ++row) {
420 416 std::memcpy(static_cast<void *>(&dst_rows.at(row)[0]),
421 208 static_cast<const void *>(&src_rows.at(row)[0]),
422 208 rect.width() * sizeof(ScalarType) * dst_rows.channels());
423 208 }
424 }
425 72 return KLEIDICV_OK;
426 72 }
427
428 KLEIDICV_TARGET_FN_ATTRS
429 88 kleidicv_error_t gaussian_blur_arbitrary_stripe_u8(
430 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
431 size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
432 size_t kernel_width, size_t /*kernel_height*/, float sigma_x,
433 float /*sigma_y*/, FixedBorderType fixed_border_type,
434 kleidicv_filter_context_t *context) {
435 88 auto *workspace = reinterpret_cast<SeparableFilterWorkspace *>(context);
436 176 kleidicv_error_t checks_result = gaussian_blur_checks(
437 88 src, src_stride, dst, dst_stride, width, height, channels, workspace);
438
439
2/2
✓ Branch 0 taken 16 times.
✓ Branch 1 taken 72 times.
88 if (checks_result != KLEIDICV_OK) {
440 16 return checks_result;
441 }
442
443 72 Rectangle rect{width, height};
444
445 72 return gaussian_blur_arbitrary_kernel_size(
446 72 src, src_stride, dst, dst_stride, rect, kernel_width, y_begin, y_end,
447 72 channels, sigma_x, fixed_border_type, workspace);
448 88 }
449
450 } // namespace kleidicv::neon
451