KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/gaussian_blur_arbitrary_neon.cpp
Date: 2025-09-25 14:13:34
            Exec   Total   Coverage
Lines:       288     288     100.0%
Functions:    21      21     100.0%
Branches:     34      34     100.0%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cassert>
6 #include <cstddef>
7
8 #include "border_generic_neon.h"
9 #include "kleidicv/config.h"
10 #include "kleidicv/ctypes.h"
11 #include "kleidicv/filters/gaussian_blur.h"
12 #include "kleidicv/filters/sigma.h"
13 #include "kleidicv/neon.h"
14 #include "kleidicv/workspace/border_types.h"
15 #include "kleidicv/workspace/separable.h"
16
17 namespace kleidicv::neon {
18
19 // Template for arbitrary-kernel-size Gaussian blur filters.
20 template <typename ScalarType, FixedBorderType>
21 class GaussianBlurArbitrary;
22
23 template <FixedBorderType BorderT>
24 class GaussianBlurArbitrary<uint8_t, BorderT> {
25 public:
26 using SourceType = uint8_t;
27 using BufferType = uint8_t;
28 using DestinationType = uint8_t;
29 using SourceVecTraits = typename neon::VecTraits<SourceType>;
30 using SourceVectorType = typename SourceVecTraits::VectorType;
31 using BufferVecTraits = typename neon::VecTraits<BufferType>;
32 using BufferVectorType = typename BufferVecTraits::VectorType;
33 using BorderType = FixedBorderType;
34
35 39 GaussianBlurArbitrary(const uint16_t *half_kernel, ptrdiff_t half_kernel_size,
36 Rectangle &rect, size_t channels)
37 39 : half_kernel_size_(half_kernel_size),
38 39 half_kernel_u16_(half_kernel),
39 39 width_(static_cast<ptrdiff_t>(rect.width())),
40 39 vertical_border_(rect.height()),
41 39 horizontal_border_(rect.width(), channels) {}
42
43 // Parts not affected by the border
44 156 void process_arbitrary_vertical(size_t width, Rows<const SourceType> src_rows,
45 Rows<BufferType> buffer_rows) const {
46 312 LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
47 156 SourceVecTraits::num_lanes()};
48
49 612 loop.unroll_once([&](size_t index) {
50 456 vertical_vector_path(src_rows, buffer_rows, index);
51 456 });
52
53 208 loop.tail([&](size_t index) {
54 52 vertical_scalar_path(src_rows, buffer_rows, index);
55 52 });
56 156 }
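
Note on the LoopUnroll2 helper used above: from these call sites it splits a run of width * channels elements into full vector blocks (unroll_once, or unroll_twice for pairs of blocks) plus a scalar remainder (tail). A minimal sketch of that dispatch under those assumptions; LoopUnroll2 and TryToAvoidTailLoop belong to kleidicv, the helper below is hypothetical:

    #include <cstddef>

    // Hypothetical stand-in for kleidicv's LoopUnroll2: runs `vector_op`
    // once per full block of `lanes` elements, then `scalar_op` once per
    // leftover element. The real helper also offers unroll_twice and a
    // TryToAvoidTailLoop policy that can avoid the scalar tail.
    template <typename VectorOp, typename ScalarOp>
    void unroll_sketch(std::size_t total, std::size_t lanes,
                       VectorOp vector_op, ScalarOp scalar_op) {
      std::size_t index = 0;
      for (; index + lanes <= total; index += lanes) {
        vector_op(index);   // full vector block
      }
      for (; index < total; ++index) {
        scalar_op(index);   // scalar tail
      }
    }
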
57
58 // Parts affected by the border
59 360 void process_arbitrary_border_vertical(size_t width,
60 Rows<const SourceType> src_rows,
61 ptrdiff_t row_index,
62 Rows<BufferType> buffer_rows) const {
63 720 LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
64 360 SourceVecTraits::num_lanes()};
65
66 1760 loop.unroll_once([&](size_t column_index) {
67 2800 vertical_border_vector_path(src_rows, buffer_rows, row_index,
68 1400 column_index);
69 1400 });
70
71 880 loop.tail([&](size_t column_index) {
72 1040 vertical_border_scalar_path(src_rows, buffer_rows, row_index,
73 520 column_index);
74 520 });
75 360 }
76
77 516 void process_arbitrary_horizontal(
78 size_t width, size_t kernel_size, Rows<BufferType> buffer_rows,
79 Rows<DestinationType> dst_rows) KLEIDICV_STREAMING {
80 516 size_t x = 0;
81 // The calculation always widens u8 to u16, so the horizontal vector
82 // path processes double-width elements (half as many lanes per vector)
83 516 const size_t num_lanes = BufferVecTraits::num_lanes() / 2;
84 516 const size_t block_len = num_lanes;
85 516 const size_t margin = kernel_size / 2;
86 516 const size_t border_len = buffer_rows.channels() * margin;
87 1032 const size_t border_process_len =
88 516 ((border_len + block_len - 1) / block_len) * block_len;
89
90 2/2 1152 for (; x < border_process_len; x += num_lanes) {
    ✓ Branch 0 taken 636 times.
    ✓ Branch 1 taken 516 times.
91 636 horizontal_left_border_vector_path(buffer_rows, dst_rows, x);
92 636 }
93
94 // Process data which is not affected by any borders in bulk.
95 2/2 516 if (width * buffer_rows.channels() > 2 * border_process_len) {
    ✓ Branch 0 taken 432 times.
    ✓ Branch 1 taken 84 times.
96 864 size_t total_width_without_borders =
97 432 width * buffer_rows.channels() - 2 * border_process_len;
98
99 864 LoopUnroll2<TryToAvoidTailLoop> loop{total_width_without_borders,
100 432 BufferVecTraits::num_lanes()};
101
102 694 loop.unroll_twice([&](size_t index) {
103 262 horizontal_vector_path(buffer_rows, dst_rows, x + index);
104 524 horizontal_vector_path(buffer_rows, dst_rows,
105 262 x + index + BufferVecTraits::num_lanes());
106 262 });
107
108 1088 loop.unroll_once([&](size_t index) {
109 656 horizontal_vector_path(buffer_rows, dst_rows, x + index);
110 656 });
111
112 756 loop.tail([&](size_t index) {
113 324 horizontal_scalar_path(buffer_rows, dst_rows, x + index);
114 324 });
115
116 432 x += total_width_without_borders;
117 432 } else {
118 // Rewind if needed, so the right side can use exact vector paths
119 84 x = width * buffer_rows.channels() - border_process_len;
120 }
121
122 2/2 1152 for (; x < width * buffer_rows.channels(); x += num_lanes) {
    ✓ Branch 0 taken 636 times.
    ✓ Branch 1 taken 516 times.
123 636 horizontal_right_border_vector_path(buffer_rows, dst_rows, x);
124 636 }
125 516 }
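
Note on the control flow above: the row splits into a left border region rounded up to whole vector blocks, a bulk middle, and a right border region. A scalar sketch of just the index arithmetic (the vector paths are elided; this is illustrative, not the kleidicv implementation):

    #include <cstddef>

    void horizontal_partition_sketch(std::size_t width, std::size_t channels,
                                     std::size_t kernel_size,
                                     std::size_t num_lanes) {
      const std::size_t margin = kernel_size / 2;
      const std::size_t border_len = channels * margin;
      // Left border region, rounded up to whole vector blocks.
      const std::size_t border_process_len =
          ((border_len + num_lanes - 1) / num_lanes) * num_lanes;
      const std::size_t row_len = width * channels;

      std::size_t x = 0;
      for (; x < border_process_len; x += num_lanes) {
        // left-border vector path at x
      }
      if (row_len > 2 * border_process_len) {
        // Bulk vector paths over the middle; afterwards
        // x == row_len - border_process_len.
        x += row_len - 2 * border_process_len;
      } else {
        // Too narrow for a bulk part: rewind so the right-border loop
        // starts on an exact vector boundary, re-covering a few pixels.
        x = row_len - border_process_len;
      }
      for (; x < row_len; x += num_lanes) {
        // right-border vector path at x
      }
    }
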
126
127 private:
128 456 void vertical_vector_path(Rows<const SourceType> src_rows,
129 Rows<BufferType> dst_rows, ptrdiff_t x) const {
130 456 uint8x16_t src_mid = vld1q_u8(&src_rows[x]);
131 912 uint8x8_t half_kernel_mid = vdup_n_u8(
132 456 static_cast<uint8_t>(half_kernel_u16_[half_kernel_size_ - 1]));
133 456 uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid);
134 456 uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid);
135
136 456 ptrdiff_t i = 0;
137 // Unroll 4 times
138 2/2 912 for (; i < half_kernel_size_ - 4; i += 4) {
    ✓ Branch 0 taken 456 times.
    ✓ Branch 1 taken 456 times.
139 456 uint8x16_t src_i = vld1q_u8(&src_rows.at(i - half_kernel_size_ + 1)[x]);
140 456 uint8x16_t src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 1)[x]);
141 456 uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
142 456 uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
143 456 uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
144 456 uint16x8_t prod0_l = vmulq_u16(vec_l, coeff);
145 456 uint16x8_t prod0_h = vmulq_u16(vec_h, coeff);
146
147 456 src_i = vld1q_u8(&src_rows.at(i + 2 - half_kernel_size_)[x]);
148 456 src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 2)[x]);
149 456 vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
150 456 vec_h = vaddl_high_u8(src_i, src_j);
151 456 coeff = vdupq_n_u16(half_kernel_u16_[i + 1]);
152 456 uint16x8_t prod1_l = vmulq_u16(vec_l, coeff);
153 456 uint16x8_t prod1_h = vmulq_u16(vec_h, coeff);
154
155 456 src_i = vld1q_u8(&src_rows.at(i + 3 - half_kernel_size_)[x]);
156 456 src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 3)[x]);
157 456 vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
158 456 vec_h = vaddl_high_u8(src_i, src_j);
159 456 coeff = vdupq_n_u16(half_kernel_u16_[i + 2]);
160 456 uint16x8_t prod2_l = vmulq_u16(vec_l, coeff);
161 456 uint16x8_t prod2_h = vmulq_u16(vec_h, coeff);
162
163 456 src_i = vld1q_u8(&src_rows.at(i + 4 - half_kernel_size_)[x]);
164 456 src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 4)[x]);
165 456 vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
166 456 vec_h = vaddl_high_u8(src_i, src_j);
167 456 coeff = vdupq_n_u16(half_kernel_u16_[i + 3]);
168 456 uint16x8_t prod3_l = vmulq_u16(vec_l, coeff);
169 456 uint16x8_t prod3_h = vmulq_u16(vec_h, coeff);
170
171 456 uint16x8_t acc0_l = vaddq_u16(prod0_l, prod1_l);
172 456 uint16x8_t acc0_h = vaddq_u16(prod0_h, prod1_h);
173 456 uint16x8_t acc1_l = vaddq_u16(prod2_l, prod3_l);
174 456 uint16x8_t acc1_h = vaddq_u16(prod2_h, prod3_h);
175
176 456 uint16x8_t acc_new_l = vaddq_u16(acc0_l, acc1_l);
177 456 uint16x8_t acc_new_h = vaddq_u16(acc0_h, acc1_h);
178
179 456 acc_l = vaddq_u16(acc_l, acc_new_l);
180 456 acc_h = vaddq_u16(acc_h, acc_new_h);
181 456 }
182
183 2/2 912 for (; i < half_kernel_size_ - 1; ++i) {
    ✓ Branch 0 taken 456 times.
    ✓ Branch 1 taken 456 times.
184 456 uint8x16_t src_i = vld1q_u8(&src_rows.at(i - half_kernel_size_ + 1)[x]);
185 456 uint8x16_t src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 1)[x]);
186 456 uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
187 456 uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
188 456 uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
189 456 acc_l = vmlaq_u16(acc_l, vec_l, coeff);
190 456 acc_h = vmlaq_u16(acc_h, vec_h, coeff);
191 456 }
192
193 // Rounding
194 456 acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128));
195 456 acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128));
196 // Keep only the highest 8 bits
197 912 uint8x16_t result =
198 456 vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
199 456 neon::VecTraits<uint8_t>::store(result, &dst_rows[x]);
200 456 }
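
Note on the epilogue: the kernel weights are Q8 fixed point and sum to 256, so the u16 accumulator holds pixel * 256. Adding 128 with a saturating add and keeping the high byte of each u16 lane (which is what vuzp2q_u8 extracts on little-endian AArch64) is a rounding division by 256. A scalar model of that arithmetic, assuming Q8 weights:

    #include <cstdint>

    // Scalar model of the NEON rounding epilogue: the blurred pixel is
    // acc / 256 rounded to nearest; the high byte of (acc + 128) is
    // exactly that, and the clamp mirrors the saturating vqaddq_u16.
    inline std::uint8_t round_q8(std::uint16_t acc) {
      std::uint32_t rounded = static_cast<std::uint32_t>(acc) + 128;
      if (rounded > 0xFFFF) rounded = 0xFFFF;
      return static_cast<std::uint8_t>(rounded >> 8);
    }
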
201
202 // Where y is affected by border
203 1400 void vertical_border_vector_path(Rows<const SourceType> src_rows,
204 Rows<BufferType> dst_rows, ptrdiff_t y,
205 ptrdiff_t x) const {
206 1400 uint8x16_t src_mid = vld1q_u8(&src_rows.at(y)[x]);
207 2800 uint8x8_t half_kernel_mid = vdup_n_u8(
208 1400 static_cast<uint8_t>(half_kernel_u16_[half_kernel_size_ - 1]));
209 1400 uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid);
210 1400 uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid);
211
212 1400 ptrdiff_t i = 0;
213 2/2 8400 for (; i < half_kernel_size_ - 1; ++i) {
    ✓ Branch 0 taken 7000 times.
    ✓ Branch 1 taken 1400 times.
214 7000 uint8x16_t src_i = vld1q_u8(&src_rows.at(
215 vertical_border_.get_row(y - half_kernel_size_ + 1 + i))[x]);
216 7000 uint8x16_t src_j = vld1q_u8(&src_rows.at(
217 vertical_border_.get_row(y + half_kernel_size_ - 1 - i))[x]);
218 7000 uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
219 7000 uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
220 7000 uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
221 7000 acc_l = vmlaq_u16(acc_l, vec_l, coeff);
222 7000 acc_h = vmlaq_u16(acc_h, vec_h, coeff);
223 7000 }
224
225 // Rounding
226 1400 acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128));
227 1400 acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128));
228 // Keep only the highest 8 bits
229 2800 uint8x16_t result =
230 1400 vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
231 1400 neon::VecTraits<uint8_t>::store(result, &dst_rows[x]);
232 1400 }
233
234 52 void vertical_scalar_path(Rows<const SourceType> src_rows,
235 Rows<BufferType> dst_rows, ptrdiff_t x) const {
236 104 uint32_t acc = static_cast<uint32_t>(src_rows[x]) *
237 52 half_kernel_u16_[half_kernel_size_ - 1];
238
239 2/2 312 for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) {
    ✓ Branch 0 taken 52 times.
    ✓ Branch 1 taken 260 times.
240 260 acc +=
241 520 (static_cast<uint32_t>(src_rows.at(i + 1 - half_kernel_size_)[x]) +
242 520 static_cast<uint32_t>(src_rows.at(half_kernel_size_ - i - 1)[x])) *
243 260 half_kernel_u16_[i];
244 260 }
245
246 52 dst_rows[x] = static_cast<BufferType>(rounding_shift_right(acc, 8));
247 52 }
248
249 520 void vertical_border_scalar_path(Rows<const SourceType> src_rows,
250 Rows<BufferType> dst_rows, ptrdiff_t y,
251 ptrdiff_t x) const {
252 1040 uint32_t acc = static_cast<uint32_t>(src_rows.at(y)[x]) *
253 520 half_kernel_u16_[half_kernel_size_ - 1];
254
255 2/2 3120 for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) {
    ✓ Branch 0 taken 520 times.
    ✓ Branch 1 taken 2600 times.
256 10400 acc += (static_cast<uint32_t>(src_rows.at(
257 7800 vertical_border_.get_row(y + i + 1 - half_kernel_size_))[x]) +
258 7800 static_cast<uint32_t>(src_rows.at(vertical_border_.get_row(
259 7800 y + half_kernel_size_ - i - 1))[x])) *
260 2600 half_kernel_u16_[i];
261 2600 }
262
263 520 dst_rows[x] = static_cast<BufferType>(rounding_shift_right(acc, 8));
264 520 }
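
Note on vertical_border_: it maps out-of-range row indices back into the image. Only FixedBorderType::REPLICATE is instantiated here (see the comment near line 413), for which get_row plausibly clamps to the valid range. A hedged sketch, not the actual GenericBorderVertical implementation:

    #include <cstddef>

    // Hypothetical replicate-border row mapping: rows above the image
    // map to row 0, rows below it map to the last row, so e.g.
    // replicate_get_row(-2, h) == 0 and replicate_get_row(h + 1, h) == h - 1.
    inline std::ptrdiff_t replicate_get_row(std::ptrdiff_t row,
                                            std::ptrdiff_t height) {
      if (row < 0) return 0;
      if (row >= height) return height - 1;
      return row;
    }
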
265
266 1180 void horizontal_vector_path(Rows<BufferType> src_rows,
267 Rows<DestinationType> dst_rows,
268 ptrdiff_t x) const {
269 // Very similar to the vertical path; only the loading pattern
270 // differs
271 1180 uint8x16_t src_mid = vld1q_u8(&src_rows[x]);
272 2360 uint8x8_t half_kernel_mid = vdup_n_u8(
273 1180 static_cast<uint8_t>(half_kernel_u16_[half_kernel_size_ - 1]));
274 1180 uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid);
275 1180 uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid);
276
277 2360 ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels()),
278 1180 left = x - ch * (half_kernel_size_ - 1),
279 1180 right = x + ch * (half_kernel_size_ - 1);
280 2/2 7080 for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; ++i) {
    ✓ Branch 0 taken 1180 times.
    ✓ Branch 1 taken 5900 times.
281 5900 uint8x16_t src_i = vld1q_u8(&src_rows[left + i * ch]);
282 5900 uint8x16_t src_j = vld1q_u8(&src_rows[right - i * ch]);
283 5900 uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
284 5900 uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
285 5900 uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
286 5900 acc_l = vmlaq_u16(acc_l, vec_l, coeff);
287 5900 acc_h = vmlaq_u16(acc_h, vec_h, coeff);
288 5900 }
289
290 // Rounding
291 1180 acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128));
292 1180 acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128));
293 // Keep only the highest 8 bits
294 2360 uint8x16_t result =
295 1180 vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
296 1180 neon::VecTraits<uint8_t>::store(result, &dst_rows[x]);
297 1180 }
298
299 636 void horizontal_left_border_vector_path(Rows<BufferType> src_rows,
300 Rows<DestinationType> dst_rows,
301 ptrdiff_t x) const {
302 // Similar to the plain horizontal path, except for the loading pattern:
303 // - columns are loaded indirectly, and only half a vector at a time
304 636 uint16x8_t src_mid = vmovl_u8(vld1_u8(&src_rows[x]));
305 1272 uint16x8_t acc =
306 636 vmulq_n_u16(src_mid, half_kernel_u16_[half_kernel_size_ - 1]);
307
308 636 ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels());
309 636 ptrdiff_t i = 0, left = x - ch * (half_kernel_size_ - 1),
310 636 right = x + ch * (half_kernel_size_ - 1);
311 2/2 3456 for (; i * ch + left < 0; ++i) {
    ✓ Branch 0 taken 2820 times.
    ✓ Branch 1 taken 636 times.
312 2820 uint16x8_t src_i = horizontal_border_.load_left(src_rows, left + i * ch);
313 2820 uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch]));
314 2820 uint16x8_t vec = vaddq_u16(src_i, src_j);
315 2820 uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
316 2820 acc = vmlaq_u16(acc, vec, coeff);
317 2820 }
318
319 2/2 996 for (; i < half_kernel_size_ - 1; ++i) {
    ✓ Branch 0 taken 360 times.
    ✓ Branch 1 taken 636 times.
320 360 uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch]));
321 360 uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch]));
322 360 uint16x8_t vec = vaddq_u16(src_i, src_j);
323 360 uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
324 360 acc = vmlaq_u16(acc, vec, coeff);
325 360 }
326
327 // Store only the highest 8 bits
328 636 uint8x8_t result = vrshrn_n_u16(acc, 8);
329 636 vst1_u8(&dst_rows[x], result);
330 636 }
331
332 636 void horizontal_right_border_vector_path(Rows<BufferType> src_rows,
333 Rows<DestinationType> dst_rows,
334 ptrdiff_t x) const {
335 // Similar to the plain horizontal path, except for the loading pattern:
336 // - columns are loaded indirectly, and only half a vector at a time
337 636 uint16x8_t src_mid = vmovl_u8(vld1_u8(&src_rows[x]));
338 1272 uint16x8_t acc =
339 636 vmulq_n_u16(src_mid, half_kernel_u16_[half_kernel_size_ - 1]);
340
341 636 ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels());
342 636 ptrdiff_t i = 0, left = x - ch * (half_kernel_size_ - 1),
343 636 right = x + ch * (half_kernel_size_ - 1);
344 2/2 3456 for (; right - i * ch > width_ * ch - 8; ++i) {
    ✓ Branch 0 taken 2820 times.
    ✓ Branch 1 taken 636 times.
345 2820 uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch]));
346 5640 uint16x8_t src_j =
347 2820 horizontal_border_.load_right(src_rows, right - i * ch);
348 2820 uint16x8_t vec = vaddq_u16(src_i, src_j);
349 2820 uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
350 2820 acc = vmlaq_u16(acc, vec, coeff);
351 2820 }
352
353 2/2 996 for (; i < half_kernel_size_ - 1; ++i) {
    ✓ Branch 0 taken 360 times.
    ✓ Branch 1 taken 636 times.
354 360 uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch]));
355 360 uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch]));
356 360 uint16x8_t vec = vaddq_u16(src_i, src_j);
357 360 uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
358 360 acc = vmlaq_u16(acc, vec, coeff);
359 360 }
360
361 // Store only the highest 8 bits
362 636 uint8x8_t result = vrshrn_n_u16(acc, 8);
363 636 vst1_u8(&dst_rows[x], result);
364 636 }
365
366 324 void horizontal_scalar_path(Rows<BufferType> src_rows,
367 Rows<DestinationType> dst_rows,
368 ptrdiff_t x) const {
369 648 uint32_t acc = static_cast<uint32_t>(src_rows[x]) *
370 324 half_kernel_u16_[half_kernel_size_ - 1];
371 324 ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels());
372 324 ptrdiff_t channel_offset = x % ch;
373 324 ptrdiff_t left_col = x / ch - (half_kernel_size_ - 1),
374 324 right_col = x / ch + (half_kernel_size_ - 1);
375
376 2/2 1944 for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) {
    ✓ Branch 0 taken 324 times.
    ✓ Branch 1 taken 1620 times.
377 1620 acc += (static_cast<uint32_t>(
378 3240 src_rows[horizontal_border_.get_column(left_col + i) * ch +
379 3240 channel_offset]) +
380 static_cast<uint32_t>(
381 3240 src_rows[horizontal_border_.get_column(right_col - i) * ch +
382 3240 channel_offset])) *
383 1620 half_kernel_u16_[i];
384 1620 }
385
386 324 dst_rows[x] = static_cast<DestinationType>(rounding_shift_right(acc, 8));
387 324 }
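
Note on the interleaved indexing above: the flat index x decomposes into a column (x / ch) and a channel offset (x % ch); the column is remapped by horizontal_border_.get_column and re-flattened. With ch = 3 and x = 7, for instance, the pixel is column 2, channel 1. A sketch assuming a replicate column mapping (the get_column semantics are an assumption here):

    #include <cstddef>

    // Decompose a flat interleaved index, remap the column through a
    // hypothetical replicate border, and re-flatten. `width` is in pixels.
    inline std::ptrdiff_t border_sample_index(std::ptrdiff_t x,
                                              std::ptrdiff_t ch,
                                              std::ptrdiff_t width) {
      const std::ptrdiff_t column = x / ch;
      const std::ptrdiff_t channel = x % ch;
      std::ptrdiff_t mapped = column;
      if (mapped < 0) mapped = 0;               // replicate left edge
      if (mapped >= width) mapped = width - 1;  // replicate right edge
      return mapped * ch + channel;
    }
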
388
389 const ptrdiff_t half_kernel_size_;
390 const uint16_t *half_kernel_u16_;
391 const ptrdiff_t width_;
392 KLEIDICV_TARGET_NAMESPACE::GenericBorderVertical<BorderT> vertical_border_;
393 KLEIDICV_TARGET_NAMESPACE::GenericBorderHorizontal<BorderT>
394 horizontal_border_;
395 }; // end of class GaussianBlurArbitrary<uint8_t>
396
397 template <typename ScalarType>
398 54 static kleidicv_error_t gaussian_blur_arbitrary_kernel_size(
399 const ScalarType *src, size_t src_stride, ScalarType *dst,
400 size_t dst_stride, Rectangle &rect, size_t kernel_size, size_t y_begin,
401 size_t y_end, size_t channels, float sigma, FixedBorderType border_type,
402 SeparableFilterWorkspace *workspace) {
403 54 Rows<const ScalarType> src_rows{src, src_stride, channels};
404 54 Rows<ScalarType> dst_rows{dst, dst_stride, channels};
405
406 108 const ptrdiff_t kHalfKernelSize =
407 54 static_cast<ptrdiff_t>(get_half_kernel_size(kernel_size));
408 54 uint16_t half_kernel[128];
409 54 generate_gaussian_half_kernel(half_kernel, kHalfKernelSize, sigma);
410 // If sigma is so small that the middle point gets all the weight, the
411 // result is just a copy
412 2/2 54 if (half_kernel[kHalfKernelSize - 1] < 256) {
    ✓ Branch 0 taken 39 times.
    ✓ Branch 1 taken 15 times.
413 // Only replicated border is implemented so far.
414 78 GaussianBlurArbitrary<ScalarType, FixedBorderType::REPLICATE> filter{
415 39 half_kernel, kHalfKernelSize, rect, src_rows.channels()};
416 78 workspace->process_arbitrary(rect, kernel_size, y_begin, y_end, src_rows,
417 39 dst_rows, channels, border_type, filter);
418 39 } else {
419 2/2 171 for (size_t row = y_begin; row < y_end; ++row) {
    ✓ Branch 0 taken 156 times.
    ✓ Branch 1 taken 15 times.
420 312 std::memcpy(static_cast<void *>(&dst_rows.at(row)[0]),
421 156 static_cast<const void *>(&src_rows.at(row)[0]),
422 156 rect.width() * sizeof(ScalarType) * dst_rows.channels());
423 156 }
424 }
425 54 return KLEIDICV_OK;
426 54 }
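
Note on the half kernel: the Gaussian kernel is symmetric, so only half of it is stored; half_kernel[i] holds the weight for the taps i steps in from either end, with the centre weight at half_kernel[kHalfKernelSize - 1]. The weights are Q8 and sum to 256 over the full kernel, which is why a centre weight of 256 implies every other weight is zero and the blur degenerates to a copy. One plausible way such a table could be built (an assumption; not the actual generate_gaussian_half_kernel):

    #include <cmath>
    #include <cstddef>
    #include <cstdint>

    // Hypothetical Q8 Gaussian half-kernel generator: normalise
    // exp(-d^2 / (2 sigma^2)) so the full kernel sums to 256 and keep
    // the first half. A real generator would also fix up rounding so
    // the quantised weights sum to exactly 256.
    void make_half_kernel_sketch(std::uint16_t *half_kernel,
                                 std::ptrdiff_t half_size, float sigma) {
      const std::ptrdiff_t centre = half_size - 1;
      double sum = 0.0;
      for (std::ptrdiff_t i = -centre; i <= centre; ++i) {
        sum += std::exp(-static_cast<double>(i * i) /
                        (2.0 * sigma * sigma));
      }
      for (std::ptrdiff_t i = 0; i < half_size; ++i) {
        const double d = static_cast<double>(i - centre);
        const double w = std::exp(-d * d / (2.0 * sigma * sigma));
        half_kernel[i] =
            static_cast<std::uint16_t>(std::lround(w * 256.0 / sum));
      }
    }
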
427
428 KLEIDICV_TARGET_FN_ATTRS
429 66 kleidicv_error_t gaussian_blur_arbitrary_stripe_u8(
430 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
431 size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
432 size_t kernel_width, size_t /*kernel_height*/, float sigma_x,
433 float /*sigma_y*/, FixedBorderType fixed_border_type,
434 kleidicv_filter_context_t *context) {
435 66 auto *workspace = reinterpret_cast<SeparableFilterWorkspace *>(context);
436 132 kleidicv_error_t checks_result = gaussian_blur_checks(
437 66 src, src_stride, dst, dst_stride, width, height, channels, workspace);
438
439 2/2 66 if (checks_result != KLEIDICV_OK) {
    ✓ Branch 0 taken 12 times.
    ✓ Branch 1 taken 54 times.
440 12 return checks_result;
441 }
442
443 54 Rectangle rect{width, height};
444
445 54 return gaussian_blur_arbitrary_kernel_size(
446 54 src, src_stride, dst, dst_stride, rect, kernel_width, y_begin, y_end,
447 54 channels, sigma_x, fixed_border_type, workspace);
448 66 }
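
Note on y_begin / y_end: they let callers process the image in independent horizontal stripes, e.g. one stripe per worker, with gaussian_blur_arbitrary_stripe_u8 as the per-stripe entry point. A sketch of the partition arithmetic only; threading and context handling are out of scope, and whether one context may be shared across calls is not shown here:

    #include <cstddef>

    // Illustrative stripe partitioning; `blur_stripe(y_begin, y_end)`
    // stands in for gaussian_blur_arbitrary_stripe_u8 with its other
    // arguments already bound.
    template <typename BlurStripe>
    void run_in_stripes(BlurStripe blur_stripe, std::size_t height,
                        std::size_t workers) {
      for (std::size_t w = 0; w < workers; ++w) {
        const std::size_t y_begin = height * w / workers;
        const std::size_t y_end = height * (w + 1) / workers;
        if (y_begin < y_end) blur_stripe(y_begin, y_end);
      }
    }
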
449
450 } // namespace kleidicv::neon
451