KleidiCV Coverage Report


Directory: ./
File: kleidicv/src/filters/gaussian_blur_arbitrary_neon.cpp
Date: 2026-01-20 20:58:59
Exec Total Coverage
Lines: 287 287 100.0%
Functions: 21 21 100.0%
Branches: 42 43 97.7%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
2 //
3 // SPDX-License-Identifier: Apache-2.0
4
5 #include <cassert>
6 #include <cstddef>
7
8 #include "border_generic_neon.h"
9 #include "kleidicv/config.h"
10 #include "kleidicv/ctypes.h"
11 #include "kleidicv/filters/gaussian_blur.h"
12 #include "kleidicv/filters/sigma.h"
13 #include "kleidicv/neon.h"
14 #include "kleidicv/workspace/border_types.h"
15 #include "kleidicv/workspace/separable.h"
16
17 namespace kleidicv::neon {
18
19 // Template for arbitrary kernel size Gaussian Blur filters.
20 template <typename ScalarType, FixedBorderType>
21 class GaussianBlurArbitrary;
22
23 template <FixedBorderType BorderT>
24 class GaussianBlurArbitrary<uint8_t, BorderT> {
25 public:
26 using SourceType = uint8_t;
27 using BufferType = uint8_t;
28 using DestinationType = uint8_t;
29 using SourceVecTraits = typename neon::VecTraits<SourceType>;
30 using SourceVectorType = typename SourceVecTraits::VectorType;
31 using BufferVecTraits = typename neon::VecTraits<BufferType>;
32 using BufferVectorType = typename BufferVecTraits::VectorType;
33 using BorderType = FixedBorderType;
34
35 64 GaussianBlurArbitrary(const uint8_t *half_kernel, ptrdiff_t half_kernel_size,
36 Rectangle &rect, size_t channels)
37 64 : half_kernel_size_(half_kernel_size),
38 64 half_kernel_(half_kernel),
39 64 width_(static_cast<ptrdiff_t>(rect.width())),
40 64 vertical_border_(rect.height()),
41 64 horizontal_border_(rect.width(), channels) {}
42
43 // Not border-affected parts
44 208 void process_arbitrary_vertical(size_t width, Rows<const SourceType> src_rows,
45 Rows<BufferType> buffer_rows) const {
46 416 LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
47 208 SourceVecTraits::num_lanes()};
48
49 824 loop.unroll_once([&](size_t index) {
50 616 vertical_vector_path(src_rows, buffer_rows, index);
51 616 });
52
53 260 loop.tail([&](size_t index) {
54 52 vertical_scalar_path(src_rows, buffer_rows, index);
55 52 });
56 208 }
57
58 // Border-affected parts
59 480 void process_arbitrary_border_vertical(size_t width,
60 Rows<const SourceType> src_rows,
61 ptrdiff_t row_index,
62 Rows<BufferType> buffer_rows) const {
63 960 LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
64 480 SourceVecTraits::num_lanes()};
65
66 2680 loop.unroll_once([&](size_t column_index) {
67 4400 vertical_border_vector_path(src_rows, buffer_rows, row_index,
68 2200 column_index);
69 2200 });
70
71 1000 loop.tail([&](size_t column_index) {
72 1040 vertical_border_scalar_path(src_rows, buffer_rows, row_index,
73 520 column_index);
74 520 });
75 480 }
76
77 688 void process_arbitrary_horizontal(
78 size_t width, size_t kernel_size, Rows<BufferType> buffer_rows,
79 Rows<DestinationType> dst_rows) KLEIDICV_STREAMING {
80 688 size_t x = 0;
81 // Assume that there is always a widening when calculating, so the
82 // horizontal vector path processes double-width vectors
83 688 const size_t num_lanes = BufferVecTraits::num_lanes() / 2;
84 688 const size_t block_len = num_lanes;
85 688 const size_t margin = kernel_size / 2;
86 688 const size_t border_len = buffer_rows.channels() * margin;
87 1376 const size_t border_process_len =
88 688 ((border_len + block_len - 1) / block_len) * block_len;
89
90
2/2
✓ Branch 0 taken 848 times.
✓ Branch 1 taken 688 times.
1536 for (; x < border_process_len; x += num_lanes) {
91 848 horizontal_left_border_vector_path(buffer_rows, dst_rows, x);
92 848 }
93
94 // Process data which is not affected by any borders in bulk.
95
2/2
✓ Branch 0 taken 604 times.
✓ Branch 1 taken 84 times.
688 if (width * buffer_rows.channels() > 2 * border_process_len) {
96 1208 size_t total_width_without_borders =
97 604 width * buffer_rows.channels() - 2 * border_process_len;
98
99 1208 LoopUnroll2<TryToAvoidTailLoop> loop{total_width_without_borders,
100 604 BufferVecTraits::num_lanes()};
101
102 1088 loop.unroll_twice([&](size_t index) {
103 484 horizontal_vector_path(buffer_rows, dst_rows, x + index);
104 968 horizontal_vector_path(buffer_rows, dst_rows,
105 484 x + index + BufferVecTraits::num_lanes());
106 484 });
107
108 1564 loop.unroll_once([&](size_t index) {
109 960 horizontal_vector_path(buffer_rows, dst_rows, x + index);
110 960 });
111
112 928 loop.tail([&](size_t index) {
113 324 horizontal_scalar_path(buffer_rows, dst_rows, x + index);
114 324 });
115
116 604 x += total_width_without_borders;
117 604 } else {
118 // rewind if needed, so we'll have exact vector paths at the right side
119 84 x = width * buffer_rows.channels() - border_process_len;
120 }
121
122
2/2
✓ Branch 0 taken 848 times.
✓ Branch 1 taken 688 times.
1536 for (; x < width * buffer_rows.channels(); x += num_lanes) {
123 848 horizontal_right_border_vector_path(buffer_rows, dst_rows, x);
124 848 }
125 688 }
126
127 private:
128 616 void vertical_vector_path(Rows<const SourceType> src_rows,
129 Rows<BufferType> dst_rows, ptrdiff_t x) const {
130 616 uint8x16_t src_mid = vld1q_u8(&src_rows[x]);
131 616 uint8x8_t half_kernel_mid = vdup_n_u8(half_kernel_[half_kernel_size_ - 1]);
132 616 uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid);
133 616 uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid);
134
135 616 ptrdiff_t i = 0;
136 // Unroll 4 times
137
2/2
✓ Branch 0 taken 616 times.
✓ Branch 1 taken 616 times.
1232 for (; i < half_kernel_size_ - 4; i += 4) {
138 616 uint8x16_t src_i = vld1q_u8(&src_rows.at(i - half_kernel_size_ + 1)[x]);
139 616 uint8x16_t src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 1)[x]);
140 616 uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
141 616 uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
142 616 uint16x8_t coeff = vdupq_n_u16(half_kernel_[i]);
143 616 uint16x8_t prod0_l = vmulq_u16(vec_l, coeff);
144 616 uint16x8_t prod0_h = vmulq_u16(vec_h, coeff);
145
146 616 src_i = vld1q_u8(&src_rows.at(i + 2 - half_kernel_size_)[x]);
147 616 src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 2)[x]);
148 616 vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
149 616 vec_h = vaddl_high_u8(src_i, src_j);
150 616 coeff = vdupq_n_u16(half_kernel_[i + 1]);
151 616 uint16x8_t prod1_l = vmulq_u16(vec_l, coeff);
152 616 uint16x8_t prod1_h = vmulq_u16(vec_h, coeff);
153
154 616 src_i = vld1q_u8(&src_rows.at(i + 3 - half_kernel_size_)[x]);
155 616 src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 3)[x]);
156 616 vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
157 616 vec_h = vaddl_high_u8(src_i, src_j);
158 616 coeff = vdupq_n_u16(half_kernel_[i + 2]);
159 616 uint16x8_t prod2_l = vmulq_u16(vec_l, coeff);
160 616 uint16x8_t prod2_h = vmulq_u16(vec_h, coeff);
161
162 616 src_i = vld1q_u8(&src_rows.at(i + 4 - half_kernel_size_)[x]);
163 616 src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 4)[x]);
164 616 vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
165 616 vec_h = vaddl_high_u8(src_i, src_j);
166 616 coeff = vdupq_n_u16(half_kernel_[i + 3]);
167 616 uint16x8_t prod3_l = vmulq_u16(vec_l, coeff);
168 616 uint16x8_t prod3_h = vmulq_u16(vec_h, coeff);
169
170 616 uint16x8_t acc0_l = vaddq_u16(prod0_l, prod1_l);
171 616 uint16x8_t acc0_h = vaddq_u16(prod0_h, prod1_h);
172 616 uint16x8_t acc1_l = vaddq_u16(prod2_l, prod3_l);
173 616 uint16x8_t acc1_h = vaddq_u16(prod2_h, prod3_h);
174
175 616 uint16x8_t acc_new_l = vaddq_u16(acc0_l, acc1_l);
176 616 uint16x8_t acc_new_h = vaddq_u16(acc0_h, acc1_h);
177
178 616 acc_l = vaddq_u16(acc_l, acc_new_l);
179 616 acc_h = vaddq_u16(acc_h, acc_new_h);
180 616 }
181
182
2/2
✓ Branch 0 taken 616 times.
✓ Branch 1 taken 616 times.
1232 for (; i < half_kernel_size_ - 1; ++i) {
183 616 uint8x16_t src_i = vld1q_u8(&src_rows.at(i - half_kernel_size_ + 1)[x]);
184 616 uint8x16_t src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 1)[x]);
185 616 uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
186 616 uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
187 616 uint16x8_t coeff = vdupq_n_u16(half_kernel_[i]);
188 616 acc_l = vmlaq_u16(acc_l, vec_l, coeff);
189 616 acc_h = vmlaq_u16(acc_h, vec_h, coeff);
190 616 }
191
192 // Rounding
193 616 acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128));
194 616 acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128));
195 // Keep only the highest 8 bits
196 1232 uint8x16_t result =
197 616 vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
198 616 neon::VecTraits<uint8_t>::store(result, &dst_rows[x]);
199 616 }
200
201 // Where y is affected by border
202 2200 void vertical_border_vector_path(Rows<const SourceType> src_rows,
203 Rows<BufferType> dst_rows, ptrdiff_t y,
204 ptrdiff_t x) const {
205 2200 uint8x16_t src_mid = vld1q_u8(&src_rows.at(y)[x]);
206 2200 uint8x8_t half_kernel_mid = vdup_n_u8(half_kernel_[half_kernel_size_ - 1]);
207 2200 uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid);
208 2200 uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid);
209
210 2200 ptrdiff_t i = 0;
211
2/2
✓ Branch 0 taken 11000 times.
✓ Branch 1 taken 2200 times.
13200 for (; i < half_kernel_size_ - 1; ++i) {
212 33000 uint8x16_t src_i = vld1q_u8(&src_rows.at(
213 22000 vertical_border_.get_row(y - half_kernel_size_ + 1 + i))[x]);
214 33000 uint8x16_t src_j = vld1q_u8(&src_rows.at(
215 22000 vertical_border_.get_row(y + half_kernel_size_ - 1 - i))[x]);
216 11000 uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
217 11000 uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
218 11000 uint16x8_t coeff = vdupq_n_u16(half_kernel_[i]);
219 11000 acc_l = vmlaq_u16(acc_l, vec_l, coeff);
220 11000 acc_h = vmlaq_u16(acc_h, vec_h, coeff);
221 11000 }
222
223 // Rounding
224 2200 acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128));
225 2200 acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128));
226 // Keep only the highest 8 bits
227 4400 uint8x16_t result =
228 2200 vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
229 2200 neon::VecTraits<uint8_t>::store(result, &dst_rows[x]);
230 2200 }
231
232 52 void vertical_scalar_path(Rows<const SourceType> src_rows,
233 Rows<BufferType> dst_rows, ptrdiff_t x) const {
234 104 uint32_t acc = static_cast<uint32_t>(src_rows[x]) *
235 52 half_kernel_[half_kernel_size_ - 1];
236
237
2/2
✓ Branch 0 taken 52 times.
✓ Branch 1 taken 260 times.
312 for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) {
238 780 acc += (src_rows.at(i + 1 - half_kernel_size_)[x] +
239 520 src_rows.at(half_kernel_size_ - i - 1)[x]) *
240 260 half_kernel_[i];
241 260 }
242
243 52 dst_rows[x] = static_cast<BufferType>(rounding_shift_right(acc, 8));
244 52 }
245
246 520 void vertical_border_scalar_path(Rows<const SourceType> src_rows,
247 Rows<BufferType> dst_rows, ptrdiff_t y,
248 ptrdiff_t x) const {
249 520 uint32_t acc = src_rows.at(y)[x] * half_kernel_[half_kernel_size_ - 1];
250
251
2/2
✓ Branch 0 taken 520 times.
✓ Branch 1 taken 2600 times.
3120 for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) {
252 10400 acc += (src_rows.at(
253 7800 vertical_border_.get_row(y + i + 1 - half_kernel_size_))[x] +
254 7800 src_rows.at(
255 7800 vertical_border_.get_row(y + half_kernel_size_ - i - 1))[x]) *
256 2600 half_kernel_[i];
257 2600 }
258
259 520 dst_rows[x] = static_cast<BufferType>(rounding_shift_right(acc, 8));
260 520 }
261
262 1928 void horizontal_vector_path(Rows<BufferType> src_rows,
263 Rows<DestinationType> dst_rows,
264 ptrdiff_t x) const {
265 // very similar to the vertical path, the difference is only the loading
266 // pattern
267 1928 uint8x16_t src_mid = vld1q_u8(&src_rows[x]);
268 1928 uint8x8_t half_kernel_mid = vdup_n_u8(half_kernel_[half_kernel_size_ - 1]);
269 1928 uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid);
270 1928 uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid);
271
272 3856 ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels()),
273 1928 left = x - ch * (half_kernel_size_ - 1),
274 1928 right = x + ch * (half_kernel_size_ - 1);
275
2/2
✓ Branch 0 taken 1928 times.
✓ Branch 1 taken 9640 times.
11568 for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; ++i) {
276 9640 uint8x16_t src_i = vld1q_u8(&src_rows[left + i * ch]);
277 9640 uint8x16_t src_j = vld1q_u8(&src_rows[right - i * ch]);
278 9640 uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
279 9640 uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
280 9640 uint16x8_t coeff = vdupq_n_u16(half_kernel_[i]);
281 9640 acc_l = vmlaq_u16(acc_l, vec_l, coeff);
282 9640 acc_h = vmlaq_u16(acc_h, vec_h, coeff);
283 9640 }
284
285 // Rounding
286 1928 acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128));
287 1928 acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128));
288 // Keep only the highest 8 bits
289 3856 uint8x16_t result =
290 1928 vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
291 1928 neon::VecTraits<uint8_t>::store(result, &dst_rows[x]);
292 1928 }
293
294 848 void horizontal_left_border_vector_path(Rows<BufferType> src_rows,
295 Rows<DestinationType> dst_rows,
296 ptrdiff_t x) const {
297 // similar to the simple horizontal path, except the loading pattern:
298 // - this is loading indirect columns, and half of that data
299 848 uint16x8_t src_mid = vmovl_u8(vld1_u8(&src_rows[x]));
300 848 uint16x8_t acc = vmulq_n_u16(src_mid, half_kernel_[half_kernel_size_ - 1]);
301
302 848 ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels());
303 848 ptrdiff_t i = 0, left = x - ch * (half_kernel_size_ - 1),
304 848 right = x + ch * (half_kernel_size_ - 1);
305
2/2
✓ Branch 0 taken 3760 times.
✓ Branch 1 taken 848 times.
4608 for (; i * ch + left < 0; ++i) {
306 3760 uint16x8_t src_i = horizontal_border_.load_left(src_rows, left + i * ch);
307 3760 uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch]));
308 3760 uint16x8_t vec = vaddq_u16(src_i, src_j);
309 3760 uint16x8_t coeff = vdupq_n_u16(half_kernel_[i]);
310 3760 acc = vmlaq_u16(acc, vec, coeff);
311 3760 }
312
313
2/2
✓ Branch 0 taken 480 times.
✓ Branch 1 taken 848 times.
1328 for (; i < half_kernel_size_ - 1; ++i) {
314 480 uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch]));
315 480 uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch]));
316 480 uint16x8_t vec = vaddq_u16(src_i, src_j);
317 480 uint16x8_t coeff = vdupq_n_u16(half_kernel_[i]);
318 480 acc = vmlaq_u16(acc, vec, coeff);
319 480 }
320
321 // Store only the highest 8 bits
322 848 uint8x8_t result = vrshrn_n_u16(acc, 8);
323 848 vst1_u8(&dst_rows[x], result);
324 848 }
325
326 848 void horizontal_right_border_vector_path(Rows<BufferType> src_rows,
327 Rows<DestinationType> dst_rows,
328 ptrdiff_t x) const {
329 // similar to the simple horizontal path, except the loading pattern:
330 // - this is loading indirect columns, and half of that data
331 848 uint16x8_t src_mid = vmovl_u8(vld1_u8(&src_rows[x]));
332 848 uint16x8_t acc = vmulq_n_u16(src_mid, half_kernel_[half_kernel_size_ - 1]);
333
334 848 ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels());
335 848 ptrdiff_t i = 0, left = x - ch * (half_kernel_size_ - 1),
336 848 right = x + ch * (half_kernel_size_ - 1);
337
2/2
✓ Branch 0 taken 3760 times.
✓ Branch 1 taken 848 times.
4608 for (; right - i * ch > width_ * ch - 8; ++i) {
338 3760 uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch]));
339 7520 uint16x8_t src_j =
340 3760 horizontal_border_.load_right(src_rows, right - i * ch);
341 3760 uint16x8_t vec = vaddq_u16(src_i, src_j);
342 3760 uint16x8_t coeff = vdupq_n_u16(half_kernel_[i]);
343 3760 acc = vmlaq_u16(acc, vec, coeff);
344 3760 }
345
346
2/2
✓ Branch 0 taken 480 times.
✓ Branch 1 taken 848 times.
1328 for (; i < half_kernel_size_ - 1; ++i) {
347 480 uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch]));
348 480 uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch]));
349 480 uint16x8_t vec = vaddq_u16(src_i, src_j);
350 480 uint16x8_t coeff = vdupq_n_u16(half_kernel_[i]);
351 480 acc = vmlaq_u16(acc, vec, coeff);
352 480 }
353
354 // Store only the highest 8 bits
355 848 uint8x8_t result = vrshrn_n_u16(acc, 8);
356 848 vst1_u8(&dst_rows[x], result);
357 848 }
358
359 324 void horizontal_scalar_path(Rows<BufferType> src_rows,
360 Rows<DestinationType> dst_rows,
361 ptrdiff_t x) const {
362 648 uint32_t acc = static_cast<uint32_t>(src_rows[x]) *
363 324 half_kernel_[half_kernel_size_ - 1];
364 324 ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels());
365 324 ptrdiff_t channel_offset = x % ch;
366 324 ptrdiff_t left_col = x / ch - (half_kernel_size_ - 1),
367 324 right_col = x / ch + (half_kernel_size_ - 1);
368
369
2/2
✓ Branch 0 taken 324 times.
✓ Branch 1 taken 1620 times.
1944 for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) {
370 1620 acc += (static_cast<uint32_t>(
371 3240 src_rows[horizontal_border_.get_column(left_col + i) * ch +
372 3240 channel_offset]) +
373 static_cast<uint32_t>(
374 3240 src_rows[horizontal_border_.get_column(right_col - i) * ch +
375 3240 channel_offset])) *
376 1620 half_kernel_[i];
377 1620 }
378
379 324 dst_rows[x] = static_cast<DestinationType>(rounding_shift_right(acc, 8));
380 324 }
381
382 const ptrdiff_t half_kernel_size_;
383 const uint8_t *half_kernel_;
384 const ptrdiff_t width_;
385 KLEIDICV_TARGET_NAMESPACE::GenericBorderVertical<BorderT> vertical_border_;
386 KLEIDICV_TARGET_NAMESPACE::GenericBorderHorizontal<BorderT>
387 horizontal_border_;
388 }; // end of class GaussianBlurArbitrary<uint8_t>
389
390 template <typename ScalarType>
391 84 static kleidicv_error_t gaussian_blur_arbitrary_kernel_size(
392 const ScalarType *src, size_t src_stride, ScalarType *dst,
393 size_t dst_stride, Rectangle &rect, size_t kernel_size, size_t y_begin,
394 size_t y_end, size_t channels, float sigma, FixedBorderType border_type,
395 SeparableFilterWorkspace *workspace) {
396 84 Rows<const ScalarType> src_rows{src, src_stride, channels};
397 84 Rows<ScalarType> dst_rows{dst, dst_stride, channels};
398
399 168 const ptrdiff_t kHalfKernelSize =
400 84 static_cast<ptrdiff_t>(get_half_kernel_size(kernel_size));
401 84 uint8_t half_kernel[128];
402 168 bool success =
403 84 generate_gaussian_half_kernel(half_kernel, kHalfKernelSize, sigma);
404
2/2
✓ Branch 0 taken 64 times.
✓ Branch 1 taken 20 times.
84 if (success) {
405 // Only replicated border is implemented so far.
406 128 GaussianBlurArbitrary<ScalarType, FixedBorderType::REPLICATE> filter{
407 64 half_kernel, kHalfKernelSize, rect, src_rows.channels()};
408 128 workspace->process_arbitrary(rect, kernel_size, y_begin, y_end, src_rows,
409 64 dst_rows, channels, border_type, filter);
410 64 } else {
411 // Sigma is too small that the middle point would get all the weight
412 // => it's just a copy.
413
2/2
✓ Branch 0 taken 208 times.
✓ Branch 1 taken 20 times.
228 for (size_t row = y_begin; row < y_end; ++row) {
414 416 std::memcpy(static_cast<void *>(&dst_rows.at(row)[0]),
415 208 static_cast<const void *>(&src_rows.at(row)[0]),
416 208 rect.width() * sizeof(ScalarType) * dst_rows.channels());
417 208 }
418 }
419 84 return KLEIDICV_OK;
420 84 }
421
422 KLEIDICV_TARGET_FN_ATTRS
423 92 kleidicv_error_t gaussian_blur_arbitrary_stripe_u8(
424 const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride,
425 size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels,
426 size_t kernel_width, size_t /*kernel_height*/, float sigma_x,
427 float /*sigma_y*/, FixedBorderType fixed_border_type) {
428
6/6
✓ Branch 0 taken 4 times.
✓ Branch 1 taken 88 times.
✓ Branch 2 taken 4 times.
✓ Branch 3 taken 88 times.
✓ Branch 4 taken 4 times.
✓ Branch 5 taken 88 times.
276 if (auto result =
429 92 gaussian_blur_checks(src, src_stride, dst, dst_stride, width, height);
430
2/3
✗ Branch 0 not taken.
✓ Branch 1 taken 4 times.
✓ Branch 2 taken 88 times.
96 result != KLEIDICV_OK) {
431 4 return result;
432 }
433
434 88 Rectangle rect{width, height};
435 // As we cannot predict the intermediate size based on the parameters given,
436 // just use the largest possible immediate size out of all available
437 // operations.
438 88 auto workspace =
439 88 SeparableFilterWorkspace::create(rect, channels, sizeof(uint32_t));
440
2/2
✓ Branch 0 taken 84 times.
✓ Branch 1 taken 4 times.
88 if (!workspace) {
441 4 return KLEIDICV_ERROR_ALLOCATION;
442 }
443
444 84 return gaussian_blur_arbitrary_kernel_size(
445 84 src, src_stride, dst, dst_stride, rect, kernel_width, y_begin, y_end,
446 84 channels, sigma_x, fixed_border_type, workspace.get());
447 92 }
448
449 } // namespace kleidicv::neon
450