Line | Branch | Exec | Source |
---|---|---|---|
1 | // SPDX-FileCopyrightText: 2025 Arm Limited and/or its affiliates <open-source-office@arm.com> | ||
2 | // | ||
3 | // SPDX-License-Identifier: Apache-2.0 | ||
4 | |||
5 | #include <cassert> | ||
6 | #include <cstddef> | ||
7 | |||
8 | #include "border_generic_neon.h" | ||
9 | #include "kleidicv/config.h" | ||
10 | #include "kleidicv/ctypes.h" | ||
11 | #include "kleidicv/filters/gaussian_blur.h" | ||
12 | #include "kleidicv/filters/sigma.h" | ||
13 | #include "kleidicv/neon.h" | ||
14 | #include "kleidicv/workspace/border_types.h" | ||
15 | #include "kleidicv/workspace/separable.h" | ||
16 | |||
17 | namespace kleidicv::neon { | ||
18 | |||
// Template for arbitrary kernel size Gaussian Blur filters.
template <typename ScalarType, FixedBorderType>
class GaussianBlurArbitrary;

// uint8_t specialization.
//
// The blur is separable: a vertical 1D pass writes into an intermediate
// buffer, then a horizontal 1D pass produces the destination.  The kernel is
// symmetric, so only its half is stored; half_kernel_u16_[half_kernel_size_-1]
// is the centre weight and each symmetric tap pair (centre-i, centre+i) is
// summed once before being multiplied by the shared weight half_kernel_u16_[i].
// Accumulation happens in 16 bits and the result is normalised by 256 with
// rounding (the caller only instantiates this filter when the centre weight is
// < 256, i.e. the weights are 8.8 fixed-point — presumably summing to 256).
template <FixedBorderType BorderT>
class GaussianBlurArbitrary<uint8_t, BorderT> {
 public:
  using SourceType = uint8_t;
  using BufferType = uint8_t;
  using DestinationType = uint8_t;
  using SourceVecTraits = typename neon::VecTraits<SourceType>;
  using SourceVectorType = typename SourceVecTraits::VectorType;
  using BufferVecTraits = typename neon::VecTraits<BufferType>;
  using BufferVectorType = typename BufferVecTraits::VectorType;
  using BorderType = FixedBorderType;

  // half_kernel points to the symmetric half of the kernel with the centre
  // weight last; the pointer is stored, not copied, so it must outlive this
  // object.
  GaussianBlurArbitrary(const uint16_t *half_kernel, ptrdiff_t half_kernel_size,
                        Rectangle &rect, size_t channels)
      : half_kernel_size_(half_kernel_size),
        half_kernel_u16_(half_kernel),
        width_(static_cast<ptrdiff_t>(rect.width())),
        vertical_border_(rect.height()),
        horizontal_border_(rect.width(), channels) {}

  // Not border-affected parts
  // Vertical pass for rows far enough from the top/bottom edge that every tap
  // reads inside the image.  Vector path covers full 16-lane chunks, the
  // scalar path handles the tail.
  void process_arbitrary_vertical(size_t width, Rows<const SourceType> src_rows,
                                  Rows<BufferType> buffer_rows) const {
    LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
                                         SourceVecTraits::num_lanes()};

    loop.unroll_once([&](size_t index) {
      vertical_vector_path(src_rows, buffer_rows, index);
    });

    loop.tail([&](size_t index) {
      vertical_scalar_path(src_rows, buffer_rows, index);
    });
  }

  // Border-affected parts
  // Vertical pass for rows where some taps fall outside the image; row
  // indices are remapped through vertical_border_.
  void process_arbitrary_border_vertical(size_t width,
                                         Rows<const SourceType> src_rows,
                                         ptrdiff_t row_index,
                                         Rows<BufferType> buffer_rows) const {
    LoopUnroll2<TryToAvoidTailLoop> loop{width * src_rows.channels(),
                                         SourceVecTraits::num_lanes()};

    loop.unroll_once([&](size_t column_index) {
      vertical_border_vector_path(src_rows, buffer_rows, row_index,
                                  column_index);
    });

    loop.tail([&](size_t column_index) {
      vertical_border_scalar_path(src_rows, buffer_rows, row_index,
                                  column_index);
    });
  }

  // Horizontal pass over one buffered row: left border columns, then the
  // bulk (no border handling needed), then right border columns.  The two
  // border loops always run whole 8-lane vectors; when the row is too narrow
  // for a bulk section, x is rewound so the right-border loop still processes
  // exact vectors (re-processing some columns is harmless).
  void process_arbitrary_horizontal(
      size_t width, size_t kernel_size, Rows<BufferType> buffer_rows,
      Rows<DestinationType> dst_rows) KLEIDICV_STREAMING {
    size_t x = 0;
    // Assume that there is always a widening when calculating, so the
    // horizontal vector path processes double-width vectors
    const size_t num_lanes = BufferVecTraits::num_lanes() / 2;
    const size_t block_len = num_lanes;
    const size_t margin = kernel_size / 2;
    const size_t border_len = buffer_rows.channels() * margin;
    // border_len rounded up to a whole number of vector blocks.
    const size_t border_process_len =
        ((border_len + block_len - 1) / block_len) * block_len;

    for (; x < border_process_len; x += num_lanes) {
      horizontal_left_border_vector_path(buffer_rows, dst_rows, x);
    }

    // Process data which is not affected by any borders in bulk.
    if (width * buffer_rows.channels() > 2 * border_process_len) {
      size_t total_width_without_borders =
          width * buffer_rows.channels() - 2 * border_process_len;

      LoopUnroll2<TryToAvoidTailLoop> loop{total_width_without_borders,
                                           BufferVecTraits::num_lanes()};

      loop.unroll_twice([&](size_t index) {
        horizontal_vector_path(buffer_rows, dst_rows, x + index);
        horizontal_vector_path(buffer_rows, dst_rows,
                               x + index + BufferVecTraits::num_lanes());
      });

      loop.unroll_once([&](size_t index) {
        horizontal_vector_path(buffer_rows, dst_rows, x + index);
      });

      loop.tail([&](size_t index) {
        horizontal_scalar_path(buffer_rows, dst_rows, x + index);
      });

      x += total_width_without_borders;
    } else {
      // rewind if needed, so we'll have exact vector paths at the right side
      x = width * buffer_rows.channels() - border_process_len;
    }

    for (; x < width * buffer_rows.channels(); x += num_lanes) {
      horizontal_right_border_vector_path(buffer_rows, dst_rows, x);
    }
  }

 private:
  // Convolves 16 columns at element x with the vertical kernel.
  // The centre weight fits in 8 bits (caller guarantees < 256), so the centre
  // row uses a widening u8 multiply; symmetric row pairs are added (u8 -> u16)
  // and multiply-accumulated with the shared 16-bit coefficient.
  void vertical_vector_path(Rows<const SourceType> src_rows,
                            Rows<BufferType> dst_rows, ptrdiff_t x) const {
    uint8x16_t src_mid = vld1q_u8(&src_rows[x]);
    uint8x8_t half_kernel_mid = vdup_n_u8(
        static_cast<uint8_t>(half_kernel_u16_[half_kernel_size_ - 1]));
    uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid);
    uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid);

    ptrdiff_t i = 0;
    // Unroll 4 times
    for (; i < half_kernel_size_ - 4; i += 4) {
      // Tap pair (i): rows at symmetric offsets +/-(half_kernel_size_-1-i).
      uint8x16_t src_i = vld1q_u8(&src_rows.at(i - half_kernel_size_ + 1)[x]);
      uint8x16_t src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 1)[x]);
      uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
      uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
      uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
      uint16x8_t prod0_l = vmulq_u16(vec_l, coeff);
      uint16x8_t prod0_h = vmulq_u16(vec_h, coeff);

      // Tap pair (i + 1).
      src_i = vld1q_u8(&src_rows.at(i + 2 - half_kernel_size_)[x]);
      src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 2)[x]);
      vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
      vec_h = vaddl_high_u8(src_i, src_j);
      coeff = vdupq_n_u16(half_kernel_u16_[i + 1]);
      uint16x8_t prod1_l = vmulq_u16(vec_l, coeff);
      uint16x8_t prod1_h = vmulq_u16(vec_h, coeff);

      // Tap pair (i + 2).
      src_i = vld1q_u8(&src_rows.at(i + 3 - half_kernel_size_)[x]);
      src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 3)[x]);
      vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
      vec_h = vaddl_high_u8(src_i, src_j);
      coeff = vdupq_n_u16(half_kernel_u16_[i + 2]);
      uint16x8_t prod2_l = vmulq_u16(vec_l, coeff);
      uint16x8_t prod2_h = vmulq_u16(vec_h, coeff);

      // Tap pair (i + 3).
      src_i = vld1q_u8(&src_rows.at(i + 4 - half_kernel_size_)[x]);
      src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 4)[x]);
      vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
      vec_h = vaddl_high_u8(src_i, src_j);
      coeff = vdupq_n_u16(half_kernel_u16_[i + 3]);
      uint16x8_t prod3_l = vmulq_u16(vec_l, coeff);
      uint16x8_t prod3_h = vmulq_u16(vec_h, coeff);

      // Pairwise reduction of the four products into the accumulator keeps
      // the dependency chains short.
      uint16x8_t acc0_l = vaddq_u16(prod0_l, prod1_l);
      uint16x8_t acc0_h = vaddq_u16(prod0_h, prod1_h);
      uint16x8_t acc1_l = vaddq_u16(prod2_l, prod3_l);
      uint16x8_t acc1_h = vaddq_u16(prod2_h, prod3_h);

      uint16x8_t acc_new_l = vaddq_u16(acc0_l, acc1_l);
      uint16x8_t acc_new_h = vaddq_u16(acc0_h, acc1_h);

      acc_l = vaddq_u16(acc_l, acc_new_l);
      acc_h = vaddq_u16(acc_h, acc_new_h);
    }

    // Remaining (up to three) tap pairs, one per iteration.
    for (; i < half_kernel_size_ - 1; ++i) {
      uint8x16_t src_i = vld1q_u8(&src_rows.at(i - half_kernel_size_ + 1)[x]);
      uint8x16_t src_j = vld1q_u8(&src_rows.at(half_kernel_size_ - i - 1)[x]);
      uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
      uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
      uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
      acc_l = vmlaq_u16(acc_l, vec_l, coeff);
      acc_h = vmlaq_u16(acc_h, vec_h, coeff);
    }

    // Rounding
    acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128));
    acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128));
    // Keep only the highest 8 bits
    uint8x16_t result =
        vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
    neon::VecTraits<uint8_t>::store(result, &dst_rows[x]);
  }

  // Where y is affected by border
  // Same convolution as vertical_vector_path, but every row access goes
  // through vertical_border_.get_row() to remap out-of-range rows, and there
  // is no 4x unroll.
  void vertical_border_vector_path(Rows<const SourceType> src_rows,
                                   Rows<BufferType> dst_rows, ptrdiff_t y,
                                   ptrdiff_t x) const {
    uint8x16_t src_mid = vld1q_u8(&src_rows.at(y)[x]);
    uint8x8_t half_kernel_mid = vdup_n_u8(
        static_cast<uint8_t>(half_kernel_u16_[half_kernel_size_ - 1]));
    uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid);
    uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid);

    ptrdiff_t i = 0;
    for (; i < half_kernel_size_ - 1; ++i) {
      uint8x16_t src_i = vld1q_u8(&src_rows.at(
          vertical_border_.get_row(y - half_kernel_size_ + 1 + i))[x]);
      uint8x16_t src_j = vld1q_u8(&src_rows.at(
          vertical_border_.get_row(y + half_kernel_size_ - 1 - i))[x]);
      uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
      uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
      uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
      acc_l = vmlaq_u16(acc_l, vec_l, coeff);
      acc_h = vmlaq_u16(acc_h, vec_h, coeff);
    }

    // Rounding
    acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128));
    acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128));
    // Keep only the highest 8 bits
    uint8x16_t result =
        vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
    neon::VecTraits<uint8_t>::store(result, &dst_rows[x]);
  }

  // Scalar fallback of vertical_vector_path for a single element x.
  void vertical_scalar_path(Rows<const SourceType> src_rows,
                            Rows<BufferType> dst_rows, ptrdiff_t x) const {
    uint32_t acc = static_cast<uint32_t>(src_rows[x]) *
                   half_kernel_u16_[half_kernel_size_ - 1];

    for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) {
      acc +=
          (static_cast<uint32_t>(src_rows.at(i + 1 - half_kernel_size_)[x]) +
           static_cast<uint32_t>(src_rows.at(half_kernel_size_ - i - 1)[x])) *
          half_kernel_u16_[i];
    }

    // Divide by 256 with round-to-nearest.
    dst_rows[x] = static_cast<BufferType>(rounding_shift_right(acc, 8));
  }

  // Scalar fallback of vertical_border_vector_path for a single element x.
  void vertical_border_scalar_path(Rows<const SourceType> src_rows,
                                   Rows<BufferType> dst_rows, ptrdiff_t y,
                                   ptrdiff_t x) const {
    uint32_t acc = static_cast<uint32_t>(src_rows.at(y)[x]) *
                   half_kernel_u16_[half_kernel_size_ - 1];

    for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) {
      acc += (static_cast<uint32_t>(src_rows.at(
                  vertical_border_.get_row(y + i + 1 - half_kernel_size_))[x]) +
              static_cast<uint32_t>(src_rows.at(vertical_border_.get_row(
                  y + half_kernel_size_ - i - 1))[x])) *
             half_kernel_u16_[i];
    }

    dst_rows[x] = static_cast<BufferType>(rounding_shift_right(acc, 8));
  }

  // Convolves 16 elements at x with the horizontal kernel; taps step by the
  // channel count so each channel is filtered independently.
  void horizontal_vector_path(Rows<BufferType> src_rows,
                              Rows<DestinationType> dst_rows,
                              ptrdiff_t x) const {
    // very similar to the vertical path, the difference is only the loading
    // pattern
    uint8x16_t src_mid = vld1q_u8(&src_rows[x]);
    uint8x8_t half_kernel_mid = vdup_n_u8(
        static_cast<uint8_t>(half_kernel_u16_[half_kernel_size_ - 1]));
    uint16x8_t acc_l = vmull_u8(vget_low_u8(src_mid), half_kernel_mid);
    uint16x8_t acc_h = vmull_u8(vget_high_u8(src_mid), half_kernel_mid);

    ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels()),
              left = x - ch * (half_kernel_size_ - 1),
              right = x + ch * (half_kernel_size_ - 1);
    for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; ++i) {
      uint8x16_t src_i = vld1q_u8(&src_rows[left + i * ch]);
      uint8x16_t src_j = vld1q_u8(&src_rows[right - i * ch]);
      uint16x8_t vec_l = vaddl_u8(vget_low_u8(src_i), vget_low_u8(src_j));
      uint16x8_t vec_h = vaddl_high_u8(src_i, src_j);
      uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
      acc_l = vmlaq_u16(acc_l, vec_l, coeff);
      acc_h = vmlaq_u16(acc_h, vec_h, coeff);
    }

    // Rounding
    acc_l = vqaddq_u16(acc_l, vdupq_n_u16(128));
    acc_h = vqaddq_u16(acc_h, vdupq_n_u16(128));
    // Keep only the highest 8 bits
    uint8x16_t result =
        vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h));
    neon::VecTraits<uint8_t>::store(result, &dst_rows[x]);
  }

  // Horizontal convolution of 8 elements at x near the left edge: left-side
  // taps that would read before the row are loaded through horizontal_border_.
  void horizontal_left_border_vector_path(Rows<BufferType> src_rows,
                                          Rows<DestinationType> dst_rows,
                                          ptrdiff_t x) const {
    // similar to the simple horizontal path, except the loading pattern:
    // - this is loading indirect columns, and half of that data
    uint16x8_t src_mid = vmovl_u8(vld1_u8(&src_rows[x]));
    uint16x8_t acc =
        vmulq_n_u16(src_mid, half_kernel_u16_[half_kernel_size_ - 1]);

    ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels());
    ptrdiff_t i = 0, left = x - ch * (half_kernel_size_ - 1),
              right = x + ch * (half_kernel_size_ - 1);
    // Taps whose left index is out of bounds: remap through the border.
    for (; i * ch + left < 0; ++i) {
      uint16x8_t src_i = horizontal_border_.load_left(src_rows, left + i * ch);
      uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch]));
      uint16x8_t vec = vaddq_u16(src_i, src_j);
      uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
      acc = vmlaq_u16(acc, vec, coeff);
    }

    // Remaining taps are fully in bounds.
    for (; i < half_kernel_size_ - 1; ++i) {
      uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch]));
      uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch]));
      uint16x8_t vec = vaddq_u16(src_i, src_j);
      uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
      acc = vmlaq_u16(acc, vec, coeff);
    }

    // Store only the highest 8 bits
    uint8x8_t result = vrshrn_n_u16(acc, 8);
    vst1_u8(&dst_rows[x], result);
  }

  // Mirror image of horizontal_left_border_vector_path for the right edge:
  // right-side taps that would read past the row go through the border.
  void horizontal_right_border_vector_path(Rows<BufferType> src_rows,
                                           Rows<DestinationType> dst_rows,
                                           ptrdiff_t x) const {
    // similar to the simple horizontal path, except the loading pattern:
    // - this is loading indirect columns, and half of that data
    uint16x8_t src_mid = vmovl_u8(vld1_u8(&src_rows[x]));
    uint16x8_t acc =
        vmulq_n_u16(src_mid, half_kernel_u16_[half_kernel_size_ - 1]);

    ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels());
    ptrdiff_t i = 0, left = x - ch * (half_kernel_size_ - 1),
              right = x + ch * (half_kernel_size_ - 1);
    // Taps whose 8-lane right load would run past the row end.
    for (; right - i * ch > width_ * ch - 8; ++i) {
      uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch]));
      uint16x8_t src_j =
          horizontal_border_.load_right(src_rows, right - i * ch);
      uint16x8_t vec = vaddq_u16(src_i, src_j);
      uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
      acc = vmlaq_u16(acc, vec, coeff);
    }

    // Remaining taps are fully in bounds.
    for (; i < half_kernel_size_ - 1; ++i) {
      uint16x8_t src_i = vmovl_u8(vld1_u8(&src_rows[left + i * ch]));
      uint16x8_t src_j = vmovl_u8(vld1_u8(&src_rows[right - i * ch]));
      uint16x8_t vec = vaddq_u16(src_i, src_j);
      uint16x8_t coeff = vdupq_n_u16(half_kernel_u16_[i]);
      acc = vmlaq_u16(acc, vec, coeff);
    }

    // Store only the highest 8 bits
    uint8x8_t result = vrshrn_n_u16(acc, 8);
    vst1_u8(&dst_rows[x], result);
  }

  // Scalar horizontal convolution of a single element x, with column
  // remapping through horizontal_border_ (handles both edges).
  void horizontal_scalar_path(Rows<BufferType> src_rows,
                              Rows<DestinationType> dst_rows,
                              ptrdiff_t x) const {
    uint32_t acc = static_cast<uint32_t>(src_rows[x]) *
                   half_kernel_u16_[half_kernel_size_ - 1];
    ptrdiff_t ch = static_cast<ptrdiff_t>(src_rows.channels());
    // Split x into (column, channel) so border remapping works per column.
    ptrdiff_t channel_offset = x % ch;
    ptrdiff_t left_col = x / ch - (half_kernel_size_ - 1),
              right_col = x / ch + (half_kernel_size_ - 1);

    for (ptrdiff_t i = 0; i < half_kernel_size_ - 1; i++) {
      acc += (static_cast<uint32_t>(
                  src_rows[horizontal_border_.get_column(left_col + i) * ch +
                           channel_offset]) +
              static_cast<uint32_t>(
                  src_rows[horizontal_border_.get_column(right_col - i) * ch +
                           channel_offset])) *
             half_kernel_u16_[i];
    }

    dst_rows[x] = static_cast<DestinationType>(rounding_shift_right(acc, 8));
  }

  const ptrdiff_t half_kernel_size_;   // taps from centre to edge, inclusive
  const uint16_t *half_kernel_u16_;    // not owned; centre weight is last
  const ptrdiff_t width_;              // row width in pixels (not elements)
  KLEIDICV_TARGET_NAMESPACE::GenericBorderVertical<BorderT> vertical_border_;
  KLEIDICV_TARGET_NAMESPACE::GenericBorderHorizontal<BorderT>
      horizontal_border_;
};  // end of class GaussianBlurArbitrary<uint8_t>
396 | |||
397 | template <typename ScalarType> | ||
398 | 54 | static kleidicv_error_t gaussian_blur_arbitrary_kernel_size( | |
399 | const ScalarType *src, size_t src_stride, ScalarType *dst, | ||
400 | size_t dst_stride, Rectangle &rect, size_t kernel_size, size_t y_begin, | ||
401 | size_t y_end, size_t channels, float sigma, FixedBorderType border_type, | ||
402 | SeparableFilterWorkspace *workspace) { | ||
403 | 54 | Rows<const ScalarType> src_rows{src, src_stride, channels}; | |
404 | 54 | Rows<ScalarType> dst_rows{dst, dst_stride, channels}; | |
405 | |||
406 | 108 | const ptrdiff_t kHalfKernelSize = | |
407 | 54 | static_cast<ptrdiff_t>(get_half_kernel_size(kernel_size)); | |
408 | 54 | uint16_t half_kernel[128]; | |
409 | 54 | generate_gaussian_half_kernel(half_kernel, kHalfKernelSize, sigma); | |
410 | // If sigma is so small that the middle point gets all the weights, it's | ||
411 | // just a copy | ||
412 |
2/2✓ Branch 0 taken 39 times.
✓ Branch 1 taken 15 times.
|
54 | if (half_kernel[kHalfKernelSize - 1] < 256) { |
413 | // Only replicated border is implemented so far. | ||
414 | 78 | GaussianBlurArbitrary<ScalarType, FixedBorderType::REPLICATE> filter{ | |
415 | 39 | half_kernel, kHalfKernelSize, rect, src_rows.channels()}; | |
416 | 78 | workspace->process_arbitrary(rect, kernel_size, y_begin, y_end, src_rows, | |
417 | 39 | dst_rows, channels, border_type, filter); | |
418 | 39 | } else { | |
419 |
2/2✓ Branch 0 taken 156 times.
✓ Branch 1 taken 15 times.
|
171 | for (size_t row = y_begin; row < y_end; ++row) { |
420 | 312 | std::memcpy(static_cast<void *>(&dst_rows.at(row)[0]), | |
421 | 156 | static_cast<const void *>(&src_rows.at(row)[0]), | |
422 | 156 | rect.width() * sizeof(ScalarType) * dst_rows.channels()); | |
423 | 156 | } | |
424 | } | ||
425 | 54 | return KLEIDICV_OK; | |
426 | 54 | } | |
427 | |||
428 | KLEIDICV_TARGET_FN_ATTRS | ||
429 | 66 | kleidicv_error_t gaussian_blur_arbitrary_stripe_u8( | |
430 | const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, | ||
431 | size_t width, size_t height, size_t y_begin, size_t y_end, size_t channels, | ||
432 | size_t kernel_width, size_t /*kernel_height*/, float sigma_x, | ||
433 | float /*sigma_y*/, FixedBorderType fixed_border_type, | ||
434 | kleidicv_filter_context_t *context) { | ||
435 | 66 | auto *workspace = reinterpret_cast<SeparableFilterWorkspace *>(context); | |
436 | 132 | kleidicv_error_t checks_result = gaussian_blur_checks( | |
437 | 66 | src, src_stride, dst, dst_stride, width, height, channels, workspace); | |
438 | |||
439 |
2/2✓ Branch 0 taken 12 times.
✓ Branch 1 taken 54 times.
|
66 | if (checks_result != KLEIDICV_OK) { |
440 | 12 | return checks_result; | |
441 | } | ||
442 | |||
443 | 54 | Rectangle rect{width, height}; | |
444 | |||
445 | 54 | return gaussian_blur_arbitrary_kernel_size( | |
446 | 54 | src, src_stride, dst, dst_stride, rect, kernel_width, y_begin, y_end, | |
447 | 54 | channels, sigma_x, fixed_border_type, workspace); | |
448 | 66 | } | |
449 | |||
450 | } // namespace kleidicv::neon | ||
451 |