Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
volk_8u_x4_conv_k7_r2_8u.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */


#ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H

typedef union {
    unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/];
    unsigned int w[64 /*NUMSTATES*/ / 32];
    unsigned short s[64 /*NUMSTATES*/ / 16];
    unsigned char c[64 /*NUMSTATES*/ / 8];
#ifdef _MSC_VER
} decision_t;
#else
} decision_t __attribute__((aligned(16)));
#endif


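// Subtracts the smallest path metric from all 64 state metrics so the 8-bit
// metrics stay away from saturation as the trellis is processed.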
static inline void renormalize(unsigned char* X)
{
    int NUMSTATES = 64;
    int i;

    unsigned char min = X[0];
    for (i = 0; i < NUMSTATES; i++) {
        if (min > X[i]) {
            min = X[i];
        }
    }
    for (i = 0; i < NUMSTATES; i++) {
        X[i] -= min;
    }
}


// helper BFLY for GENERIC version
static inline void BFLY(int i,
                        int s,
                        unsigned char* syms,
                        unsigned char* Y,
                        unsigned char* X,
                        decision_t* d,
                        unsigned char* Branchtab)
{
    int j;
    unsigned int decision0, decision1;
    unsigned char metric, m0, m1, m2, m3;
    unsigned short metricsum;

    int NUMSTATES = 64;
    int RATE = 2;
    int METRICSHIFT = 1;
    int PRECISIONSHIFT = 2;

    metricsum = 1;
    for (j = 0; j < RATE; j++) {
        metricsum += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]);
    }
    metric = (metricsum >> METRICSHIFT) >> PRECISIONSHIFT;

    unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);

    m0 = X[i] + metric;
    m1 = X[i + NUMSTATES / 2] + (max - metric);
    m2 = X[i] + (max - metric);
    m3 = X[i + NUMSTATES / 2] + metric;

    decision0 = (signed int)(m0 - m1) >= 0;
    decision1 = (signed int)(m2 - m3) >= 0;

    Y[2 * i] = decision0 ? m1 : m0;
    Y[2 * i + 1] = decision1 ? m3 : m2;

    d->w[i / (sizeof(unsigned int) * 8 / 2) +
         s * (sizeof(decision_t) / sizeof(unsigned int))] |=
        (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
}
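
// With RATE = 2, METRICSHIFT = 1 and PRECISIONSHIFT = 2 the branch metric above
// lands in [0, 63]: metricsum is at most 1 + 2 * 255 = 511, and (511 >> 1) >> 2 = 63.
// Likewise max = (2 * (255 >> 1)) >> 2 = 63, which is the same bound the SIMD
// kernels below encode as the constant 63.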


#if LV_HAVE_AVX2

#include <immintrin.h>
#include <stdio.h>

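// Note: X, Y and Branchtab are dereferenced below as __m256i, i.e. with aligned
// 256-bit loads and stores, so callers are expected to pass 32-byte-aligned
// buffers (for example ones obtained from volk_malloc()).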
static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
                                                 unsigned char* X,
                                                 unsigned char* syms,
                                                 unsigned char* dec,
                                                 unsigned int framebits,
                                                 unsigned int excess,
                                                 unsigned char* Branchtab)
{
    unsigned int i;
    for (i = 0; i < framebits + excess; i++) {
        unsigned char* tmp;
        unsigned int* dec_int = (unsigned int*)dec;
        __m256i a76, a78, a79, a82, a84, a85, a86, a88, a89, a90, d10, d9, m23, m24, m25,
            m26, s18, s19, s22, s23, t14, t15;

        // Butterfly
        s18 = ((__m256i*)X)[0];
        s19 = ((__m256i*)X)[1];
        a76 = _mm256_set1_epi8(syms[2 * i]);
        a78 = ((__m256i*)Branchtab)[0];
        a79 = _mm256_xor_si256(a76, a78);
        a82 = _mm256_set1_epi8(syms[2 * i + 1]);
        a84 = ((__m256i*)Branchtab)[1];
        a85 = _mm256_xor_si256(a82, a84);
        a86 = _mm256_avg_epu8(a79, a85);
        a88 = _mm256_srli_epi16(a86, 2);
        t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
        t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
        m23 = _mm256_adds_epu8(s18, t14);
        m24 = _mm256_adds_epu8(s19, t15);
        m25 = _mm256_adds_epu8(s18, t15);
        m26 = _mm256_adds_epu8(s19, t14);
        a89 = _mm256_min_epu8(m24, m23);
        d9 = _mm256_cmpeq_epi8(a89, m24);
        a90 = _mm256_min_epu8(m26, m25);
        d10 = _mm256_cmpeq_epi8(a90, m26);
        s22 = _mm256_unpacklo_epi8(d9, d10);
        s23 = _mm256_unpackhi_epi8(d9, d10);
        dec_int[2 * i] = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
        dec_int[2 * i + 1] =
            _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
        s22 = _mm256_unpacklo_epi8(a89, a90);
        s23 = _mm256_unpackhi_epi8(a89, a90);
        ((__m256i*)Y)[0] = _mm256_permute2x128_si256(s22, s23, 0x20);
        ((__m256i*)Y)[1] = _mm256_permute2x128_si256(s22, s23, 0x31);

        // Renormalize
        __m256i m5, m6;
        m5 = ((__m256i*)Y)[0];
        m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
        m5 = _mm256_min_epu8(_mm256_permute2x128_si256(m5, m5, 0x21), m5);
        __m256i m7;
        m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
        m7 = _mm256_min_epu8(_mm256_srli_epi64(m7, 32), m7);
        m7 = _mm256_min_epu8(_mm256_srli_epi64(m7, 16), m7);
        m7 = _mm256_min_epu8(_mm256_srli_epi64(m7, 8), m7);
        m7 = _mm256_unpacklo_epi8(m7, m7);
        m7 = _mm256_shufflelo_epi16(m7, 0);
        m6 = _mm256_unpacklo_epi64(m7, m7);
        m6 = _mm256_permute2x128_si256(
            m6, m6, 0); // copy lower half of m6 to upper half, since above ops
                        // operate on 128 bit lanes
        ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
        ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);

        // Swap pointers to old and new metrics
        tmp = X;
        X = Y;
        Y = tmp;
    }
}

#endif /*LV_HAVE_AVX2*/


#if LV_HAVE_SSE3

#include <emmintrin.h>
#include <mmintrin.h>
#include <pmmintrin.h>
#include <stdio.h>
#include <xmmintrin.h>

static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
                                                   unsigned char* X,
                                                   unsigned char* syms,
                                                   unsigned char* dec,
                                                   unsigned int framebits,
                                                   unsigned int excess,
                                                   unsigned char* Branchtab)
{
    unsigned int i;
    for (i = 0; i < framebits + excess; i++) {
        unsigned char* tmp;
        unsigned short* dec_short = (unsigned short*)dec;
        __m128i a100, a101, a103, a104, a105, a107, a108, a109, a76, a78, a79, a82, a84,
            a85, a86, a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29,
            m30, s18, s19, s24, s25, t14, t15, t17, t18;

        // First half of butterfly
        s18 = ((__m128i*)X)[0];
        s19 = ((__m128i*)X)[2];
        a76 = _mm_set1_epi8(syms[2 * i]);
        a78 = ((__m128i*)Branchtab)[0];
        a79 = _mm_xor_si128(a76, a78);
        a82 = _mm_set1_epi8(syms[2 * i + 1]);
        a84 = ((__m128i*)Branchtab)[2];
        a85 = _mm_xor_si128(a82, a84);
        a86 = _mm_avg_epu8(a79, a85);
        a88 = _mm_srli_epi16(a86, 2);
        t14 = _mm_and_si128(a88, _mm_set1_epi8(63));
        t15 = _mm_subs_epu8(_mm_set1_epi8(63), t14);
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        dec_short[4 * i] = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        dec_short[4 * i + 1] = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        ((__m128i*)Y)[0] = _mm_unpacklo_epi8(a89, a90);
        ((__m128i*)Y)[1] = _mm_unpackhi_epi8(a89, a90);

        // Second half of butterfly
        s24 = ((__m128i*)X)[1];
        s25 = ((__m128i*)X)[3];
        a100 = ((__m128i*)Branchtab)[1];
        a101 = _mm_xor_si128(a76, a100);
        a103 = ((__m128i*)Branchtab)[3];
        a104 = _mm_xor_si128(a82, a103);
        a105 = _mm_avg_epu8(a101, a104);
        a107 = _mm_srli_epi16(a105, 2);
        t17 = _mm_and_si128(a107, _mm_set1_epi8(63));
        t18 = _mm_subs_epu8(_mm_set1_epi8(63), t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        dec_short[4 * i + 2] = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        dec_short[4 * i + 3] = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        ((__m128i*)Y)[2] = _mm_unpacklo_epi8(a108, a109);
        ((__m128i*)Y)[3] = _mm_unpackhi_epi8(a108, a109);

        // Renormalize
        __m128i m5, m6;
        m5 = ((__m128i*)Y)[0];
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
        __m128i m7;
        m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
        m7 = _mm_min_epu8(_mm_srli_epi64(m7, 32), m7);
        m7 = _mm_min_epu8(_mm_srli_epi64(m7, 16), m7);
        m7 = _mm_min_epu8(_mm_srli_epi64(m7, 8), m7);
        m7 = _mm_unpacklo_epi8(m7, m7);
        m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
        m6 = _mm_unpacklo_epi64(m7, m7);
        ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
        ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
        ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
        ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);

        // Swap pointers to old and new metrics
        tmp = X;
        X = Y;
        Y = tmp;
    }
}

#endif /*LV_HAVE_SSE3*/

#if LV_HAVE_NEON

#include <arm_neon.h>

static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
                                                       unsigned char* X,
                                                       unsigned char* syms,
                                                       unsigned char* dec,
                                                       unsigned int framebits,
                                                       unsigned int excess,
                                                       unsigned char* Branchtab)
{
    unsigned int i;
    for (i = 0; i < framebits + excess; i++) {
        unsigned char* tmp;
        unsigned int* dec_int = (unsigned int*)dec;
        uint8x16_t a100, a101, a103, a104, a105, a108, a109, a76, a78, a79, a82, a84, a85,
            a86, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
            s19, s24, s25, t14, t15, t17, t18;
        uint16x8_t high_bits;
        uint32x4_t paired16;
        uint8x16_t paired32;
        uint8x8_t left, right;
        uint8x8x2_t both;

        // First half of butterfly
        s18 = ((uint8x16_t*)X)[0];
        s19 = ((uint8x16_t*)X)[2];
        a76 = vdupq_n_u8(syms[2 * i]);
        a78 = ((uint8x16_t*)Branchtab)[0];
        a79 = veorq_u8(a76, a78);
        a82 = vdupq_n_u8(syms[2 * i + 1]);
        a84 = ((uint8x16_t*)Branchtab)[2];
        a85 = veorq_u8(a82, a84);
        a86 = vrhaddq_u8(a79, a85);
        t14 = vshrq_n_u8(a86, 2);
        t15 = vqsubq_u8(vdupq_n_u8(63), t14);
        m23 = vqaddq_u8(s18, t14);
        m24 = vqaddq_u8(s19, t15);
        m25 = vqaddq_u8(s18, t15);
        m26 = vqaddq_u8(s19, t14);
        a89 = vminq_u8(m24, m23);
        d9 = vceqq_u8(a89, m24);
        a90 = vminq_u8(m26, m25);
        d10 = vceqq_u8(a90, m26);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d9, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i] = ((unsigned int)vgetq_lane_u8(paired32, 0) << 0) |
                         ((unsigned int)vgetq_lane_u8(paired32, 4) << 8) |
                         ((unsigned int)vgetq_lane_u8(paired32, 8) << 16) |
                         ((unsigned int)vgetq_lane_u8(paired32, 12) << 24);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d10, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i] |= ((unsigned int)vgetq_lane_u8(paired32, 0) << 1) |
                          ((unsigned int)vgetq_lane_u8(paired32, 4) << 9) |
                          ((unsigned int)vgetq_lane_u8(paired32, 8) << 17) |
                          ((unsigned int)vgetq_lane_u8(paired32, 12) << 25);
        left = vget_low_u8(a89);
        right = vget_low_u8(a90);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[0] = vcombine_u8(both.val[0], both.val[1]);
        left = vget_high_u8(a89);
        right = vget_high_u8(a90);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[1] = vcombine_u8(both.val[0], both.val[1]);

        // Second half of butterfly
        s24 = ((uint8x16_t*)X)[1];
        s25 = ((uint8x16_t*)X)[3];
        a100 = ((uint8x16_t*)Branchtab)[1];
        a101 = veorq_u8(a76, a100);
        a103 = ((uint8x16_t*)Branchtab)[3];
        a104 = veorq_u8(a82, a103);
        a105 = vrhaddq_u8(a101, a104);
        t17 = vshrq_n_u8(a105, 2);
        t18 = vqsubq_u8(vdupq_n_u8(63), t17);
        m27 = vqaddq_u8(s24, t17);
        m28 = vqaddq_u8(s25, t18);
        m29 = vqaddq_u8(s24, t18);
        m30 = vqaddq_u8(s25, t17);
        a108 = vminq_u8(m28, m27);
        d11 = vceqq_u8(a108, m28);
        a109 = vminq_u8(m30, m29);
        d12 = vceqq_u8(a109, m30);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d11, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i + 1] = ((unsigned int)vgetq_lane_u8(paired32, 0) << 0) |
                             ((unsigned int)vgetq_lane_u8(paired32, 4) << 8) |
                             ((unsigned int)vgetq_lane_u8(paired32, 8) << 16) |
                             ((unsigned int)vgetq_lane_u8(paired32, 12) << 24);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d12, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i + 1] |= ((unsigned int)vgetq_lane_u8(paired32, 0) << 1) |
                              ((unsigned int)vgetq_lane_u8(paired32, 4) << 9) |
                              ((unsigned int)vgetq_lane_u8(paired32, 8) << 17) |
                              ((unsigned int)vgetq_lane_u8(paired32, 12) << 25);
        left = vget_low_u8(a108);
        right = vget_low_u8(a109);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[2] = vcombine_u8(both.val[0], both.val[1]);
        left = vget_high_u8(a108);
        right = vget_high_u8(a109);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[3] = vcombine_u8(both.val[0], both.val[1]);

        // Renormalize
        uint8x16_t m5, m6;
        m5 = ((uint8x16_t*)Y)[0];
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[1]);
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[2]);
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[3]);
        uint8x8_t m7;
        m7 = vpmin_u8(vget_low_u8(m5), vget_high_u8(m5));
        m7 = vpmin_u8(m7, m7);
        m7 = vpmin_u8(m7, m7);
        m7 = vpmin_u8(m7, m7);
        m6 = vcombine_u8(m7, m7);
        ((uint8x16_t*)Y)[0] = vqsubq_u8(((uint8x16_t*)Y)[0], m6);
        ((uint8x16_t*)Y)[1] = vqsubq_u8(((uint8x16_t*)Y)[1], m6);
        ((uint8x16_t*)Y)[2] = vqsubq_u8(((uint8x16_t*)Y)[2], m6);
        ((uint8x16_t*)Y)[3] = vqsubq_u8(((uint8x16_t*)Y)[3], m6);

        // Swap pointers to old and new metrics
        tmp = X;
        X = Y;
        Y = tmp;
    }
}

#endif /*LV_HAVE_NEON*/

#if LV_HAVE_GENERIC

static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
                                                    unsigned char* X,
                                                    unsigned char* syms,
                                                    unsigned char* dec,
                                                    unsigned int framebits,
                                                    unsigned int excess,
                                                    unsigned char* Branchtab)
{
    int nbits = framebits + excess;
    int NUMSTATES = 64;

    int s, i;
    for (s = 0; s < nbits; s++) {
        void* tmp;
        for (i = 0; i < NUMSTATES / 2; i++) {
            BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
        }

        renormalize(Y);

        // Swap pointers to old and new metrics
        tmp = (void*)X;
        X = Y;
        Y = (unsigned char*)tmp;
    }
}

#endif /* LV_HAVE_GENERIC */
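
/*
 * A minimal usage sketch (illustrative only, not part of this header): it assumes
 * the caller has already filled a 64-entry Branchtab for the k=7, rate-1/2 code,
 * supplies two soft symbols per trellis step in syms, and performs the traceback
 * itself. Buffer sizes follow the kernels above: 64 bytes of state metrics in X and
 * Y, 64 branch-table bytes, and 8 decision bytes per step. The SIMD versions read
 * these buffers with aligned vector loads, so 32-byte alignment (e.g. volk_malloc())
 * is a safe choice.
 *
 *   unsigned int framebits = 1024, excess = 6; // extra tail steps beyond framebits
 *   unsigned int nsteps = framebits + excess;
 *   unsigned char* X = (unsigned char*)volk_malloc(64, volk_get_alignment());
 *   unsigned char* Y = (unsigned char*)volk_malloc(64, volk_get_alignment());
 *   unsigned char* dec = (unsigned char*)volk_malloc(8 * nsteps, volk_get_alignment());
 *   memset(X, 0, 64);           // initial state metrics
 *   memset(dec, 0, 8 * nsteps); // the generic kernel ORs decision bits into dec
 *   volk_8u_x4_conv_k7_r2_8u_generic(Y, X, syms, dec, framebits, excess, Branchtab);
 *   // ...chain back through dec to recover the decoded bits, then free the buffers.
 */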

#if LV_HAVE_RVV
#include <riscv_vector.h>

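// The RVV kernel below always works on vl = 32 state-metric bytes at a time and
// only branches on VLEN to pick a register grouping large enough to hold them:
// LMUL = 2 for 128-bit vectors, LMUL = 1 for 256-bit vectors, LMUL = 1/2 otherwise.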
static inline void volk_8u_x4_conv_k7_r2_8u_rvv(unsigned char* Y,
                                                unsigned char* X,
                                                unsigned char* syms,
                                                unsigned char* dec,
                                                unsigned int framebits,
                                                unsigned int excess,
                                                unsigned char* Branchtab)
{
    size_t vl = 256 / 8;

    size_t n = framebits + excess;

    if (__riscv_vlenb() == 128 / 8) {
        vuint8m2_t vX0 = __riscv_vle8_v_u8m2(X, vl),
                   vX1 = __riscv_vle8_v_u8m2(X + vl, vl);
        vuint8m2_t vY0 = __riscv_vle8_v_u8m2(Y, vl),
                   vY1 = __riscv_vle8_v_u8m2(Y + vl, vl);
        vuint8m2_t vB0 = __riscv_vle8_v_u8m2(Branchtab, vl);
        vuint8m2_t vB1 = __riscv_vle8_v_u8m2(Branchtab + vl, vl);
        vuint8m2_t v63 = __riscv_vmv_v_x_u8m2(63, vl);

        for (size_t i = 0; i < n; ++i) {
            // Butterfly
            vuint8m2_t va0 = __riscv_vxor(vB0, syms[2 * i + 0], vl);
            vuint8m2_t va1 = __riscv_vxor(vB1, syms[2 * i + 1], vl);
            vuint8m2_t va = __riscv_vaaddu(va0, va1, 0, vl);
            va = __riscv_vreinterpret_u8m2(
                __riscv_vsrl(__riscv_vreinterpret_u16m2(va), 2, vl / 2));
            va = __riscv_vand(va, v63, vl);
            vuint8m2_t vb = __riscv_vssubu(v63, va, vl);
            vuint8m2_t vX0a = __riscv_vsaddu(vX0, va, vl);
            vuint8m2_t vX1b = __riscv_vsaddu(vX1, vb, vl);
            vuint8m2_t vX0b = __riscv_vsaddu(vX0, vb, vl);
            vuint8m2_t vX1a = __riscv_vsaddu(vX1, va, vl);
            vY0 = __riscv_vminu(vX1b, vX0a, vl);
            vY1 = __riscv_vminu(vX1a, vX0b, vl);

            vuint16m4_t vX1ba =
                __riscv_vwmaccu(__riscv_vwaddu_vv(vX1b, vX1a, vl), 0xFF, vX1a, vl);
            vX1b = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vX1ba), 0);
            vX1a = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vX1ba), 1);

            vuint16m4_t vm =
                __riscv_vwmaccu(__riscv_vwaddu_vv(vY0, vY1, vl), 0xFF, vY1, vl);
            vY0 = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vm), 0);
            vY1 = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vm), 1);

            __riscv_vsm(&dec[8 * i + 0], __riscv_vmseq(vY0, vX1b, vl), vl);
            __riscv_vsm(&dec[8 * i + 4], __riscv_vmseq(vY1, vX1a, vl), vl);

            // Renormalize
            vuint8m2_t vmin = __riscv_vminu(vY0, vY1, vl);
            vmin = __riscv_vlmul_ext_u8m2(
                __riscv_vredminu(vmin, __riscv_vlmul_trunc_u8m1(vmin), vl));
            vmin = __riscv_vrgather(vmin, 0, vl);
            vY0 = __riscv_vsub(vY0, vmin, vl);
            vY1 = __riscv_vsub(vY1, vmin, vl);

            vuint8m2_t tmp; // Swap pointers to old and new metrics
            tmp = vX0;
            vX0 = vY0;
            vY0 = tmp;
            tmp = vX1;
            vX1 = vY1;
            vY1 = tmp;
        }
        if (n & 1) {
            __riscv_vse8(X, vY0, vl);
            __riscv_vse8(X + vl, vY1, vl);
            __riscv_vse8(Y, vX0, vl);
            __riscv_vse8(Y + vl, vX1, vl);
        } else {
            __riscv_vse8(X, vX0, vl);
            __riscv_vse8(X + vl, vX1, vl);
            __riscv_vse8(Y, vY0, vl);
            __riscv_vse8(Y + vl, vY1, vl);
        }
    } else if (__riscv_vlenb() == 256 / 8) {
        vuint8m1_t vX0 = __riscv_vle8_v_u8m1(X, vl),
                   vX1 = __riscv_vle8_v_u8m1(X + vl, vl);
        vuint8m1_t vY0 = __riscv_vle8_v_u8m1(Y, vl),
                   vY1 = __riscv_vle8_v_u8m1(Y + vl, vl);
        vuint8m1_t vB0 = __riscv_vle8_v_u8m1(Branchtab, vl);
        vuint8m1_t vB1 = __riscv_vle8_v_u8m1(Branchtab + vl, vl);
        vuint8m1_t v63 = __riscv_vmv_v_x_u8m1(63, vl);

        for (size_t i = 0; i < n; ++i) {
            // Butterfly
            vuint8m1_t va0 = __riscv_vxor(vB0, syms[2 * i + 0], vl);
            vuint8m1_t va1 = __riscv_vxor(vB1, syms[2 * i + 1], vl);
            vuint8m1_t va = __riscv_vaaddu(va0, va1, 0, vl);
            va = __riscv_vreinterpret_u8m1(
                __riscv_vsrl(__riscv_vreinterpret_u16m1(va), 2, vl / 2));
            va = __riscv_vand(va, v63, vl);
            vuint8m1_t vb = __riscv_vssubu(v63, va, vl);
            vuint8m1_t vX0a = __riscv_vsaddu(vX0, va, vl);
            vuint8m1_t vX1b = __riscv_vsaddu(vX1, vb, vl);
            vuint8m1_t vX0b = __riscv_vsaddu(vX0, vb, vl);
            vuint8m1_t vX1a = __riscv_vsaddu(vX1, va, vl);
            vY0 = __riscv_vminu(vX1b, vX0a, vl);
            vY1 = __riscv_vminu(vX1a, vX0b, vl);

            vuint16m2_t vX1ba =
                __riscv_vwmaccu(__riscv_vwaddu_vv(vX1b, vX1a, vl), 0xFF, vX1a, vl);
            vX1b = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vX1ba), 0);
            vX1a = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vX1ba), 1);

            vuint16m2_t vm =
                __riscv_vwmaccu(__riscv_vwaddu_vv(vY0, vY1, vl), 0xFF, vY1, vl);
            vY0 = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vm), 0);
            vY1 = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vm), 1);

            __riscv_vsm(&dec[8 * i + 0], __riscv_vmseq(vY0, vX1b, vl), vl);
            __riscv_vsm(&dec[8 * i + 4], __riscv_vmseq(vY1, vX1a, vl), vl);

            // Renormalize
            vuint8m1_t vmin = __riscv_vminu(vY0, vY1, vl);
            vmin = __riscv_vrgather(__riscv_vredminu(vmin, vmin, vl), 0, vl);
            vY0 = __riscv_vsub(vY0, vmin, vl);
            vY1 = __riscv_vsub(vY1, vmin, vl);

            vuint8m1_t tmp; // Swap pointers to old and new metrics
            tmp = vX0;
            vX0 = vY0;
            vY0 = tmp;
            tmp = vX1;
            vX1 = vY1;
            vY1 = tmp;
        }
        if (n & 1) {
            __riscv_vse8(X, vY0, vl);
            __riscv_vse8(X + vl, vY1, vl);
            __riscv_vse8(Y, vX0, vl);
            __riscv_vse8(Y + vl, vX1, vl);
        } else {
            __riscv_vse8(X, vX0, vl);
            __riscv_vse8(X + vl, vX1, vl);
            __riscv_vse8(Y, vY0, vl);
            __riscv_vse8(Y + vl, vY1, vl);
        }
    } else {
        vuint8mf2_t vX0 = __riscv_vle8_v_u8mf2(X, vl),
                    vX1 = __riscv_vle8_v_u8mf2(X + vl, vl);
        vuint8mf2_t vY0 = __riscv_vle8_v_u8mf2(Y, vl),
                    vY1 = __riscv_vle8_v_u8mf2(Y + vl, vl);
        vuint8mf2_t vB0 = __riscv_vle8_v_u8mf2(Branchtab, vl);
        vuint8mf2_t vB1 = __riscv_vle8_v_u8mf2(Branchtab + vl, vl);
        vuint8mf2_t v63 = __riscv_vmv_v_x_u8mf2(63, vl);

        for (size_t i = 0; i < n; ++i) {
            // Butterfly
            vuint8mf2_t va0 = __riscv_vxor(vB0, syms[2 * i + 0], vl);
            vuint8mf2_t va1 = __riscv_vxor(vB1, syms[2 * i + 1], vl);
            vuint8mf2_t va = __riscv_vaaddu(va0, va1, 0, vl);
            va = __riscv_vreinterpret_u8mf2(
                __riscv_vsrl(__riscv_vreinterpret_u16mf2(va), 2, vl / 2));
            va = __riscv_vand(va, v63, vl);
            vuint8mf2_t vb = __riscv_vssubu(v63, va, vl);
            vuint8mf2_t vX0a = __riscv_vsaddu(vX0, va, vl);
            vuint8mf2_t vX1b = __riscv_vsaddu(vX1, vb, vl);
            vuint8mf2_t vX0b = __riscv_vsaddu(vX0, vb, vl);
            vuint8mf2_t vX1a = __riscv_vsaddu(vX1, va, vl);
            vY0 = __riscv_vminu(vX1b, vX0a, vl);
            vY1 = __riscv_vminu(vX1a, vX0b, vl);

            vuint8m1_t vX1ba = __riscv_vreinterpret_u8m1(
                __riscv_vwmaccu(__riscv_vwaddu_vv(vX1b, vX1a, vl), 0xFF, vX1a, vl));
            vuint8m1_t vY01 = __riscv_vreinterpret_u8m1(
                __riscv_vwmaccu(__riscv_vwaddu_vv(vY0, vY1, vl), 0xFF, vY1, vl));

            __riscv_vsm(&dec[8 * i + 0], __riscv_vmseq(vY01, vX1ba, vl * 2), vl * 2);

            // Renormalize
            vuint8m1_t vmin =
                __riscv_vrgather(__riscv_vredminu(vY01, vY01, vl * 2), 0, vl * 2);
            vY01 = __riscv_vsub(vY01, vmin, vl * 2);

            vY0 = __riscv_vlmul_trunc_u8mf2(vY01);
            vY1 = __riscv_vlmul_trunc_u8mf2(__riscv_vslidedown(vY01, vl, vl));

            vuint8mf2_t tmp; // Swap pointers to old and new metrics
            tmp = vX0;
            vX0 = vY0;
            vY0 = tmp;
            tmp = vX1;
            vX1 = vY1;
            vY1 = tmp;
        }
        if (n & 1) {
            __riscv_vse8(X, vY0, vl);
            __riscv_vse8(X + vl, vY1, vl);
            __riscv_vse8(Y, vX0, vl);
            __riscv_vse8(Y + vl, vX1, vl);
        } else {
            __riscv_vse8(X, vX0, vl);
            __riscv_vse8(X + vl, vX1, vl);
            __riscv_vse8(Y, vY0, vl);
            __riscv_vse8(Y + vl, vY1, vl);
        }
    }
}
#endif /*LV_HAVE_RVV*/

#endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/
Definition volk_config_fixed.tmpl.h:13