#ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
typedef union {
    unsigned char t[64 / 8];
    unsigned int w[64 / 32];
    unsigned short s[64 / 16];
    unsigned char c[64 / 8];
} decision_t;
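/* One decision_t holds the 64 decision bits produced for one trellis step
 * (one bit per state), viewable as bytes, 16-bit words, or 32-bit words
 * depending on how a particular kernel prefers to write them. */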
static inline void renormalize(unsigned char* X)
{
    int NUMSTATES = 64;
    int i;

    unsigned char min = X[0];
    for (i = 0; i < NUMSTATES; i++) {
        if (X[i] < min)
            min = X[i];
    }
    for (i = 0; i < NUMSTATES; i++) {
        X[i] -= min;
    }
}
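/* Path metrics are stored as unsigned bytes, so after every trellis step the
 * smallest metric is subtracted from all 64 states to keep the values in
 * range. The SIMD kernels below do the same thing with a horizontal minimum
 * followed by a saturating subtract. */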
static inline void BFLY(int i,
                        int s,
                        unsigned char* syms,
                        unsigned char* Y,
                        unsigned char* X,
                        decision_t* d,
                        unsigned char* Branchtab)
{
    int j;
    unsigned int decision0, decision1;
    unsigned char metric, m0, m1, m2, m3;
    unsigned short metricsum;

    int NUMSTATES = 64;
    int RATE = 2;
    int METRICSHIFT = 1; /* with RATE = 2 this gives max = 63, the scale the SIMD kernels use */
    int PRECISIONSHIFT = 2;

    metricsum = 1; /* the +1 reproduces the rounding of the SIMD averaging, (a + b + 1) >> 1 */
    for (j = 0; j < RATE; j++) {
        metricsum += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]);
    }
    metric = (metricsum >> METRICSHIFT) >> PRECISIONSHIFT;

    unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);

    m0 = X[i] + metric;
    m1 = X[i + NUMSTATES / 2] + (max - metric);
    m2 = X[i] + (max - metric);
    m3 = X[i + NUMSTATES / 2] + metric;

    decision0 = (signed int)(m0 - m1) >= 0;
    decision1 = (signed int)(m2 - m3) >= 0;

    Y[2 * i] = decision0 ? m1 : m0;
    Y[2 * i + 1] = decision1 ? m3 : m2;

    d->w[i / (sizeof(unsigned int) * 8 / 2) +
         s * (sizeof(decision_t) / sizeof(unsigned int))] |=
        (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
}
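/* BFLY is one add-compare-select butterfly of the k = 7 (64-state), rate-1/2
 * Viterbi decoder: the XOR against Branchtab gives the branch metric for the
 * two received symbols, the four additions extend the two competing paths
 * into states 2*i and 2*i+1, and the winners' decision bits are packed two at
 * a time into d->w. Illustrative read-back (an assumption about how a
 * traceback would use this packing, not code from this header, assuming a
 * 32-bit unsigned int): the decision bit for state st at step s is
 *     (d[s].w[st / 32] >> (st % 32)) & 1
 */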
#include <immintrin.h>

static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
                                                 unsigned char* X,
                                                 unsigned char* syms,
                                                 unsigned char* dec,
                                                 unsigned int framebits,
                                                 unsigned int excess,
                                                 unsigned char* Branchtab)
{
    unsigned int i;
    for (i = 0; i < framebits + excess; i++) {
        unsigned int* dec_int = (unsigned int*)dec;
        __m256i a76, a78, a79, a82, a84, a85, a86, a88, a89, a90, d10, d9, m23, m24, m25,
            m26, s18, s19, s22, s23, t14, t15;
        __m256i m5, m6, m7;

        s18 = ((__m256i*)X)[0];
        s19 = ((__m256i*)X)[1];
        a76 = _mm256_set1_epi8(syms[2 * i]);
        a78 = ((__m256i*)Branchtab)[0];
        a79 = _mm256_xor_si256(a76, a78);
        a82 = _mm256_set1_epi8(syms[2 * i + 1]);
        a84 = ((__m256i*)Branchtab)[1];
        a85 = _mm256_xor_si256(a82, a84);
        a86 = _mm256_avg_epu8(a79, a85);
        a88 = _mm256_srli_epi16(a86, 2);
        t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
        t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
        m23 = _mm256_adds_epu8(s18, t14);
        m24 = _mm256_adds_epu8(s19, t15);
        m25 = _mm256_adds_epu8(s18, t15);
        m26 = _mm256_adds_epu8(s19, t14);
        a89 = _mm256_min_epu8(m24, m23);
        d9 = _mm256_cmpeq_epi8(a89, m24);
        a90 = _mm256_min_epu8(m26, m25);
        d10 = _mm256_cmpeq_epi8(a90, m26);
        s22 = _mm256_unpacklo_epi8(d9, d10);
        s23 = _mm256_unpackhi_epi8(d9, d10);
        dec_int[2 * i] = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
        dec_int[2 * i + 1] =
            _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
        s22 = _mm256_unpacklo_epi8(a89, a90);
        s23 = _mm256_unpackhi_epi8(a89, a90);
        ((__m256i*)Y)[0] = _mm256_permute2x128_si256(s22, s23, 0x20);
        ((__m256i*)Y)[1] = _mm256_permute2x128_si256(s22, s23, 0x31);

        // Renormalize: find the minimum metric across all 64 states and subtract it.
        m5 = ((__m256i*)Y)[0];
        m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
        m5 = _mm256_min_epu8(_mm256_permute2x128_si256(m5, m5, 0x21), m5);
        m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
        m7 = _mm256_min_epu8(_mm256_srli_epi64(m7, 32), m7);
        m7 = _mm256_min_epu8(_mm256_srli_epi64(m7, 16), m7);
        m7 = _mm256_min_epu8(_mm256_srli_epi64(m7, 8), m7);
        m7 = _mm256_unpacklo_epi8(m7, m7);
        m7 = _mm256_shufflelo_epi16(m7, 0);
        m6 = _mm256_unpacklo_epi64(m7, m7);
        m6 = _mm256_permute2x128_si256(m6, m6, 0);
        ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
        ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);

        /* X and Y exchange roles here so the new metrics feed the next step
         * (see the pointer swap in the generic version below). */
    }
}
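/* The AVX2 path is BFLY vectorized over all 64 states at once: the XOR with
 * Branchtab and the rounding average (_mm256_avg_epu8 followed by the shift
 * and the mask with 63) form the branch metric, the saturating adds together
 * with the _mm256_min_epu8/_mm256_cmpeq_epi8 pairs are the add-compare-select,
 * the movemask of the comparison results yields the packed decision bits, and
 * the horizontal minimum plus saturating subtract is the renormalization. */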
#include <emmintrin.h>
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
                                                   unsigned char* X,
                                                   unsigned char* syms,
                                                   unsigned char* dec,
                                                   unsigned int framebits,
                                                   unsigned int excess,
                                                   unsigned char* Branchtab)
{
    unsigned int i;
    for (i = 0; i < framebits + excess; i++) {
        unsigned short* dec_short = (unsigned short*)dec;
        __m128i a100, a101, a103, a104, a105, a107, a108, a109, a76, a78, a79, a82, a84,
            a85, a86, a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29,
            m30, s18, s19, s24, s25, t14, t15, t17, t18;
        __m128i m5, m6, m7;

        // First 32 of the 64 states.
        s18 = ((__m128i*)X)[0];
        s19 = ((__m128i*)X)[2];
        a76 = _mm_set1_epi8(syms[2 * i]);
        a78 = ((__m128i*)Branchtab)[0];
        a79 = _mm_xor_si128(a76, a78);
        a82 = _mm_set1_epi8(syms[2 * i + 1]);
        a84 = ((__m128i*)Branchtab)[2];
        a85 = _mm_xor_si128(a82, a84);
        a86 = _mm_avg_epu8(a79, a85);
        a88 = _mm_srli_epi16(a86, 2);
        t14 = _mm_and_si128(a88, _mm_set1_epi8(63));
        t15 = _mm_subs_epu8(_mm_set1_epi8(63), t14);
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        dec_short[4 * i] = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        dec_short[4 * i + 1] = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        ((__m128i*)Y)[0] = _mm_unpacklo_epi8(a89, a90);
        ((__m128i*)Y)[1] = _mm_unpackhi_epi8(a89, a90);

        // Second 32 states.
        s24 = ((__m128i*)X)[1];
        s25 = ((__m128i*)X)[3];
        a100 = ((__m128i*)Branchtab)[1];
        a101 = _mm_xor_si128(a76, a100);
        a103 = ((__m128i*)Branchtab)[3];
        a104 = _mm_xor_si128(a82, a103);
        a105 = _mm_avg_epu8(a101, a104);
        a107 = _mm_srli_epi16(a105, 2);
        t17 = _mm_and_si128(a107, _mm_set1_epi8(63));
        t18 = _mm_subs_epu8(_mm_set1_epi8(63), t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        dec_short[4 * i + 2] = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        dec_short[4 * i + 3] = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        ((__m128i*)Y)[2] = _mm_unpacklo_epi8(a108, a109);
        ((__m128i*)Y)[3] = _mm_unpackhi_epi8(a108, a109);

        // Renormalize: subtract the minimum metric from all 64 states.
        m5 = ((__m128i*)Y)[0];
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
        m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
        m7 = _mm_min_epu8(_mm_srli_epi64(m7, 32), m7);
        m7 = _mm_min_epu8(_mm_srli_epi64(m7, 16), m7);
        m7 = _mm_min_epu8(_mm_srli_epi64(m7, 8), m7);
        m7 = _mm_unpacklo_epi8(m7, m7);
        m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
        m6 = _mm_unpacklo_epi64(m7, m7);
        ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
        ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
        ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
        ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);

        /* X and Y exchange roles here so the new metrics feed the next step
         * (see the pointer swap in the generic version below). */
    }
}
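/* The SSE variant follows the same dataflow as the AVX2 kernel, but the 64
 * states are processed as four 16-byte __m128i quarters instead of two
 * 32-byte halves, and the decision bits are written as four 16-bit movemask
 * results per step instead of two 32-bit ones. */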

#include <arm_neon.h>

static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
                                                       unsigned char* X,
                                                       unsigned char* syms,
                                                       unsigned char* dec,
                                                       unsigned int framebits,
                                                       unsigned int excess,
                                                       unsigned char* Branchtab)
{
    unsigned int i;
    for (i = 0; i < framebits + excess; i++) {
        unsigned int* dec_int = (unsigned int*)dec;
        uint8x16_t a100, a101, a103, a104, a105, a108, a109, a76, a78, a79, a82, a84, a85,
            a86, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
            s19, s24, s25, t14, t15, t17, t18;
        uint16x8_t high_bits;
        uint32x4_t paired16;
        uint8x16_t paired32;
        uint8x8_t left, right;
        uint8x8x2_t both;
        uint8x16_t m5, m6;
        uint8x8_t m7;

        // First 32 of the 64 states.
        s18 = ((uint8x16_t*)X)[0];
        s19 = ((uint8x16_t*)X)[2];
        a76 = vdupq_n_u8(syms[2 * i]);
        a78 = ((uint8x16_t*)Branchtab)[0];
        a79 = veorq_u8(a76, a78);
        a82 = vdupq_n_u8(syms[2 * i + 1]);
        a84 = ((uint8x16_t*)Branchtab)[2];
        a85 = veorq_u8(a82, a84);
        a86 = vrhaddq_u8(a79, a85);
        t14 = vshrq_n_u8(a86, 2);
        t15 = vqsubq_u8(vdupq_n_u8(63), t14);
        m23 = vqaddq_u8(s18, t14);
        m24 = vqaddq_u8(s19, t15);
        m25 = vqaddq_u8(s18, t15);
        m26 = vqaddq_u8(s19, t14);
        a89 = vminq_u8(m24, m23);
        d9 = vceqq_u8(a89, m24);
        a90 = vminq_u8(m26, m25);
        d10 = vceqq_u8(a90, m26);
        // Pack the comparison masks into decision bits (movemask emulation).
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d9, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i] = ((unsigned int)vgetq_lane_u8(paired32, 0) << 0) |
                         ((unsigned int)vgetq_lane_u8(paired32, 4) << 8) |
                         ((unsigned int)vgetq_lane_u8(paired32, 8) << 16) |
                         ((unsigned int)vgetq_lane_u8(paired32, 12) << 24);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d10, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i] |= ((unsigned int)vgetq_lane_u8(paired32, 0) << 1) |
                          ((unsigned int)vgetq_lane_u8(paired32, 4) << 9) |
                          ((unsigned int)vgetq_lane_u8(paired32, 8) << 17) |
                          ((unsigned int)vgetq_lane_u8(paired32, 12) << 25);
        left = vget_low_u8(a89);
        right = vget_low_u8(a90);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[0] = vcombine_u8(both.val[0], both.val[1]);
        left = vget_high_u8(a89);
        right = vget_high_u8(a90);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[1] = vcombine_u8(both.val[0], both.val[1]);

        // Second 32 states.
        s24 = ((uint8x16_t*)X)[1];
        s25 = ((uint8x16_t*)X)[3];
        a100 = ((uint8x16_t*)Branchtab)[1];
        a101 = veorq_u8(a76, a100);
        a103 = ((uint8x16_t*)Branchtab)[3];
        a104 = veorq_u8(a82, a103);
        a105 = vrhaddq_u8(a101, a104);
        t17 = vshrq_n_u8(a105, 2);
        t18 = vqsubq_u8(vdupq_n_u8(63), t17);
        m27 = vqaddq_u8(s24, t17);
        m28 = vqaddq_u8(s25, t18);
        m29 = vqaddq_u8(s24, t18);
        m30 = vqaddq_u8(s25, t17);
        a108 = vminq_u8(m28, m27);
        d11 = vceqq_u8(a108, m28);
        a109 = vminq_u8(m30, m29);
        d12 = vceqq_u8(a109, m30);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d11, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i + 1] = ((unsigned int)vgetq_lane_u8(paired32, 0) << 0) |
                             ((unsigned int)vgetq_lane_u8(paired32, 4) << 8) |
                             ((unsigned int)vgetq_lane_u8(paired32, 8) << 16) |
                             ((unsigned int)vgetq_lane_u8(paired32, 12) << 24);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d12, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i + 1] |= ((unsigned int)vgetq_lane_u8(paired32, 0) << 1) |
                              ((unsigned int)vgetq_lane_u8(paired32, 4) << 9) |
                              ((unsigned int)vgetq_lane_u8(paired32, 8) << 17) |
                              ((unsigned int)vgetq_lane_u8(paired32, 12) << 25);
        left = vget_low_u8(a108);
        right = vget_low_u8(a109);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[2] = vcombine_u8(both.val[0], both.val[1]);
        left = vget_high_u8(a108);
        right = vget_high_u8(a109);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[3] = vcombine_u8(both.val[0], both.val[1]);

        // Renormalize: subtract the minimum metric from all 64 states.
        m5 = ((uint8x16_t*)Y)[0];
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[1]);
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[2]);
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[3]);
        m7 = vpmin_u8(vget_low_u8(m5), vget_high_u8(m5));
        m7 = vpmin_u8(m7, m7);
        m7 = vpmin_u8(m7, m7);
        m7 = vpmin_u8(m7, m7);
        m6 = vcombine_u8(m7, m7);
        ((uint8x16_t*)Y)[0] = vqsubq_u8(((uint8x16_t*)Y)[0], m6);
        ((uint8x16_t*)Y)[1] = vqsubq_u8(((uint8x16_t*)Y)[1], m6);
        ((uint8x16_t*)Y)[2] = vqsubq_u8(((uint8x16_t*)Y)[2], m6);
        ((uint8x16_t*)Y)[3] = vqsubq_u8(((uint8x16_t*)Y)[3], m6);

        /* X and Y exchange roles here so the new metrics feed the next step
         * (see the pointer swap in the generic version below). */
    }
}
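/* NEON has no movemask instruction, so the comparison masks d9..d12 are
 * reduced to packed bits by shifting each byte's MSB down (vshrq_n_u8(.., 7))
 * and folding neighbouring lanes together with the accumulating shifts
 * vsraq_n_u16 / vsraq_n_u32; afterwards one byte per 32-bit lane carries four
 * decision bits, and those bytes are assembled into the two 32-bit decision
 * words, with the second mask's bits landing in the odd bit positions. */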
static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
                                                    unsigned char* X,
                                                    unsigned char* syms,
                                                    unsigned char* dec,
                                                    unsigned int framebits,
                                                    unsigned int excess,
                                                    unsigned char* Branchtab)
{
    int nbits = framebits + excess;
    int NUMSTATES = 64;
    int s, i;
    void* tmp;

    for (s = 0; s < nbits; s++) {
        for (i = 0; i < NUMSTATES / 2; i++) {
            BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
        }

        renormalize(Y);

        // Swap pointers to old and new metrics for the next step.
        tmp = (void*)X;
        X = Y;
        Y = (unsigned char*)tmp;
    }
}
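/* A minimal caller sketch. This is illustrative only and not part of the
 * header: the buffer sizes are inferred from the indexing above (64 path
 * metric bytes in X and Y, a 64-byte branch table, two soft symbols per
 * trellis step in syms, and 8 decision bytes per step in dec, zeroed in
 * advance because BFLY ORs bits into them). The wrapper name below is
 * hypothetical. */
static inline void example_conv_k7_r2_call(unsigned char* syms,
                                           unsigned char* dec,
                                           unsigned int framebits,
                                           unsigned int excess,
                                           unsigned char* Branchtab)
{
    unsigned char X[64] = { 0 }; /* old path metrics, one byte per state */
    unsigned char Y[64] = { 0 }; /* new path metrics written each step */
    volk_8u_x4_conv_k7_r2_8u_generic(Y, X, syms, dec, framebits, excess, Branchtab);
}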
#include <riscv_vector.h>

static inline void volk_8u_x4_conv_k7_r2_8u_rvv(unsigned char* Y,
                                                unsigned char* X,
                                                unsigned char* syms,
                                                unsigned char* dec,
                                                unsigned int framebits,
                                                unsigned int excess,
                                                unsigned char* Branchtab)
{
    size_t n = framebits + excess;
    size_t vl = 32; /* each half of the 64 path metrics is handled as one 32-byte vector */
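    /* Three code paths, selected by the runtime vector length: VLEN = 128
     * uses LMUL = 2 register groups, VLEN = 256 uses LMUL = 1, and wider
     * machines use LMUL = 1/2, so that one register group holds half of the
     * 64 path metrics. The widening add / multiply-accumulate pair inside
     * each loop interleaves two byte vectors (a zip), playing the role of the
     * unpacklo/unpackhi steps in the SSE and AVX2 kernels. */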
    if (__riscv_vlenb() == 128 / 8) {
        vuint8m2_t vX0 = __riscv_vle8_v_u8m2(X, vl),
                   vX1 = __riscv_vle8_v_u8m2(X + vl, vl);
        vuint8m2_t vY0 = __riscv_vle8_v_u8m2(Y, vl),
                   vY1 = __riscv_vle8_v_u8m2(Y + vl, vl);
        vuint8m2_t vB0 = __riscv_vle8_v_u8m2(Branchtab, vl);
        vuint8m2_t vB1 = __riscv_vle8_v_u8m2(Branchtab + vl, vl);
        vuint8m2_t v63 = __riscv_vmv_v_x_u8m2(63, vl);

        for (size_t i = 0; i < n; ++i) {
            vuint8m2_t va0 = __riscv_vxor(vB0, syms[2 * i + 0], vl);
            vuint8m2_t va1 = __riscv_vxor(vB1, syms[2 * i + 1], vl);
            vuint8m2_t va = __riscv_vaaddu(va0, va1, 0, vl);
            va = __riscv_vreinterpret_u8m2(
                __riscv_vsrl(__riscv_vreinterpret_u16m2(va), 2, vl / 2));
            va = __riscv_vand(va, v63, vl);
            vuint8m2_t vb = __riscv_vssubu(v63, va, vl);
            vuint8m2_t vX0a = __riscv_vsaddu(vX0, va, vl);
            vuint8m2_t vX1b = __riscv_vsaddu(vX1, vb, vl);
            vuint8m2_t vX0b = __riscv_vsaddu(vX0, vb, vl);
            vuint8m2_t vX1a = __riscv_vsaddu(vX1, va, vl);
            vY0 = __riscv_vminu(vX1b, vX0a, vl);
            vY1 = __riscv_vminu(vX1a, vX0b, vl);

            // Interleave (vX1b, vX1a) and (vY0, vY1) byte-wise via the widening trick.
            vuint16m4_t vX1ba =
                __riscv_vwmaccu(__riscv_vwaddu_vv(vX1b, vX1a, vl), 0xFF, vX1a, vl);
            vX1b = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vX1ba), 0);
            vX1a = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vX1ba), 1);
            vuint16m4_t vm =
                __riscv_vwmaccu(__riscv_vwaddu_vv(vY0, vY1, vl), 0xFF, vY1, vl);
            vY0 = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vm), 0);
            vY1 = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vm), 1);

            __riscv_vsm(&dec[8 * i + 0], __riscv_vmseq(vY0, vX1b, vl), vl);
            __riscv_vsm(&dec[8 * i + 4], __riscv_vmseq(vY1, vX1a, vl), vl);

            // Renormalize.
            vuint8m2_t vmin = __riscv_vminu(vY0, vY1, vl);
            vmin = __riscv_vlmul_ext_u8m2(
                __riscv_vredminu(vmin, __riscv_vlmul_trunc_u8m1(vmin), vl));
            vmin = __riscv_vrgather(vmin, 0, vl);
            vY0 = __riscv_vsub(vY0, vmin, vl);
            vY1 = __riscv_vsub(vY1, vmin, vl);

            // Swap old and new metrics for the next step.
            vuint8m2_t vt0 = vX0, vt1 = vX1;
            vX0 = vY0;
            vX1 = vY1;
            vY0 = vt0;
            vY1 = vt1;
        }

        /* After an odd number of steps the newest metrics belong in Y,
         * mirroring the pointer swap of the generic version. */
        if (n & 1) {
            __riscv_vse8(X, vY0, vl);
            __riscv_vse8(X + vl, vY1, vl);
            __riscv_vse8(Y, vX0, vl);
            __riscv_vse8(Y + vl, vX1, vl);
        } else {
            __riscv_vse8(X, vX0, vl);
            __riscv_vse8(X + vl, vX1, vl);
            __riscv_vse8(Y, vY0, vl);
            __riscv_vse8(Y + vl, vY1, vl);
        }
    } else if (__riscv_vlenb() == 256 / 8) {
        vuint8m1_t vX0 = __riscv_vle8_v_u8m1(X, vl),
                   vX1 = __riscv_vle8_v_u8m1(X + vl, vl);
        vuint8m1_t vY0 = __riscv_vle8_v_u8m1(Y, vl),
                   vY1 = __riscv_vle8_v_u8m1(Y + vl, vl);
        vuint8m1_t vB0 = __riscv_vle8_v_u8m1(Branchtab, vl);
        vuint8m1_t vB1 = __riscv_vle8_v_u8m1(Branchtab + vl, vl);
        vuint8m1_t v63 = __riscv_vmv_v_x_u8m1(63, vl);

        for (size_t i = 0; i < n; ++i) {
            vuint8m1_t va0 = __riscv_vxor(vB0, syms[2 * i + 0], vl);
            vuint8m1_t va1 = __riscv_vxor(vB1, syms[2 * i + 1], vl);
            vuint8m1_t va = __riscv_vaaddu(va0, va1, 0, vl);
            va = __riscv_vreinterpret_u8m1(
                __riscv_vsrl(__riscv_vreinterpret_u16m1(va), 2, vl / 2));
            va = __riscv_vand(va, v63, vl);
            vuint8m1_t vb = __riscv_vssubu(v63, va, vl);
            vuint8m1_t vX0a = __riscv_vsaddu(vX0, va, vl);
            vuint8m1_t vX1b = __riscv_vsaddu(vX1, vb, vl);
            vuint8m1_t vX0b = __riscv_vsaddu(vX0, vb, vl);
            vuint8m1_t vX1a = __riscv_vsaddu(vX1, va, vl);
            vY0 = __riscv_vminu(vX1b, vX0a, vl);
            vY1 = __riscv_vminu(vX1a, vX0b, vl);

            // Interleave (vX1b, vX1a) and (vY0, vY1) byte-wise via the widening trick.
            vuint16m2_t vX1ba =
                __riscv_vwmaccu(__riscv_vwaddu_vv(vX1b, vX1a, vl), 0xFF, vX1a, vl);
            vX1b = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vX1ba), 0);
            vX1a = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vX1ba), 1);
            vuint16m2_t vm =
                __riscv_vwmaccu(__riscv_vwaddu_vv(vY0, vY1, vl), 0xFF, vY1, vl);
            vY0 = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vm), 0);
            vY1 = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vm), 1);

            __riscv_vsm(&dec[8 * i + 0], __riscv_vmseq(vY0, vX1b, vl), vl);
            __riscv_vsm(&dec[8 * i + 4], __riscv_vmseq(vY1, vX1a, vl), vl);

            // Renormalize.
            vuint8m1_t vmin = __riscv_vminu(vY0, vY1, vl);
            vmin = __riscv_vrgather(__riscv_vredminu(vmin, vmin, vl), 0, vl);
            vY0 = __riscv_vsub(vY0, vmin, vl);
            vY1 = __riscv_vsub(vY1, vmin, vl);

            // Swap old and new metrics for the next step.
            vuint8m1_t vt0 = vX0, vt1 = vX1;
            vX0 = vY0;
            vX1 = vY1;
            vY0 = vt0;
            vY1 = vt1;
        }

        /* After an odd number of steps the newest metrics belong in Y,
         * mirroring the pointer swap of the generic version. */
        if (n & 1) {
            __riscv_vse8(X, vY0, vl);
            __riscv_vse8(X + vl, vY1, vl);
            __riscv_vse8(Y, vX0, vl);
            __riscv_vse8(Y + vl, vX1, vl);
        } else {
            __riscv_vse8(X, vX0, vl);
            __riscv_vse8(X + vl, vX1, vl);
            __riscv_vse8(Y, vY0, vl);
            __riscv_vse8(Y + vl, vY1, vl);
        }
    } else {
        vuint8mf2_t vX0 = __riscv_vle8_v_u8mf2(X, vl),
                    vX1 = __riscv_vle8_v_u8mf2(X + vl, vl);
        vuint8mf2_t vY0 = __riscv_vle8_v_u8mf2(Y, vl),
                    vY1 = __riscv_vle8_v_u8mf2(Y + vl, vl);
        vuint8mf2_t vB0 = __riscv_vle8_v_u8mf2(Branchtab, vl);
        vuint8mf2_t vB1 = __riscv_vle8_v_u8mf2(Branchtab + vl, vl);
        vuint8mf2_t v63 = __riscv_vmv_v_x_u8mf2(63, vl);

        for (size_t i = 0; i < n; ++i) {
            vuint8mf2_t va0 = __riscv_vxor(vB0, syms[2 * i + 0], vl);
            vuint8mf2_t va1 = __riscv_vxor(vB1, syms[2 * i + 1], vl);
            vuint8mf2_t va = __riscv_vaaddu(va0, va1, 0, vl);
            va = __riscv_vreinterpret_u8mf2(
                __riscv_vsrl(__riscv_vreinterpret_u16mf2(va), 2, vl / 2));
            va = __riscv_vand(va, v63, vl);
            vuint8mf2_t vb = __riscv_vssubu(v63, va, vl);
            vuint8mf2_t vX0a = __riscv_vsaddu(vX0, va, vl);
            vuint8mf2_t vX1b = __riscv_vsaddu(vX1, vb, vl);
            vuint8mf2_t vX0b = __riscv_vsaddu(vX0, vb, vl);
            vuint8mf2_t vX1a = __riscv_vsaddu(vX1, va, vl);
            vY0 = __riscv_vminu(vX1b, vX0a, vl);
            vY1 = __riscv_vminu(vX1a, vX0b, vl);

            // Interleave both halves into full m1 vectors via the widening trick.
            vuint8m1_t vX1ba = __riscv_vreinterpret_u8m1(
                __riscv_vwmaccu(__riscv_vwaddu_vv(vX1b, vX1a, vl), 0xFF, vX1a, vl));
            vuint8m1_t vY01 = __riscv_vreinterpret_u8m1(
                __riscv_vwmaccu(__riscv_vwaddu_vv(vY0, vY1, vl), 0xFF, vY1, vl));

            __riscv_vsm(&dec[8 * i + 0], __riscv_vmseq(vY01, vX1ba, vl * 2), vl * 2);

            // Renormalize.
            vuint8m1_t vmin =
                __riscv_vrgather(__riscv_vredminu(vY01, vY01, vl * 2), 0, vl * 2);
            vY01 = __riscv_vsub(vY01, vmin, vl * 2);

            vY0 = __riscv_vlmul_trunc_u8mf2(vY01);
            vY1 = __riscv_vlmul_trunc_u8mf2(__riscv_vslidedown(vY01, vl, vl));

            // Swap old and new metrics for the next step.
            vuint8mf2_t vt0 = vX0, vt1 = vX1;
            vX0 = vY0;
            vX1 = vY1;
            vY0 = vt0;
            vY1 = vt1;
        }

        /* After an odd number of steps the newest metrics belong in Y,
         * mirroring the pointer swap of the generic version. */
        if (n & 1) {
            __riscv_vse8(X, vY0, vl);
            __riscv_vse8(X + vl, vY1, vl);
            __riscv_vse8(Y, vX0, vl);
            __riscv_vse8(Y + vl, vX1, vl);
        } else {
            __riscv_vse8(X, vX0, vl);
            __riscv_vse8(X + vl, vX1, vl);
            __riscv_vse8(Y, vY0, vl);
            __riscv_vse8(Y + vl, vY1, vl);
        }
    }
}

#endif /* INCLUDED_volk_8u_x4_conv_k7_r2_8u_H */