Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
volk_32f_8u_polarbutterfly_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
#define VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
#include <math.h>
#include <stdbool.h> // bool in the generic kernel
#include <string.h>  // memcpy in the SIMD kernels
#include <volk/volk_8u_x2_encodeframepolar_8u.h> // re-encoding kernels used below

static inline float llr_odd(const float la, const float lb)
{
    const float ala = fabsf(la);
    const float alb = fabsf(lb);
    return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala);
}
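
/*
 * Note: llr_odd() computes the min-sum approximation of the polar f-function
 * (check-node update) between two right-hand LLRs:
 *
 *   f(L_a, L_b) = sign(L_a) * sign(L_b) * min(|L_a|, |L_b|)
 *              ~= 2 * atanh(tanh(L_a / 2) * tanh(L_b / 2))
 */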

static inline void llr_odd_stages(
    float* llrs, int min_stage, const int depth, const int frame_size, const int row)
{
    int loop_stage = depth - 1;
    float* dst_llr_ptr;
    float* src_llr_ptr;
    int stage_size = 0x01 << loop_stage;

    int el;
    while (min_stage <= loop_stage) {
        dst_llr_ptr = llrs + loop_stage * frame_size + row;
        src_llr_ptr = dst_llr_ptr + frame_size;
        for (el = 0; el < stage_size; el++) {
            *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1));
            src_llr_ptr += 2;
        }

        --loop_stage;
        stage_size >>= 1;
    }
}

static inline float llr_even(const float la, const float lb, const unsigned char f)
{
    switch (f) {
    case 0:
        return lb + la;
    default:
        return lb - la;
    }
}
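
/*
 * Note: llr_even() is the polar g-function (variable-node update), where the
 * partial-sum bit f selects between addition and subtraction:
 *
 *   g(L_a, L_b, u) = L_b + (1 - 2u) * L_a
 */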

static inline void
even_u_values(unsigned char* u_even, const unsigned char* u, const int u_num)
{
    u++;
    int i;
    for (i = 1; i < u_num; i += 2) {
        *u_even++ = *u;
        u += 2;
    }
}

static inline void
odd_xor_even_values(unsigned char* u_xor, const unsigned char* u, const int u_num)
{
    int i;
    for (i = 1; i < u_num; i += 2) {
        *u_xor++ = *u ^ *(u + 1);
        u += 2;
    }
}
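
/*
 * Illustration: for u_num = 4 decoded bits (u0, u1, u2, u3), even_u_values()
 * extracts (u1, u3) and odd_xor_even_values() produces (u0 ^ u1, u2 ^ u3).
 * Together they perform one butterfly step of re-encoding the already decoded
 * bits, which provides the partial sums consumed by llr_even().
 */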

static inline int calculate_max_stage_depth_for_row(const int frame_exp, const int row)
{
    int max_stage_depth = 0;
    int half_stage_size = 0x01;
    int stage_size = half_stage_size << 1;
    while (max_stage_depth < (frame_exp - 1)) { // last stage holds received values.
        if (!(row % stage_size < half_stage_size)) {
            break;
        }
        half_stage_size <<= 1;
        stage_size <<= 1;
        max_stage_depth++;
    }
    return max_stage_depth;
}
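
/*
 * In effect this is the number of trailing zero bits of row, capped at
 * frame_exp - 1 because the last stage holds the received channel values.
 * For frame_exp = 4: row 1 -> 0, row 2 -> 1, row 4 -> 2, row 0 or 8 -> 3.
 */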

#ifdef LV_HAVE_GENERIC

static inline void volk_32f_8u_polarbutterfly_32f_generic(float* llrs,
                                                          unsigned char* u,
                                                          const int frame_exp,
                                                          const int stage,
                                                          const int u_num,
                                                          const int row)
{
    const int frame_size = 0x01 << frame_exp;
    const int next_stage = stage + 1;

    const int half_stage_size = 0x01 << stage;
    const int stage_size = half_stage_size << 1;

    const bool is_upper_stage_half = row % stage_size < half_stage_size;

    // this is a natural bit order impl
    float* next_llrs = llrs + frame_size; // LLRs are stored in a consecutive array.
    float* call_row_llr = llrs + row;

    const int section = row - (row % stage_size);
    const int jump_size = ((row % half_stage_size) << 1) % stage_size;

    const int next_upper_row = section + jump_size;
    const int next_lower_row = next_upper_row + 1;

    const float* upper_right_llr_ptr = next_llrs + next_upper_row;
    const float* lower_right_llr_ptr = next_llrs + next_lower_row;

    if (!is_upper_stage_half) {
        const int u_pos = u_num >> stage;
        const unsigned char f = u[u_pos - 1];
        *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f);
        return;
    }

    if (frame_exp > next_stage) {
        unsigned char* u_half = u + frame_size;
        odd_xor_even_values(u_half, u, u_num);
        volk_32f_8u_polarbutterfly_32f_generic(
            next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row);

        even_u_values(u_half, u, u_num);
        volk_32f_8u_polarbutterfly_32f_generic(
            next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row);
    }

    *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr);
}

#endif /* LV_HAVE_GENERIC */
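
/*
 * Minimal usage sketch (illustrative only, not part of this header): a
 * successive-cancellation decoder can drive the butterfly row by row. This
 * assumes llrs holds (frame_exp + 1) * frame_size floats with the channel
 * LLRs stored in the last stage, that u provides extra scratch rows beyond
 * the decoded bits (the kernels write to u + frame_size and beyond), and
 * that a hypothetical frozen[] array marks the frozen bit positions.
 *
 *   for (int row = 0; row < frame_size; ++row) {
 *       volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, 0, row, row);
 *       u[row] = frozen[row] ? 0 : (llrs[row] < 0.0f); // hard decision at stage 0
 *   }
 */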


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs,
                                                        unsigned char* u,
                                                        const int frame_exp,
                                                        const int stage,
                                                        const int u_num,
                                                        const int row)
{
    const int frame_size = 0x01 << frame_exp;
    if (row % 2) { // for odd rows just do the only necessary calculation and return.
        const float* next_llrs = llrs + frame_size + row;
        *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
        return;
    }

    const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
    if (max_stage_depth < 3) { // vectorized version needs larger vectors.
        volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
        return;
    }

    int loop_stage = max_stage_depth;
    int stage_size = 0x01 << loop_stage;

    float* src_llr_ptr;
    float* dst_llr_ptr;

    __m256 src0, src1, dst;

    if (row) { // not necessary for row 0, i.e. the first bit to be decoded.
        // First do the bit combination for all stages,
        // effectively re-encoding some already decoded bits.
        unsigned char* u_target = u + frame_size;
        unsigned char* u_temp = u + 2 * frame_size;
        memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);

        volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);

        src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
        dst_llr_ptr = llrs + max_stage_depth * frame_size + row;

        __m128i fbits;

        int p;
        for (p = 0; p < stage_size; p += 8) {
            fbits = _mm_loadu_si128((__m128i*)u_target);
            u_target += 8;

            src0 = _mm256_loadu_ps(src_llr_ptr);
            src1 = _mm256_loadu_ps(src_llr_ptr + 8);
            src_llr_ptr += 16;

            dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    const int min_stage = stage > 2 ? stage : 2;

    _mm256_zeroall(); // zero the ymm registers to avoid AVX/SSE transition penalties.

    int el;
    while (min_stage < loop_stage) {
        dst_llr_ptr = llrs + loop_stage * frame_size + row;
        src_llr_ptr = dst_llr_ptr + frame_size;
        for (el = 0; el < stage_size; el += 8) {
            src0 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;
            src1 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;

            dst = _mm256_polar_minsum_llrs(src0, src1);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    // For stages < 3 the vectors are too small; finish with the scalar version.
    llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
}

#endif /* LV_HAVE_AVX */
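
/*
 * The AVX helpers used above come from volk_avx_intrinsics.h:
 * _mm256_polar_minsum_llrs() is an 8-lane version of llr_odd(), and
 * _mm256_polar_fsign_add_llrs() an 8-lane version of llr_even() in which the
 * loaded f-bits select the sign of the first operand. Stages narrower than
 * one vector are finished by the scalar llr_odd_stages() call at the end.
 */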

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs,
                                                         unsigned char* u,
                                                         const int frame_exp,
                                                         const int stage,
                                                         const int u_num,
                                                         const int row)
{
    const int frame_size = 0x01 << frame_exp;
    if (row % 2) { // for odd rows just do the only necessary calculation and return.
        const float* next_llrs = llrs + frame_size + row;
        *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
        return;
    }

    const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
    if (max_stage_depth < 3) { // vectorized version needs larger vectors.
        volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
        return;
    }

    int loop_stage = max_stage_depth;
    int stage_size = 0x01 << loop_stage;

    float* src_llr_ptr;
    float* dst_llr_ptr;

    __m256 src0, src1, dst;

    if (row) { // not necessary for row 0, i.e. the first bit to be decoded.
        // First do the bit combination for all stages,
        // effectively re-encoding some already decoded bits.
        unsigned char* u_target = u + frame_size;
        unsigned char* u_temp = u + 2 * frame_size;
        memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);

        volk_8u_x2_encodeframepolar_8u_u_avx2(u_target, u_temp, stage_size);

        src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
        dst_llr_ptr = llrs + max_stage_depth * frame_size + row;

        __m128i fbits;

        int p;
        for (p = 0; p < stage_size; p += 8) {
            fbits = _mm_loadu_si128((__m128i*)u_target);
            u_target += 8;

            src0 = _mm256_loadu_ps(src_llr_ptr);
            src1 = _mm256_loadu_ps(src_llr_ptr + 8);
            src_llr_ptr += 16;

            dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    const int min_stage = stage > 2 ? stage : 2;

    _mm256_zeroall(); // zero the ymm registers to avoid AVX/SSE transition penalties.

    int el;
    while (min_stage < loop_stage) {
        dst_llr_ptr = llrs + loop_stage * frame_size + row;
        src_llr_ptr = dst_llr_ptr + frame_size;
        for (el = 0; el < stage_size; el += 8) {
            src0 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;
            src1 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;

            dst = _mm256_polar_minsum_llrs(src0, src1);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    // For stages < 3 the vectors are too small; finish with the scalar version.
    llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
}

#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

static inline void volk_32f_8u_polarbutterfly_32f_rvv(float* llrs,
                                                       unsigned char* u,
                                                       const int frame_exp,
                                                       const int stage,
                                                       const int u_num,
                                                       const int row)
{
    const int frame_size = 0x01 << frame_exp;
    if (row % 2) { // for odd rows just do the only necessary calculation and return.
        const float* next_llrs = llrs + frame_size + row;
        *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
        return;
    }

    const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
    if (max_stage_depth < 3) { // vectorized version needs larger vectors.
        volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
        return;
    }

    int loop_stage = max_stage_depth;
    int stage_size = 0x01 << loop_stage;

    float* src_llr_ptr;
    float* dst_llr_ptr;

    if (row) { // not necessary for row 0, i.e. the first bit to be decoded.
        // First do the bit combination for all stages,
        // effectively re-encoding some already decoded bits.
        unsigned char* u_target = u + frame_size;
        unsigned char* u_temp = u + 2 * frame_size;
        memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);

        volk_8u_x2_encodeframepolar_8u_rvv(u_target, u_temp, stage_size);

        src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
        dst_llr_ptr = llrs + max_stage_depth * frame_size + row;

        size_t n = stage_size;
        for (size_t vl; n > 0;
             n -= vl, u_target += vl, src_llr_ptr += vl * 2, dst_llr_ptr += vl) {
            vl = __riscv_vsetvl_e32m1(n);
            vint8mf4_t v = __riscv_vle8_v_i8mf4((int8_t*)u_target, vl);
            vuint64m2_t llr = __riscv_vle64_v_u64m2((const uint64_t*)src_llr_ptr, vl);
            vfloat32m1_t llr0 = __riscv_vreinterpret_f32m1(__riscv_vnsrl(llr, 0, vl));
            vfloat32m1_t llr1 = __riscv_vreinterpret_f32m1(__riscv_vnsrl(llr, 32, vl));
            llr0 = __riscv_vfneg_mu(__riscv_vmslt(v, 0, vl), llr0, llr0, vl);
            llr0 = __riscv_vfadd(llr0, llr1, vl);
            __riscv_vse32(dst_llr_ptr, llr0, vl);
        }

        --loop_stage;
        stage_size >>= 1;
    }

    const int min_stage = stage > 2 ? stage : 2;

    while (min_stage < loop_stage) {
        dst_llr_ptr = llrs + loop_stage * frame_size + row;
        src_llr_ptr = dst_llr_ptr + frame_size;

        size_t n = stage_size;
        for (size_t vl; n > 0; n -= vl, src_llr_ptr += vl * 2, dst_llr_ptr += vl) {
            vl = __riscv_vsetvl_e32m1(n);
            vuint64m2_t llr = __riscv_vle64_v_u64m2((const uint64_t*)src_llr_ptr, vl);
            vfloat32m1_t llr0 = __riscv_vreinterpret_f32m1(__riscv_vnsrl(llr, 0, vl));
            vfloat32m1_t llr1 = __riscv_vreinterpret_f32m1(__riscv_vnsrl(llr, 32, vl));
            vfloat32m1_t v =
                __riscv_vfmin(__riscv_vfabs(llr0, vl), __riscv_vfabs(llr1, vl), vl);
            v = __riscv_vfsgnjx(__riscv_vfsgnj(v, llr0, vl), llr1, vl);
            __riscv_vse32(dst_llr_ptr, v, vl);
        }

        --loop_stage;
        stage_size >>= 1;
    }

    // For stages < 3 the vectors are too small; finish with the scalar version.
    llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
}
#endif /* LV_HAVE_RVV */
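
/*
 * Implementation note: the plain RVV variant above de-interleaves the
 * (upper, lower) LLR pairs by loading them as 64-bit elements and narrowing
 * with shifts of 0 and 32, whereas the RVVSEG variant below uses segmented
 * loads (vlseg2e32) for the same purpose; both then apply the same
 * sign-select/add and min-sum updates as llr_even() and llr_odd().
 */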

#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>

static inline void volk_32f_8u_polarbutterfly_32f_rvvseg(float* llrs,
                                                          unsigned char* u,
                                                          const int frame_exp,
                                                          const int stage,
                                                          const int u_num,
                                                          const int row)
{
    const int frame_size = 0x01 << frame_exp;
    if (row % 2) { // for odd rows just do the only necessary calculation and return.
        const float* next_llrs = llrs + frame_size + row;
        *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
        return;
    }

    const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
    if (max_stage_depth < 3) { // vectorized version needs larger vectors.
        volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
        return;
    }

    int loop_stage = max_stage_depth;
    int stage_size = 0x01 << loop_stage;

    float* src_llr_ptr;
    float* dst_llr_ptr;

    if (row) { // not necessary for row 0, i.e. the first bit to be decoded.
        // First do the bit combination for all stages,
        // effectively re-encoding some already decoded bits.
        unsigned char* u_target = u + frame_size;
        unsigned char* u_temp = u + 2 * frame_size;
        memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);

        volk_8u_x2_encodeframepolar_8u_rvv(u_target, u_temp, stage_size);

        src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
        dst_llr_ptr = llrs + max_stage_depth * frame_size + row;

        size_t n = stage_size;
        for (size_t vl; n > 0;
             n -= vl, u_target += vl, src_llr_ptr += vl * 2, dst_llr_ptr += vl) {
            vl = __riscv_vsetvl_e32m1(n);
            vint8mf4_t v = __riscv_vle8_v_i8mf4((int8_t*)u_target, vl);
            vfloat32m1x2_t llr = __riscv_vlseg2e32_v_f32m1x2(src_llr_ptr, vl);
            vfloat32m1_t llr0 = __riscv_vget_f32m1(llr, 0);
            vfloat32m1_t llr1 = __riscv_vget_f32m1(llr, 1);
            llr0 = __riscv_vfneg_mu(__riscv_vmslt(v, 0, vl), llr0, llr0, vl);
            llr0 = __riscv_vfadd(llr0, llr1, vl);
            __riscv_vse32(dst_llr_ptr, llr0, vl);
        }

        --loop_stage;
        stage_size >>= 1;
    }

    const int min_stage = stage > 2 ? stage : 2;

    while (min_stage < loop_stage) {
        dst_llr_ptr = llrs + loop_stage * frame_size + row;
        src_llr_ptr = dst_llr_ptr + frame_size;

        size_t n = stage_size;
        for (size_t vl; n > 0; n -= vl, src_llr_ptr += vl * 2, dst_llr_ptr += vl) {
            vl = __riscv_vsetvl_e32m1(n);
            vfloat32m1x2_t llr = __riscv_vlseg2e32_v_f32m1x2(src_llr_ptr, vl);
            vfloat32m1_t llr0 = __riscv_vget_f32m1(llr, 0);
            vfloat32m1_t llr1 = __riscv_vget_f32m1(llr, 1);
            vfloat32m1_t v =
                __riscv_vfmin(__riscv_vfabs(llr0, vl), __riscv_vfabs(llr1, vl), vl);
            v = __riscv_vfsgnjx(__riscv_vfsgnj(v, llr0, vl), llr1, vl);
            __riscv_vse32(dst_llr_ptr, v, vl);
        }

        --loop_stage;
        stage_size >>= 1;
    }

    // For stages < 3 the vectors are too small; finish with the scalar version.
    llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
}
#endif /* LV_HAVE_RVVSEG */

#endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_ */