#ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_

#include <immintrin.h>
29 const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
30 const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
31 const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f);
32 const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f);
33 const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f);
34 const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f);
35 const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f);
37 const __m256 x_times_x = _mm256_mul_ps(x, x);
40 arctan = _mm256_mul_ps(x_times_x, arctan);
41 arctan = _mm256_add_ps(arctan, a11);
42 arctan = _mm256_mul_ps(x_times_x, arctan);
43 arctan = _mm256_add_ps(arctan, a9);
44 arctan = _mm256_mul_ps(x_times_x, arctan);
45 arctan = _mm256_add_ps(arctan, a7);
46 arctan = _mm256_mul_ps(x_times_x, arctan);
47 arctan = _mm256_add_ps(arctan, a5);
48 arctan = _mm256_mul_ps(x_times_x, arctan);
49 arctan = _mm256_add_ps(arctan, a3);
50 arctan = _mm256_mul_ps(x_times_x, arctan);
51 arctan = _mm256_add_ps(arctan, a1);
52 arctan = _mm256_mul_ps(x, arctan);
59 __m256 yl, yh, tmp1, tmp2;
60 yl = _mm256_moveldup_ps(y);
61 yh = _mm256_movehdup_ps(y);
62 tmp1 = _mm256_mul_ps(x, yl);
63 x = _mm256_shuffle_ps(x, x, 0xB1);
64 tmp2 = _mm256_mul_ps(x, yh);
67 return _mm256_addsub_ps(tmp1, tmp2);
72 const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
73 return _mm256_xor_ps(x, conjugator);
78 const __m256 nswap = _mm256_permute_ps(x, 0xb1);
79 const __m256 dreal = _mm256_moveldup_ps(y);
80 const __m256 dimag = _mm256_movehdup_ps(y);
82 const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
83 const __m256 dimagconj = _mm256_xor_ps(dimag, conjugator);
84 const __m256 multreal = _mm256_mul_ps(x, dreal);
85 const __m256 multimag = _mm256_mul_ps(nswap, dimagconj);
86 return _mm256_add_ps(multreal, multimag);
91 __m256 tmp1 = _mm256_mul_ps(val, val);
92 tmp1 = _mm256_hadd_ps(tmp1, tmp1);
93 tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0));
94 tmp1 = _mm256_sqrt_ps(tmp1);
95 return _mm256_div_ps(val, tmp1);
100 __m256 complex1, complex2;
101 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1);
102 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);
103 complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
104 complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
105 return _mm256_hadd_ps(complex1, complex2);
114 const __m256 symbols1,
115 const __m256 points0,
116 const __m256 points1,
124 const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
125 const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
127 return _mm256_mul_ps(norms, scalar);
132 __m256 sign_mask_dummy = _mm256_setzero_ps();
133 const __m128i zeros = _mm_set1_epi8(0x00);
134 const __m128i sign_extract = _mm_set1_epi8(0x80);
135 const __m128i shuffle_mask0 = _mm_setr_epi8(0xff,
151 const __m128i shuffle_mask1 = _mm_setr_epi8(0xff,
168 fbits = _mm_cmpgt_epi8(fbits, zeros);
169 fbits = _mm_and_si128(fbits, sign_extract);
170 __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0);
171 __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1);
174 _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0);
175 return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1);
186 __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20);
187 __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31);
188 *llr0 = _mm256_shuffle_ps(part0, part1, 0x88);
189 *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd);
/*
 * Polar decoder f-function (min-sum approximation) on de-interleaved LLR
 * pairs: result sign = XOR of the two signs, magnitude = min of the two
 * magnitudes.
 */
static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
{
    const __m256 sign_mask = _mm256_set1_ps(-0.0f);
    const __m256 abs_mask =
        _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    __m256 sign =
        _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
    __m256 dst =
        _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
    return _mm256_or_ps(dst, sign);
}
218 llr0 = _mm256_xor_ps(llr0, sign_mask);
219 __m256 dst = _mm256_add_ps(llr0, llr1);
224 __m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
226 aux = _mm256_mul_ps(aux, val);
227 aux = _mm256_sub_ps(aux, acc);
228 aux = _mm256_mul_ps(aux, aux);
229 aux = _mm256_mul_ps(aux, rec);
230 return _mm256_add_ps(sq_acc, aux);
static __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition volk_avx_intrinsics.h:98
static __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition volk_avx_intrinsics.h:108
static void _mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1)
Definition volk_avx_intrinsics.h:183
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition volk_avx_intrinsics.h:76
static __m256 _mm256_accumulate_square_sum_ps(__m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
Definition volk_avx_intrinsics.h:223
static __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
Definition volk_avx_intrinsics.h:57
static __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
Definition volk_avx_intrinsics.h:192
static __m256 _mm256_conjugate_ps(__m256 x)
Definition volk_avx_intrinsics.h:70
static __m256 _mm256_normalize_ps(__m256 val)
Definition volk_avx_intrinsics.h:89
static __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition volk_avx_intrinsics.h:113
static __m256 _mm256_polar_sign_mask(__m128i fbits)
Definition volk_avx_intrinsics.h:130
static __m256 _m256_arctan_poly_avx(const __m256 x)
Definition volk_avx_intrinsics.h:27
static __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
Definition volk_avx_intrinsics.h:209