#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a_H
#define INCLUDED_volk_32fc_s32f_atan2_32f_a_H

#include <inttypes.h>
#include <math.h>
#include <volk/volk_common.h>

#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector,
                                                    const lv_32fc_t* inputVector,
                                                    const float normalizeFactor,
                                                    unsigned int num_points)
{
    float* outPtr = outputVector;
    const float* inPtr = (const float*)inputVector;
    const float invNormalizeFactor = 1.f / normalizeFactor;
    unsigned int number = 0;
    for (; number < num_points; number++) {
        const float real = *inPtr++;
        const float imag = *inPtr++;
        *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
    }
}
#endif /* LV_HAVE_GENERIC */
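
/*
 * Minimal usage sketch (illustrative; sizes and values are arbitrary).
 * Application code normally calls the public dispatcher
 * volk_32fc_s32f_atan2_32f(), which selects the best kernel in this file:
 *
 * \code
 *   unsigned int N = 1024;
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, volk_get_alignment());
 *   float* out = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 *   // ... fill in[] with complex samples ...
 *   volk_32fc_s32f_atan2_32f(out, in, 1.f, N); // out[i] = atan2f(imag, real) / 1.f
 *   volk_free(in);
 *   volk_free(out);
 * \endcode
 */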
#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector,
                                                       const lv_32fc_t* inputVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    float* outPtr = outputVector;
    const float* inPtr = (const float*)inputVector;
    const float invNormalizeFactor = 1.f / normalizeFactor;
    unsigned int number = 0;
    for (; number < num_points; number++) {
        const float x = *inPtr++;
        const float y = *inPtr++;
        /* volk_atan2() is the scalar polynomial approximation from volk_common.h. */
        *outPtr++ = volk_atan2(y, x) * invNormalizeFactor;
    }
}
#endif /* LV_HAVE_GENERIC */
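
/*
 * All vectorized kernels below implement the same branch-free range
 * reduction (standard atan2 identities, restated here for reference):
 *
 *   1. If |y| > |x|, evaluate atan(x/y) instead of atan(y/x) so the
 *      polynomial argument stays within [-1, 1]; afterwards use
 *      atan(y/x) = copysign(pi/2, y/x) - atan(x/y) to undo the swap.
 *   2. The 0/0 case produces NaN; it is replaced by 0 so atan2(0, 0) = 0.
 *   3. If x is negative, add copysign(pi, y) to move the result into the
 *      correct quadrant, covering the full (-pi, pi] output range.
 */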
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector,
                                                       const lv_32fc_t* complexVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    const float* in = (const float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
    const __m256 zero = _mm256_setzero_ps();

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        /* Load 8 complex points and deinterleave into real/imag parts. */
        __m256 z1 = _mm256_load_ps(in);
        in += 8;
        __m256 z2 = _mm256_load_ps(in);
        in += 8;
        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        /* If |y| > |x|, compute x/y instead of y/x so the quotient stays in
         * [-1, 1], the domain of the polynomial approximation. */
        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        /* 0/0 yields NaN; replace it with 0 so that atan2(0, 0) = 0. */
        __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
        input = _mm256_blendv_ps(input, zero, nan_mask);
        __m256 result = _m256_arctan_poly_avx2_fma(input);

        /* Undo the swap: atan(y/x) = copysign(pi/2, y/x) - atan(x/y). */
        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        /* Quadrant correction: if the sign bit of x is set, add copysign(pi, y). */
        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));
        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_store_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (const lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
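
/*
 * For reference, one SIMD lane of the kernel above behaves like this scalar
 * model (hypothetical helper for illustration only, not part of VOLK):
 *
 * \code
 *   float lane_atan2(float y, float x, float invNormalizeFactor)
 *   {
 *       int swap = fabsf(y) > fabsf(x);          // swap_mask
 *       float q = swap ? x / y : y / x;          // blended division
 *       if (q != q)                              // nan_mask: 0/0 -> NaN
 *           q = 0.f;
 *       float r = arctan_poly(q);                // _m256_arctan_poly_avx2_fma
 *       if (swap)
 *           r = copysignf(1.5707964f, q) - r;    // undo the swap
 *       if (signbit(x))
 *           r += copysignf(3.1415927f, y);       // quadrant correction
 *       return r * invNormalizeFactor;
 *   }
 * \endcode
 */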
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector,
                                                   const lv_32fc_t* complexVector,
                                                   const float normalizeFactor,
                                                   unsigned int num_points)
{
    /* Same algorithm as the AVX2+FMA variant above, but the arctan
     * polynomial is evaluated without fused multiply-adds. */
    const float* in = (const float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
    const __m256 zero = _mm256_setzero_ps();

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_load_ps(in);
        in += 8;
        __m256 z2 = _mm256_load_ps(in);
        in += 8;
        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
        input = _mm256_blendv_ps(input, zero, nan_mask);
        __m256 result = _m256_arctan_poly_avx(input);

        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));
        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_store_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (const lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 */

#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */
#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_u_H
#define INCLUDED_volk_32fc_s32f_atan2_32f_u_H

#include <inttypes.h>
#include <math.h>
#include <volk/volk_common.h>
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector,
                                                       const lv_32fc_t* complexVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    /* Identical to the aligned AVX2+FMA variant, except for the unaligned
     * loads and stores. */
    const float* in = (const float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
    const __m256 zero = _mm256_setzero_ps();

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_loadu_ps(in);
        in += 8;
        __m256 z2 = _mm256_loadu_ps(in);
        in += 8;
        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
        input = _mm256_blendv_ps(input, zero, nan_mask);
        __m256 result = _m256_arctan_poly_avx2_fma(input);

        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));
        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_storeu_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (const lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector,
                                                   const lv_32fc_t* complexVector,
                                                   const float normalizeFactor,
                                                   unsigned int num_points)
{
    /* Identical to the aligned AVX2 variant, except for the unaligned
     * loads and stores. */
    const float* in = (const float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
    const __m256 zero = _mm256_setzero_ps();

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_loadu_ps(in);
        in += 8;
        __m256 z2 = _mm256_loadu_ps(in);
        in += 8;
        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
        input = _mm256_blendv_ps(input, zero, nan_mask);
        __m256 result = _m256_arctan_poly_avx(input);

        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));
        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_storeu_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (const lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 */
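
/*
 * Note on tail handling: each AVX2 variant consumes 8 complex points
 * (16 floats) per iteration and hands any remainder to the scalar
 * polynomial kernel. For example, num_points = 100 gives
 * eighth_points = 12, so 96 points are vectorized and the final 4 go
 * through volk_32fc_s32f_atan2_32f_polynomial().
 */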
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void volk_32fc_s32f_atan2_32f_rvv(float* outputVector,
                                                const lv_32fc_t* inputVector,
                                                const float normalizeFactor,
                                                unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m2();

    const vfloat32m2_t norm = __riscv_vfmv_v_f_f32m2(1 / normalizeFactor, vlmax);
    const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax);
    const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
    /* Minimax polynomial coefficients for atan on [-1, 1] (odd powers only). */
    const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax);
    const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax);
    const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax);
    const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax);
    const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax);
    const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax);
    const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        /* Load interleaved complex samples as 64-bit words, then split them
         * into real (low 32 bits) and imaginary (high 32 bits) halves. */
        vuint64m4_t v = __riscv_vle64_v_u64m4((const uint64_t*)inputVector, vl);
        vfloat32m2_t vr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(v, 0, vl));
        vfloat32m2_t vi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(v, 32, vl));
        /* If |imag| > |real|, divide the other way so the quotient stays in [-1, 1]. */
        vbool16_t mswap = __riscv_vmfgt(__riscv_vfabs(vi, vl), __riscv_vfabs(vr, vl), vl);
        vfloat32m2_t x = __riscv_vfdiv(
            __riscv_vmerge(vi, vr, mswap, vl), __riscv_vmerge(vr, vi, mswap, vl), vl);
        /* Replace NaN from 0/0 with 0 so that atan2(0, 0) = 0. */
        vbool16_t mnan = __riscv_vmsgtu(__riscv_vfclass(x, vl), 0xFF, vl);
        x = __riscv_vreinterpret_f32m2(
            __riscv_vmerge(__riscv_vreinterpret_u32m2(x), 0, mnan, vl));

        /* Horner evaluation of the odd polynomial: p ~= atan(x). */
        vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
        vfloat32m2_t p = c13;
        p = __riscv_vfmadd(p, xx, c11, vl);
        p = __riscv_vfmadd(p, xx, c9, vl);
        p = __riscv_vfmadd(p, xx, c7, vl);
        p = __riscv_vfmadd(p, xx, c5, vl);
        p = __riscv_vfmadd(p, xx, c3, vl);
        p = __riscv_vfmadd(p, xx, c1, vl);
        p = __riscv_vfmul(p, x, vl);

        /* Undo the swap, then apply the quadrant correction where real < 0. */
        x = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl);
        p = __riscv_vmerge(p, x, mswap, vl);
        p = __riscv_vfadd_mu(
            RISCV_VMFLTZ(32m2, vr, vl), p, p, __riscv_vfsgnjx(cpi, vi, vl), vl);

        __riscv_vse32(outputVector, __riscv_vfmul(p, norm, vl), vl);
    }
}
#endif /* LV_HAVE_RVV */
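
/*
 * The coefficients c1..c13 define a degree-13 odd polynomial approximation
 * of atan on [-1, 1], evaluated above in Horner form by the vfmadd chain:
 *
 *   atan(x) ~= x*(c1 + x2*(c3 + x2*(c5 + x2*(c7 + x2*(c9 + x2*(c11 + x2*c13))))))
 *
 * with x2 = x*x (the xx vector).
 */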
#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void volk_32fc_s32f_atan2_32f_rvvseg(float* outputVector,
                                                   const lv_32fc_t* inputVector,
                                                   const float normalizeFactor,
                                                   unsigned int num_points)
{
    /* Identical to the rvv variant above, except the complex samples are
     * deinterleaved with a segmented load instead of narrowing shifts. */
    size_t vlmax = __riscv_vsetvlmax_e32m2();

    const vfloat32m2_t norm = __riscv_vfmv_v_f_f32m2(1 / normalizeFactor, vlmax);
    const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax);
    const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
    const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax);
    const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax);
    const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax);
    const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax);
    const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax);
    const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax);
    const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        vfloat32m2x2_t v = __riscv_vlseg2e32_v_f32m2x2((const float*)inputVector, vl);
        vfloat32m2_t vr = __riscv_vget_f32m2(v, 0), vi = __riscv_vget_f32m2(v, 1);
        vbool16_t mswap = __riscv_vmfgt(__riscv_vfabs(vi, vl), __riscv_vfabs(vr, vl), vl);
        vfloat32m2_t x = __riscv_vfdiv(
            __riscv_vmerge(vi, vr, mswap, vl), __riscv_vmerge(vr, vi, mswap, vl), vl);
        vbool16_t mnan = __riscv_vmsgtu(__riscv_vfclass(x, vl), 0xFF, vl);
        x = __riscv_vreinterpret_f32m2(
            __riscv_vmerge(__riscv_vreinterpret_u32m2(x), 0, mnan, vl));

        vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
        vfloat32m2_t p = c13;
        p = __riscv_vfmadd(p, xx, c11, vl);
        p = __riscv_vfmadd(p, xx, c9, vl);
        p = __riscv_vfmadd(p, xx, c7, vl);
        p = __riscv_vfmadd(p, xx, c5, vl);
        p = __riscv_vfmadd(p, xx, c3, vl);
        p = __riscv_vfmadd(p, xx, c1, vl);
        p = __riscv_vfmul(p, x, vl);

        x = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl);
        p = __riscv_vmerge(p, x, mswap, vl);
        p = __riscv_vfadd_mu(
            RISCV_VMFLTZ(32m2, vr, vl), p, p, __riscv_vfsgnjx(cpi, vi, vl), vl);

        __riscv_vse32(outputVector, __riscv_vfmul(p, norm, vl), vl);
    }
}
#endif /* LV_HAVE_RVVSEG */

#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_u_H */