65#ifndef INCLUDED_volk_32f_asin_32f_a_H
66#define INCLUDED_volk_32f_asin_32f_a_H
68#if LV_HAVE_AVX2 && LV_HAVE_FMA
71static inline void volk_32f_asin_32f_a_avx2_fma(
float* bVector,
73 unsigned int num_points)
75 float* bPtr = bVector;
76 const float* aPtr = aVector;
78 unsigned int number = 0;
79 unsigned int eighthPoints = num_points / 8;
82 __m256 aVal, pio2, x, y, z, arcsine;
83 __m256 fzeroes, fones, ftwos, ffours, condition;
85 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
86 fzeroes = _mm256_setzero_ps();
87 fones = _mm256_set1_ps(1.0);
88 ftwos = _mm256_set1_ps(2.0);
89 ffours = _mm256_set1_ps(4.0);
91 for (; number < eighthPoints; number++) {
92 aVal = _mm256_load_ps(aPtr);
93 aVal = _mm256_div_ps(aVal,
94 _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
95 _mm256_sub_ps(fones, aVal))));
97 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
98 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
99 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
101 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
103 for (
i = 0;
i < 2;
i++) {
104 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
106 x = _mm256_div_ps(fones, x);
110 y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
113 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
114 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
116 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
118 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
119 arcsine = _mm256_sub_ps(arcsine,
120 _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
122 _mm256_store_ps(bPtr, arcsine);
127 number = eighthPoints * 8;
128 for (; number < num_points; number++) {
129 *bPtr++ = asin(*aPtr++);
137#include <immintrin.h>
142 float* bPtr = bVector;
143 const float* aPtr = aVector;
145 unsigned int number = 0;
146 unsigned int eighthPoints = num_points / 8;
149 __m256 aVal, pio2, x, y, z, arcsine;
150 __m256 fzeroes, fones, ftwos, ffours, condition;
152 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
153 fzeroes = _mm256_setzero_ps();
154 fones = _mm256_set1_ps(1.0);
155 ftwos = _mm256_set1_ps(2.0);
156 ffours = _mm256_set1_ps(4.0);
158 for (; number < eighthPoints; number++) {
159 aVal = _mm256_load_ps(aPtr);
160 aVal = _mm256_div_ps(aVal,
161 _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
162 _mm256_sub_ps(fones, aVal))));
164 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
165 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
166 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
168 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
170 for (
i = 0;
i < 2;
i++) {
172 _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
174 x = _mm256_div_ps(fones, x);
177 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
178 _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
181 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
182 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
185 y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
187 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
188 arcsine = _mm256_sub_ps(arcsine,
189 _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
191 _mm256_store_ps(bPtr, arcsine);
196 number = eighthPoints * 8;
197 for (; number < num_points; number++) {
198 *bPtr++ = asin(*aPtr++);
205#include <smmintrin.h>
208volk_32f_asin_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
210 float* bPtr = bVector;
211 const float* aPtr = aVector;
213 unsigned int number = 0;
214 unsigned int quarterPoints = num_points / 4;
217 __m128 aVal, pio2, x, y, z, arcsine;
218 __m128 fzeroes, fones, ftwos, ffours, condition;
220 pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
221 fzeroes = _mm_setzero_ps();
222 fones = _mm_set1_ps(1.0);
223 ftwos = _mm_set1_ps(2.0);
224 ffours = _mm_set1_ps(4.0);
226 for (; number < quarterPoints; number++) {
227 aVal = _mm_load_ps(aPtr);
230 _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
232 condition = _mm_cmplt_ps(z, fzeroes);
233 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
234 condition = _mm_cmplt_ps(z, fones);
235 x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
237 for (
i = 0;
i < 2;
i++) {
238 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
240 x = _mm_div_ps(fones, x);
243 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
244 _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
247 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
248 condition = _mm_cmpgt_ps(z, fones);
250 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
252 condition = _mm_cmplt_ps(aVal, fzeroes);
253 arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
255 _mm_store_ps(bPtr, arcsine);
260 number = quarterPoints * 4;
261 for (; number < num_points; number++) {
262 *bPtr++ = asinf(*aPtr++);
270#ifndef INCLUDED_volk_32f_asin_32f_u_H
271#define INCLUDED_volk_32f_asin_32f_u_H
273#if LV_HAVE_AVX2 && LV_HAVE_FMA
274#include <immintrin.h>
276static inline void volk_32f_asin_32f_u_avx2_fma(
float* bVector,
277 const float* aVector,
278 unsigned int num_points)
280 float* bPtr = bVector;
281 const float* aPtr = aVector;
283 unsigned int number = 0;
284 unsigned int eighthPoints = num_points / 8;
287 __m256 aVal, pio2, x, y, z, arcsine;
288 __m256 fzeroes, fones, ftwos, ffours, condition;
290 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
291 fzeroes = _mm256_setzero_ps();
292 fones = _mm256_set1_ps(1.0);
293 ftwos = _mm256_set1_ps(2.0);
294 ffours = _mm256_set1_ps(4.0);
296 for (; number < eighthPoints; number++) {
297 aVal = _mm256_loadu_ps(aPtr);
298 aVal = _mm256_div_ps(aVal,
299 _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
300 _mm256_sub_ps(fones, aVal))));
302 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
303 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
304 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
306 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
308 for (
i = 0;
i < 2;
i++) {
309 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
311 x = _mm256_div_ps(fones, x);
315 y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
318 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
319 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
321 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
323 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
324 arcsine = _mm256_sub_ps(arcsine,
325 _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
327 _mm256_storeu_ps(bPtr, arcsine);
332 number = eighthPoints * 8;
333 for (; number < num_points; number++) {
334 *bPtr++ = asin(*aPtr++);
342#include <immintrin.h>
347 float* bPtr = bVector;
348 const float* aPtr = aVector;
350 unsigned int number = 0;
351 unsigned int eighthPoints = num_points / 8;
354 __m256 aVal, pio2, x, y, z, arcsine;
355 __m256 fzeroes, fones, ftwos, ffours, condition;
357 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
358 fzeroes = _mm256_setzero_ps();
359 fones = _mm256_set1_ps(1.0);
360 ftwos = _mm256_set1_ps(2.0);
361 ffours = _mm256_set1_ps(4.0);
363 for (; number < eighthPoints; number++) {
364 aVal = _mm256_loadu_ps(aPtr);
365 aVal = _mm256_div_ps(aVal,
366 _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
367 _mm256_sub_ps(fones, aVal))));
369 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
370 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
371 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
373 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
375 for (
i = 0;
i < 2;
i++) {
377 _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
379 x = _mm256_div_ps(fones, x);
382 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
383 _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
386 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
387 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
390 y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
392 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
393 arcsine = _mm256_sub_ps(arcsine,
394 _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
396 _mm256_storeu_ps(bPtr, arcsine);
401 number = eighthPoints * 8;
402 for (; number < num_points; number++) {
403 *bPtr++ = asin(*aPtr++);
411#include <smmintrin.h>
414volk_32f_asin_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
416 float* bPtr = bVector;
417 const float* aPtr = aVector;
419 unsigned int number = 0;
420 unsigned int quarterPoints = num_points / 4;
423 __m128 aVal, pio2, x, y, z, arcsine;
424 __m128 fzeroes, fones, ftwos, ffours, condition;
426 pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
427 fzeroes = _mm_setzero_ps();
428 fones = _mm_set1_ps(1.0);
429 ftwos = _mm_set1_ps(2.0);
430 ffours = _mm_set1_ps(4.0);
432 for (; number < quarterPoints; number++) {
433 aVal = _mm_loadu_ps(aPtr);
436 _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
438 condition = _mm_cmplt_ps(z, fzeroes);
439 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
440 condition = _mm_cmplt_ps(z, fones);
441 x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
443 for (
i = 0;
i < 2;
i++) {
444 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
446 x = _mm_div_ps(fones, x);
449 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
450 _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
453 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
454 condition = _mm_cmpgt_ps(z, fones);
456 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
458 condition = _mm_cmplt_ps(aVal, fzeroes);
459 arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
461 _mm_storeu_ps(bPtr, arcsine);
466 number = quarterPoints * 4;
467 for (; number < num_points; number++) {
468 *bPtr++ = asinf(*aPtr++);
474#ifdef LV_HAVE_GENERIC
479 float* bPtr = bVector;
480 const float* aPtr = aVector;
481 unsigned int number = 0;
483 for (number = 0; number < num_points; number++) {
484 *bPtr++ = asinf(*aPtr++);
490#include <riscv_vector.h>
/*
 * RISC-V Vector kernel: reads num_points floats from aVector and stores one
 * result per element to bVector, using 32-bit elements at LMUL = 2 (the
 * e32m2 / f32m2 intrinsics below).
 *
 * NOTE(review): the embedded line numbers jump in several places (495, 502-503,
 * 505, 507, 509-510, 515, 523-524, 526, 528, 531, 535, 540, 542, 546-547, 549),
 * so braces, preprocessor conditionals and at least two declaration heads
 * appear to be missing from this listing.
 */
494volk_32f_asin_32f_rvv(
float* bVector,
const float* aVector,
unsigned int num_points)
/* Broadcast constants are built once at the maximum vector length. */
496 size_t vlmax = __riscv_vsetvlmax_e32m2();
498 const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
499 const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
500 const vfloat32m2_t cf2 = __riscv_vfmv_v_f_f32m2(2.0f, vlmax);
501 const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);
/* Series coefficients -1/3, 1/5, -1/7; presumably each is guarded by an
 * ASIN_TERMS conditional that is not visible here -- TODO confirm. */
504 const vfloat32m2_t cfm1o3 = __riscv_vfmv_v_f_f32m2(-1 / 3.0f, vlmax);
506 const vfloat32m2_t cf1o5 = __riscv_vfmv_v_f_f32m2(1 / 5.0f, vlmax);
508 const vfloat32m2_t cfm1o7 = __riscv_vfmv_v_f_f32m2(-1 / 7.0f, vlmax);
/* Each pass handles vl elements, then advances both pointers and shrinks n
 * by vl. */
511 size_t n = num_points;
512 for (
size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
513 vl = __riscv_vsetvl_e32m2(n);
514 vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
/* The head of this statement (presumably the declaration of `a`, which is used
 * below) is not visible.
 * NOTE(review): per the RVV specification vfmsac yields (v * v) - cf1, and the
 * quotient is written as sqrt(...) / v, whereas the x86 kernels in this file
 * compute a / sqrt((1 + a) * (1 - a)). Confirm the operand order and
 * vfmsac-vs-vfnmsac against a reference asin(). */
516 __riscv_vfdiv(__riscv_vfsqrt(__riscv_vfmsac(cf1, v, v, vl), vl), v, vl);
/* z = |a| */
517 vfloat32m2_t z = __riscv_vfabs(a, vl);
/* Masked divide: lanes with z < 1 get cf1 / z, the others keep z. */
518 vfloat32m2_t x = __riscv_vfdiv_mu(__riscv_vmflt(z, cf1, vl), z, cf1, z, vl);
/* Two steps of x <- x + sqrt(x * x + 1), matching the two-iteration loop of the
 * x86 kernels. */
519 x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl);
520 x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl);
/* x <- 1 / x, then xx = x * x feeds the series. */
521 x = __riscv_vfdiv(cf1, x, vl);
522 vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
/* The four groups below are alternative Horner evaluations of the series using
 * one, two, three and four coefficients. Presumably exactly one is selected by
 * #if / #elif on ASIN_TERMS (the directives are not visible) -- TODO confirm. */
525 vfloat32m2_t y = __riscv_vfmv_v_f_f32m2(0, vl);
527 y = __riscv_vfmadd(y, xx, cf1, vl);
529 vfloat32m2_t y = cfm1o3;
530 y = __riscv_vfmadd(y, xx, cf1, vl);
532 vfloat32m2_t y = cf1o5;
533 y = __riscv_vfmadd(y, xx, cfm1o3, vl);
534 y = __riscv_vfmadd(y, xx, cf1, vl);
536 vfloat32m2_t y = cfm1o7;
537 y = __riscv_vfmadd(y, xx, cf1o5, vl);
538 y = __riscv_vfmadd(y, xx, cfm1o3, vl);
539 y = __riscv_vfmadd(y, xx, cf1, vl);
/* Build-time failure for term counts without a branch above. */
541#error "ASIN_TERMS > 4 not supported by volk_32f_asin_32f_rvv"
/* y <- y * (x * 4) */
543 y = __riscv_vfmul(y, __riscv_vfmul(x, cf4, vl), vl);
/* Lanes with z > 1: y <- y + (cpio2 - y * cf2), i.e. pi/2 - y. */
544 y = __riscv_vfadd_mu(
545 __riscv_vmfgt(z, cf1, vl), y, y, __riscv_vfnmsub(y, cf2, cpio2, vl), vl);
/* Negate y in the lanes selected by RISCV_VMFLTZ(32m2, a, vl) -- presumably
 * a < 0; the macro lives in volk_rvv_intrinsics.h. The declaration head of
 * `asine` is not visible. */
548 asine = __riscv_vfneg_mu(
RISCV_VMFLTZ(32m2, a, vl), y, y, vl);
550 __riscv_vse32(bVector, asine, vl);
static void volk_32f_asin_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_asin_32f.h:477
#define ASIN_TERMS
Definition volk_32f_asin_32f.h:63
static void volk_32f_asin_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_asin_32f.h:345
static void volk_32f_asin_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_asin_32f.h:140
for i
Definition volk_config_fixed.tmpl.h:13
#define RISCV_VMFLTZ(T, v, vl)
Definition volk_rvv_intrinsics.h:75