#ifndef INCLUDED_volk_32f_acos_32f_a_H
#define INCLUDED_volk_32f_acos_32f_a_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm256_set1_ps(3.14159265358979323846);
    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);
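
    /* acos(a) is evaluated as atan(sqrt(1 - a*a) / a): the atan argument is
     * range-reduced twice via x <- x + sqrt(1 + x*x), its reciprocal feeds a
     * short polynomial in x*x (ACOS_TERMS alternating terms), the result is
     * scaled back by 4*x, and the remaining steps undo the reduction and fix
     * sign and quadrant with masked corrections. */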
    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        d = aVal;
        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))),
                             aVal);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arccosine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_sub_ps(
            arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));

        _mm256_store_ps(bPtr, arccosine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = acos(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm256_set1_ps(3.14159265358979323846);
    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);
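
    /* Same algorithm as the FMA variant above, written with separate
     * multiply/add intrinsics for AVX-only targets. */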
    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        d = aVal;
        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))),
                             aVal);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(
                x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arccosine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_sub_ps(
            arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));

        _mm256_store_ps(bPtr, arccosine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = acos(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, d, pi, pio2, x, y, z, arccosine;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm_set1_ps(3.14159265358979323846);
    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);
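
    /* 4-wide SSE variant of the same approximation, processing four floats
     * per iteration. */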
    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        d = aVal;
        aVal = _mm_div_ps(
            _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
            aVal);
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arccosine = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arccosine =
            _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
        condition = _mm_cmplt_ps(d, fzeroes);
        arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));

        _mm_store_ps(bPtr, arccosine);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}
#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_acos_32f_a_H */

#ifndef INCLUDED_volk_32f_acos_32f_u_H
#define INCLUDED_volk_32f_acos_32f_u_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm256_set1_ps(3.14159265358979323846);
    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);
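
    /* Unaligned-load counterpart of volk_32f_acos_32f_a_avx2_fma above. */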
    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        d = aVal;
        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))),
                             aVal);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arccosine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_sub_ps(
            arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));

        _mm256_storeu_ps(bPtr, arccosine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = acos(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm256_set1_ps(3.14159265358979323846);
    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        d = aVal;
        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))),
                             aVal);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(
                x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arccosine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_sub_ps(
            arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));

        _mm256_storeu_ps(bPtr, arccosine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = acos(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX for unaligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, d, pi, pio2, x, y, z, arccosine;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm_set1_ps(3.14159265358979323846);
    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        d = aVal;
        aVal = _mm_div_ps(
            _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
            aVal);
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arccosine = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arccosine =
            _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
        condition = _mm_cmplt_ps(d, fzeroes);
        arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));

        _mm_storeu_ps(bPtr, arccosine);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}
#endif /* LV_HAVE_SSE4_1 for unaligned */

#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void
volk_32f_acos_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m2();

    const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax);
    const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
    const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
    const vfloat32m2_t cf2 = __riscv_vfmv_v_f_f32m2(2.0f, vlmax);
    const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);
#if ACOS_TERMS >= 2
    const vfloat32m2_t cfm1o3 = __riscv_vfmv_v_f_f32m2(-1 / 3.0f, vlmax);
#endif
#if ACOS_TERMS >= 3
    const vfloat32m2_t cf1o5 = __riscv_vfmv_v_f_f32m2(1 / 5.0f, vlmax);
#endif
#if ACOS_TERMS >= 4
    const vfloat32m2_t cfm1o7 = __riscv_vfmv_v_f_f32m2(-1 / 7.0f, vlmax);
#endif
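
    /* Strip-mined loop: vsetvl handles the tail elements, and masked (_mu)
     * intrinsics take the place of the compare-and-mask fix-ups used by the
     * x86 kernels. */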
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
        vfloat32m2_t a =
            __riscv_vfdiv(__riscv_vfsqrt(__riscv_vfnmsac(cf1, v, v, vl), vl), v, vl);
        vfloat32m2_t z = __riscv_vfabs(a, vl);
        vfloat32m2_t x = __riscv_vfdiv_mu(__riscv_vmflt(z, cf1, vl), z, cf1, z, vl);
        x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl);
        x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl);
        x = __riscv_vfdiv(cf1, x, vl);
        vfloat32m2_t xx = __riscv_vfmul(x, x, vl);

#if ACOS_TERMS == 1
        vfloat32m2_t y = __riscv_vfmv_v_f_f32m2(0, vl);
        y = __riscv_vfmadd(y, xx, cf1, vl);
#elif ACOS_TERMS == 2
        vfloat32m2_t y = cfm1o3;
        y = __riscv_vfmadd(y, xx, cf1, vl);
#elif ACOS_TERMS == 3
        vfloat32m2_t y = cf1o5;
        y = __riscv_vfmadd(y, xx, cfm1o3, vl);
        y = __riscv_vfmadd(y, xx, cf1, vl);
#elif ACOS_TERMS == 4
        vfloat32m2_t y = cfm1o7;
        y = __riscv_vfmadd(y, xx, cf1o5, vl);
        y = __riscv_vfmadd(y, xx, cfm1o3, vl);
        y = __riscv_vfmadd(y, xx, cf1, vl);
#else
#error "ACOS_TERMS > 4 not supported by volk_32f_acos_32f_rvv"
#endif
        y = __riscv_vfmul(y, __riscv_vfmul(x, cf4, vl), vl);
        y = __riscv_vfadd_mu(
            __riscv_vmfgt(z, cf1, vl), y, y, __riscv_vfnmsub(y, cf2, cpio2, vl), vl);

        vfloat32m2_t acosine;
        acosine = __riscv_vfneg_mu(RISCV_VMFLTZ(32m2, a, vl), y, y, vl);
        acosine = __riscv_vfadd_mu(RISCV_VMFLTZ(32m2, v, vl), acosine, acosine, cpi, vl);

        __riscv_vse32(bVector, acosine, vl);
    }
}
#endif /* LV_HAVE_RVV */

#endif /* INCLUDED_volk_32f_acos_32f_u_H */