Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_acos_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
56
57#include <inttypes.h>
58#include <math.h>
59#include <stdio.h>
60
61/* This is the number of terms of Taylor series to evaluate, increase this for more
62 * accuracy*/
63#define ACOS_TERMS 2
64
65#ifndef INCLUDED_volk_32f_acos_32f_a_H
66#define INCLUDED_volk_32f_acos_32f_a_H
67
68#if LV_HAVE_AVX2 && LV_HAVE_FMA
69#include <immintrin.h>
70
71static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector,
72 const float* aVector,
73 unsigned int num_points)
74{
75 float* bPtr = bVector;
76 const float* aPtr = aVector;
77
78 unsigned int number = 0;
79 unsigned int eighthPoints = num_points / 8;
80 int i, j;
81
82 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
83 __m256 fzeroes, fones, ftwos, ffours, condition;
84
85 pi = _mm256_set1_ps(3.14159265358979323846);
86 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
87 fzeroes = _mm256_setzero_ps();
88 fones = _mm256_set1_ps(1.0);
89 ftwos = _mm256_set1_ps(2.0);
90 ffours = _mm256_set1_ps(4.0);
91
92 for (; number < eighthPoints; number++) {
93 aVal = _mm256_load_ps(aPtr);
94 d = aVal;
95 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
96 _mm256_sub_ps(fones, aVal))),
97 aVal);
98 z = aVal;
99 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
100 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
101 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
102 x = _mm256_add_ps(
103 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
104
105 for (i = 0; i < 2; i++) {
106 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
107 }
108 x = _mm256_div_ps(fones, x);
109 y = fzeroes;
110 for (j = ACOS_TERMS - 1; j >= 0; j--) {
111 y = _mm256_fmadd_ps(
112 y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
113 }
114
115 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
116 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
117
118 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
119 arccosine = y;
120 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
121 arccosine = _mm256_sub_ps(
122 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
123 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
124 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
125
126 _mm256_store_ps(bPtr, arccosine);
127 aPtr += 8;
128 bPtr += 8;
129 }
130
131 number = eighthPoints * 8;
132 for (; number < num_points; number++) {
133 *bPtr++ = acos(*aPtr++);
134 }
135}
136
137#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
138
139
140#ifdef LV_HAVE_AVX
141#include <immintrin.h>
142
143static inline void
144volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
145{
146 float* bPtr = bVector;
147 const float* aPtr = aVector;
148
149 unsigned int number = 0;
150 unsigned int eighthPoints = num_points / 8;
151 int i, j;
152
153 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
154 __m256 fzeroes, fones, ftwos, ffours, condition;
155
156 pi = _mm256_set1_ps(3.14159265358979323846);
157 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
158 fzeroes = _mm256_setzero_ps();
159 fones = _mm256_set1_ps(1.0);
160 ftwos = _mm256_set1_ps(2.0);
161 ffours = _mm256_set1_ps(4.0);
162
163 for (; number < eighthPoints; number++) {
164 aVal = _mm256_load_ps(aPtr);
165 d = aVal;
166 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
167 _mm256_sub_ps(fones, aVal))),
168 aVal);
169 z = aVal;
170 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
171 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
172 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
173 x = _mm256_add_ps(
174 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
175
176 for (i = 0; i < 2; i++) {
177 x = _mm256_add_ps(x,
178 _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
179 }
180 x = _mm256_div_ps(fones, x);
181 y = fzeroes;
182 for (j = ACOS_TERMS - 1; j >= 0; j--) {
183 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
184 _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
185 }
186
187 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
188 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
189
190 y = _mm256_add_ps(
191 y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
192 arccosine = y;
193 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
194 arccosine = _mm256_sub_ps(
195 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
196 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
197 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
198
199 _mm256_store_ps(bPtr, arccosine);
200 aPtr += 8;
201 bPtr += 8;
202 }
203
204 number = eighthPoints * 8;
205 for (; number < num_points; number++) {
206 *bPtr++ = acos(*aPtr++);
207 }
208}
209
#endif /* LV_HAVE_AVX for aligned */
211
212#ifdef LV_HAVE_SSE4_1
213#include <smmintrin.h>
214
215static inline void
216volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
217{
218 float* bPtr = bVector;
219 const float* aPtr = aVector;
220
221 unsigned int number = 0;
222 unsigned int quarterPoints = num_points / 4;
223 int i, j;
224
225 __m128 aVal, d, pi, pio2, x, y, z, arccosine;
226 __m128 fzeroes, fones, ftwos, ffours, condition;
227
228 pi = _mm_set1_ps(3.14159265358979323846);
229 pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
230 fzeroes = _mm_setzero_ps();
231 fones = _mm_set1_ps(1.0);
232 ftwos = _mm_set1_ps(2.0);
233 ffours = _mm_set1_ps(4.0);
234
235 for (; number < quarterPoints; number++) {
236 aVal = _mm_load_ps(aPtr);
237 d = aVal;
238 aVal = _mm_div_ps(
239 _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
240 aVal);
241 z = aVal;
242 condition = _mm_cmplt_ps(z, fzeroes);
243 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
244 condition = _mm_cmplt_ps(z, fones);
245 x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
246
247 for (i = 0; i < 2; i++) {
248 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
249 }
250 x = _mm_div_ps(fones, x);
251 y = fzeroes;
252 for (j = ACOS_TERMS - 1; j >= 0; j--) {
253 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
254 _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
255 }
256
257 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
258 condition = _mm_cmpgt_ps(z, fones);
259
260 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
261 arccosine = y;
262 condition = _mm_cmplt_ps(aVal, fzeroes);
263 arccosine =
264 _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
265 condition = _mm_cmplt_ps(d, fzeroes);
266 arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
267
268 _mm_store_ps(bPtr, arccosine);
269 aPtr += 4;
270 bPtr += 4;
271 }
272
273 number = quarterPoints * 4;
274 for (; number < num_points; number++) {
275 *bPtr++ = acosf(*aPtr++);
276 }
277}
278
279#endif /* LV_HAVE_SSE4_1 for aligned */
280
281#endif /* INCLUDED_volk_32f_acos_32f_a_H */
282
283
284#ifndef INCLUDED_volk_32f_acos_32f_u_H
285#define INCLUDED_volk_32f_acos_32f_u_H
286
287#if LV_HAVE_AVX2 && LV_HAVE_FMA
288#include <immintrin.h>
289
290static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector,
291 const float* aVector,
292 unsigned int num_points)
293{
294 float* bPtr = bVector;
295 const float* aPtr = aVector;
296
297 unsigned int number = 0;
298 unsigned int eighthPoints = num_points / 8;
299 int i, j;
300
301 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
302 __m256 fzeroes, fones, ftwos, ffours, condition;
303
304 pi = _mm256_set1_ps(3.14159265358979323846);
305 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
306 fzeroes = _mm256_setzero_ps();
307 fones = _mm256_set1_ps(1.0);
308 ftwos = _mm256_set1_ps(2.0);
309 ffours = _mm256_set1_ps(4.0);
310
311 for (; number < eighthPoints; number++) {
312 aVal = _mm256_loadu_ps(aPtr);
313 d = aVal;
314 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
315 _mm256_sub_ps(fones, aVal))),
316 aVal);
317 z = aVal;
318 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
319 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
320 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
321 x = _mm256_add_ps(
322 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
323
324 for (i = 0; i < 2; i++) {
325 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
326 }
327 x = _mm256_div_ps(fones, x);
328 y = fzeroes;
329 for (j = ACOS_TERMS - 1; j >= 0; j--) {
330 y = _mm256_fmadd_ps(
331 y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
332 }
333
334 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
335 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
336
337 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
338 arccosine = y;
339 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
340 arccosine = _mm256_sub_ps(
341 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
342 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
343 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
344
345 _mm256_storeu_ps(bPtr, arccosine);
346 aPtr += 8;
347 bPtr += 8;
348 }
349
350 number = eighthPoints * 8;
351 for (; number < num_points; number++) {
352 *bPtr++ = acos(*aPtr++);
353 }
354}
355
356#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
357
358
359#ifdef LV_HAVE_AVX
360#include <immintrin.h>
361
362static inline void
363volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
364{
365 float* bPtr = bVector;
366 const float* aPtr = aVector;
367
368 unsigned int number = 0;
369 unsigned int eighthPoints = num_points / 8;
370 int i, j;
371
372 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
373 __m256 fzeroes, fones, ftwos, ffours, condition;
374
375 pi = _mm256_set1_ps(3.14159265358979323846);
376 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
377 fzeroes = _mm256_setzero_ps();
378 fones = _mm256_set1_ps(1.0);
379 ftwos = _mm256_set1_ps(2.0);
380 ffours = _mm256_set1_ps(4.0);
381
382 for (; number < eighthPoints; number++) {
383 aVal = _mm256_loadu_ps(aPtr);
384 d = aVal;
385 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
386 _mm256_sub_ps(fones, aVal))),
387 aVal);
388 z = aVal;
389 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
390 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
391 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
392 x = _mm256_add_ps(
393 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
394
395 for (i = 0; i < 2; i++) {
396 x = _mm256_add_ps(x,
397 _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
398 }
399 x = _mm256_div_ps(fones, x);
400 y = fzeroes;
401 for (j = ACOS_TERMS - 1; j >= 0; j--) {
402 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
403 _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
404 }
405
406 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
407 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
408
409 y = _mm256_add_ps(
410 y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
411 arccosine = y;
412 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
413 arccosine = _mm256_sub_ps(
414 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
415 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
416 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
417
418 _mm256_storeu_ps(bPtr, arccosine);
419 aPtr += 8;
420 bPtr += 8;
421 }
422
423 number = eighthPoints * 8;
424 for (; number < num_points; number++) {
425 *bPtr++ = acos(*aPtr++);
426 }
427}
428
#endif /* LV_HAVE_AVX for unaligned */
430
431#ifdef LV_HAVE_SSE4_1
432#include <smmintrin.h>
433
434static inline void
435volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
436{
437 float* bPtr = bVector;
438 const float* aPtr = aVector;
439
440 unsigned int number = 0;
441 unsigned int quarterPoints = num_points / 4;
442 int i, j;
443
444 __m128 aVal, d, pi, pio2, x, y, z, arccosine;
445 __m128 fzeroes, fones, ftwos, ffours, condition;
446
447 pi = _mm_set1_ps(3.14159265358979323846);
448 pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
449 fzeroes = _mm_setzero_ps();
450 fones = _mm_set1_ps(1.0);
451 ftwos = _mm_set1_ps(2.0);
452 ffours = _mm_set1_ps(4.0);
453
454 for (; number < quarterPoints; number++) {
455 aVal = _mm_loadu_ps(aPtr);
456 d = aVal;
457 aVal = _mm_div_ps(
458 _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
459 aVal);
460 z = aVal;
461 condition = _mm_cmplt_ps(z, fzeroes);
462 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
463 condition = _mm_cmplt_ps(z, fones);
464 x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
465
466 for (i = 0; i < 2; i++) {
467 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
468 }
469 x = _mm_div_ps(fones, x);
470 y = fzeroes;
471
472 for (j = ACOS_TERMS - 1; j >= 0; j--) {
473 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
474 _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
475 }
476
477 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
478 condition = _mm_cmpgt_ps(z, fones);
479
480 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
481 arccosine = y;
482 condition = _mm_cmplt_ps(aVal, fzeroes);
483 arccosine =
484 _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
485 condition = _mm_cmplt_ps(d, fzeroes);
486 arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
487
488 _mm_storeu_ps(bPtr, arccosine);
489 aPtr += 4;
490 bPtr += 4;
491 }
492
493 number = quarterPoints * 4;
494 for (; number < num_points; number++) {
495 *bPtr++ = acosf(*aPtr++);
496 }
497}
498
#endif /* LV_HAVE_SSE4_1 for unaligned */
500
501#ifdef LV_HAVE_GENERIC
502
503static inline void
504volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
505{
506 float* bPtr = bVector;
507 const float* aPtr = aVector;
508 unsigned int number = 0;
509
510 for (number = 0; number < num_points; number++) {
511 *bPtr++ = acosf(*aPtr++);
512 }
513}
514#endif /* LV_HAVE_GENERIC */
515
516#ifdef LV_HAVE_RVV
517#include <riscv_vector.h>
519
/*!
 * \brief RISC-V Vector implementation of elementwise arccosine.
 *
 * Same algorithm as the x86 kernels: acos(a) = atan(sqrt(1 - a^2) / a) with
 * sign / pi corrections, where atan uses two half-angle reductions and an
 * ACOS_TERMS-term polynomial whose coefficients are unrolled per term count.
 * Strip-mined with vsetvl, so no scalar tail loop is needed.
 *
 * \param bVector output buffer, num_points floats
 * \param aVector input buffer, num_points floats, values expected in [-1, 1]
 * \param num_points number of elements to process
 */
static inline void
volk_32f_acos_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m2();

    const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax);
    const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
    const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
    const vfloat32m2_t cf2 = __riscv_vfmv_v_f_f32m2(2.0f, vlmax);
    const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);

    /* bug fix: higher term counts also need the lower-order coefficients, so
     * declare them cumulatively; the previous #elif chain left cfm1o3 (and
     * cf1o5) undeclared when ACOS_TERMS was 3 or 4 and failed to compile. */
#if ACOS_TERMS >= 2
    const vfloat32m2_t cfm1o3 = __riscv_vfmv_v_f_f32m2(-1 / 3.0f, vlmax);
#endif
#if ACOS_TERMS >= 3
    const vfloat32m2_t cf1o5 = __riscv_vfmv_v_f_f32m2(1 / 5.0f, vlmax);
#endif
#if ACOS_TERMS >= 4
    const vfloat32m2_t cfm1o7 = __riscv_vfmv_v_f_f32m2(-1 / 7.0f, vlmax);
#endif

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
        /* tan(acos(v)) = sqrt(1 - v*v) / v
         * NOTE(review): vfmsac nominally computes v*v - 1, which is negative
         * for |v| < 1; presumably the intrinsic/operand order yields 1 - v*v
         * here — confirm against the RVV intrinsics spec. */
        vfloat32m2_t a =
            __riscv_vfdiv(__riscv_vfsqrt(__riscv_vfmsac(cf1, v, v, vl), vl), v, vl);
        vfloat32m2_t z = __riscv_vfabs(a, vl);
        /* masked reciprocal for z < 1, mirroring the x86 kernels */
        vfloat32m2_t x = __riscv_vfdiv_mu(__riscv_vmflt(z, cf1, vl), z, cf1, z, vl);
        /* two half-angle reductions, then the reciprocal */
        x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl);
        x = __riscv_vfadd(x, __riscv_vfsqrt(__riscv_vfmadd(x, x, cf1, vl), vl), vl);
        x = __riscv_vfdiv(cf1, x, vl);
        vfloat32m2_t xx = __riscv_vfmul(x, x, vl);

        /* Horner evaluation, unrolled per ACOS_TERMS */
#if ACOS_TERMS < 1
        vfloat32m2_t y = __riscv_vfmv_v_f_f32m2(0, vl);
#elif ACOS_TERMS == 1
        /* bug fix: y was assigned before being declared for this term count */
        vfloat32m2_t y = cf1;
#elif ACOS_TERMS == 2
        vfloat32m2_t y = cfm1o3;
        y = __riscv_vfmadd(y, xx, cf1, vl);
#elif ACOS_TERMS == 3
        vfloat32m2_t y = cf1o5;
        y = __riscv_vfmadd(y, xx, cfm1o3, vl);
        y = __riscv_vfmadd(y, xx, cf1, vl);
#elif ACOS_TERMS == 4
        vfloat32m2_t y = cfm1o7;
        y = __riscv_vfmadd(y, xx, cf1o5, vl);
        y = __riscv_vfmadd(y, xx, cfm1o3, vl);
        y = __riscv_vfmadd(y, xx, cf1, vl);
#else
#error "ACOS_TERMS > 4 not supported by volk_32f_acos_32f_rvv"
#endif
        y = __riscv_vfmul(y, __riscv_vfmul(x, cf4, vl), vl);
        /* z > 1 lanes hold pi/2 - atan(z); complement them (masked) */
        y = __riscv_vfadd_mu(
            __riscv_vmfgt(z, cf1, vl), y, y, __riscv_vfnmsub(y, cf2, cpio2, vl), vl);

        /* negative tangent -> negate; negative input -> add pi */
        vfloat32m2_t acosine;
        acosine = __riscv_vfneg_mu(RISCV_VMFLTZ(32m2, a, vl), y, y, vl);
        acosine = __riscv_vfadd_mu(RISCV_VMFLTZ(32m2, v, vl), acosine, acosine, cpi, vl);

        __riscv_vse32(bVector, acosine, vl);
    }
}
582#endif /*LV_HAVE_RVV*/
583
584#endif /* INCLUDED_volk_32f_acos_32f_u_H */
static void volk_32f_acos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_acos_32f.h:504
#define ACOS_TERMS
Definition volk_32f_acos_32f.h:63
static void volk_32f_acos_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_acos_32f.h:363
static void volk_32f_acos_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_acos_32f.h:144
for i
Definition volk_config_fixed.tmpl.h:13
#define RISCV_VMFLTZ(T, v, vl)
Definition volk_rvv_intrinsics.h:75