65#ifndef INCLUDED_volk_32f_tan_32f_a_H
66#define INCLUDED_volk_32f_tan_32f_a_H
68#if LV_HAVE_AVX2 && LV_HAVE_FMA
/* volk_32f_tan_32f_a_avx2_fma: bVector[i] = tan(aVector[i]) for num_points
 * floats, processed 8 at a time with aligned AVX2 loads/stores and FMA.
 * Algorithm: reduce the argument into an octant via m4pi = 4/pi with the
 * two-part constant pio4A + pio4B, evaluate a polynomial for sin/cos on the
 * reduced range, apply quadrant-based swap/sign fix-ups, then
 * tangent = sine / cosine.  Remaining (num_points % 8) elements fall back
 * to the scalar libm tail.
 * NOTE(review): this chunk looks like an extraction artifact of
 * volk_32f_tan_32f.h -- the return type, braces, and several statements are
 * missing (the embedded original line numbers skip).  Code is kept
 * byte-identical below; only comments were added. */
72volk_32f_tan_32f_a_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
74 float* bPtr = bVector;
75 const float* aPtr = aVector;
77 unsigned int number = 0;
78 unsigned int eighthPoints = num_points / 8;
81 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
83 __m256 sine, cosine, tangent, condition1, condition2, condition3;
84 __m256i q, r, ones, twos, fours;
/* Range-reduction constants: 4/pi, and pi/4 split into a coarse part (pio4A)
 * plus a small correction (pio4B) for extra precision. */
86 m4pi = _mm256_set1_ps(1.273239545);
87 pio4A = _mm256_set1_ps(0.78515625);
88 pio4B = _mm256_set1_ps(0.241876e-3);
89 ffours = _mm256_set1_ps(4.0);
90 ftwos = _mm256_set1_ps(2.0);
91 fones = _mm256_set1_ps(1.0);
92 fzeroes = _mm256_setzero_ps();
93 ones = _mm256_set1_epi32(1);
94 twos = _mm256_set1_epi32(2);
95 fours = _mm256_set1_epi32(4);
/* Polynomial coefficients for the sine/cosine approximation on the reduced
 * argument. */
97 cp1 = _mm256_set1_ps(1.0);
98 cp2 = _mm256_set1_ps(0.83333333e-1);
99 cp3 = _mm256_set1_ps(0.2777778e-2);
100 cp4 = _mm256_set1_ps(0.49603e-4);
101 cp5 = _mm256_set1_ps(0.551e-6);
/* Main vectorized loop: 8 floats per iteration. */
103 for (; number < eighthPoints; number++) {
104 aVal = _mm256_load_ps(aPtr);
/* s = |aVal|: branchless absolute value (subtract 2*aVal where aVal < 0). */
105 s = _mm256_sub_ps(aVal,
106 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
107 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
/* q = floor(s * 4/pi) is the octant index; r = q rounded up to even, used
 * to bring s back to [-pi/4, pi/4]. */
108 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
109 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
/* s -= r * (pio4A + pio4B), done in two FMA steps for precision. */
111 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
112 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
/* NOTE(review): the statement that scaled s by 1/8 before squaring was
 * partially lost in extraction; only this operand survives. */
116 _mm256_set1_ps(8.0));
117 s = _mm256_mul_ps(s, s);
/* Horner-style polynomial evaluation (partially lost line). */
122 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
/* Three double-angle steps undo the earlier 1/8 scaling of the argument. */
127 for (
i = 0;
i < 3;
i++) {
128 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
130 s = _mm256_div_ps(s, ftwos);
/* Recover sine and cosine of the reduced argument from s. */
132 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
133 cosine = _mm256_sub_ps(fones, s);
/* Quadrant masks: condition1 selects sin/cos swap; condition2/condition3
 * select sign flips for sine and cosine respectively. */
135 condition1 = _mm256_cmp_ps(
136 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
139 condition2 = _mm256_cmp_ps(
141 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
142 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
144 condition3 = _mm256_cmp_ps(
145 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
/* Branchless swap of sine/cosine under condition1, then sign flips. */
149 __m256 temp = cosine;
151 _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
152 sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
153 sine = _mm256_sub_ps(
154 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
155 cosine = _mm256_sub_ps(
157 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
158 tangent = _mm256_div_ps(sine, cosine);
159 _mm256_store_ps(bPtr, tangent);
/* Scalar tail for the leftover elements. */
164 number = eighthPoints * 8;
165 for (; number < num_points; number++) {
166 *bPtr++ = tan(*aPtr++);
173#include <immintrin.h>
/* volk_32f_tan_32f_a_avx2: tan() over num_points floats, 8 per iteration,
 * aligned AVX2 path without FMA (the reduction uses explicit mul+sub instead
 * of fnmadd).  Same reduce-to-octant / polynomial / quadrant-fixup scheme as
 * the other SIMD variants in this file; scalar libm tail for the remainder.
 * NOTE(review): extraction artifact -- return type, braces, and several
 * statements are missing.  Code kept byte-identical; comments only. */
176volk_32f_tan_32f_a_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
178 float* bPtr = bVector;
179 const float* aPtr = aVector;
181 unsigned int number = 0;
182 unsigned int eighthPoints = num_points / 8;
185 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
187 __m256 sine, cosine, tangent, condition1, condition2, condition3;
188 __m256i q, r, ones, twos, fours;
/* 4/pi and the two-part pi/4 used for extended-precision range reduction. */
190 m4pi = _mm256_set1_ps(1.273239545);
191 pio4A = _mm256_set1_ps(0.78515625);
192 pio4B = _mm256_set1_ps(0.241876e-3);
193 ffours = _mm256_set1_ps(4.0);
194 ftwos = _mm256_set1_ps(2.0);
195 fones = _mm256_set1_ps(1.0);
196 fzeroes = _mm256_setzero_ps();
197 ones = _mm256_set1_epi32(1);
198 twos = _mm256_set1_epi32(2);
199 fours = _mm256_set1_epi32(4);
/* Sine/cosine polynomial coefficients. */
201 cp1 = _mm256_set1_ps(1.0);
202 cp2 = _mm256_set1_ps(0.83333333e-1);
203 cp3 = _mm256_set1_ps(0.2777778e-2);
204 cp4 = _mm256_set1_ps(0.49603e-4);
205 cp5 = _mm256_set1_ps(0.551e-6);
207 for (; number < eighthPoints; number++) {
208 aVal = _mm256_load_ps(aPtr);
/* Branchless |aVal|. */
209 s = _mm256_sub_ps(aVal,
210 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
211 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
/* Octant index q; r rounds q to even. */
212 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
213 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
/* Two-step reduction: s -= r*pio4A; s -= r*pio4B (no FMA on this path). */
215 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
216 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
/* NOTE(review): scaling of s by 1/8 partially lost in extraction. */
220 _mm256_set1_ps(8.0));
221 s = _mm256_mul_ps(s, s);
/* Polynomial evaluation (surviving fragment of the Horner chain). */
229 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
/* Three double-angle steps undo the 1/8 scaling. */
238 for (
i = 0;
i < 3;
i++) {
239 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
241 s = _mm256_div_ps(s, ftwos);
243 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
244 cosine = _mm256_sub_ps(fones, s);
/* Quadrant masks for swap (condition1) and sign flips (condition2/3). */
246 condition1 = _mm256_cmp_ps(
247 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
250 condition2 = _mm256_cmp_ps(
252 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
253 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
255 condition3 = _mm256_cmp_ps(
256 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
/* Branchless swap and sign correction, then tangent = sine/cosine. */
260 __m256 temp = cosine;
262 _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
263 sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
264 sine = _mm256_sub_ps(
265 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
266 cosine = _mm256_sub_ps(
268 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
269 tangent = _mm256_div_ps(sine, cosine);
270 _mm256_store_ps(bPtr, tangent);
/* Scalar tail. */
275 number = eighthPoints * 8;
276 for (; number < num_points; number++) {
277 *bPtr++ = tan(*aPtr++);
284#include <smmintrin.h>
/* volk_32f_tan_32f_a_sse4_1: tan() over num_points floats, 4 per iteration,
 * aligned SSE4.1 path.  Same reduce / polynomial / quadrant-fixup scheme as
 * the AVX2 variants, using 128-bit registers and _mm_cmpneq/_mm_cmplt masks;
 * scalar tanf() tail for the remainder.
 * NOTE(review): extraction artifact -- return type, braces, and several
 * statements are missing.  Code kept byte-identical; comments only. */
287volk_32f_tan_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
289 float* bPtr = bVector;
290 const float* aPtr = aVector;
292 unsigned int number = 0;
293 unsigned int quarterPoints = num_points / 4;
296 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
298 __m128 sine, cosine, tangent, condition1, condition2, condition3;
299 __m128i q, r, ones, twos, fours;
/* 4/pi and split pi/4 for the range reduction. */
301 m4pi = _mm_set1_ps(1.273239545);
302 pio4A = _mm_set1_ps(0.78515625);
303 pio4B = _mm_set1_ps(0.241876e-3);
304 ffours = _mm_set1_ps(4.0);
305 ftwos = _mm_set1_ps(2.0);
306 fones = _mm_set1_ps(1.0);
307 fzeroes = _mm_setzero_ps();
308 ones = _mm_set1_epi32(1);
309 twos = _mm_set1_epi32(2);
310 fours = _mm_set1_epi32(4);
/* Sine/cosine polynomial coefficients. */
312 cp1 = _mm_set1_ps(1.0);
313 cp2 = _mm_set1_ps(0.83333333e-1);
314 cp3 = _mm_set1_ps(0.2777778e-2);
315 cp4 = _mm_set1_ps(0.49603e-4);
316 cp5 = _mm_set1_ps(0.551e-6);
318 for (; number < quarterPoints; number++) {
319 aVal = _mm_load_ps(aPtr);
/* Branchless |aVal| (assignment head lost in extraction). */
321 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
/* Octant index q; r rounds q to even. */
322 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
323 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
/* Two-step reduction s -= r*(pio4A + pio4B). */
325 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
326 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
/* NOTE(review): 1/8 scaling of s partially lost in extraction. */
329 s, _mm_set1_ps(8.0));
330 s = _mm_mul_ps(s, s);
/* Polynomial evaluation fragment. */
337 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
/* Three double-angle steps undo the 1/8 scaling. */
345 for (
i = 0;
i < 3;
i++) {
346 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
348 s = _mm_div_ps(s, ftwos);
350 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
351 cosine = _mm_sub_ps(fones, s);
/* Quadrant masks for swap (condition1) and sign flips (condition2/3). */
353 condition1 = _mm_cmpneq_ps(
354 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
355 condition2 = _mm_cmpneq_ps(
356 _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
357 _mm_cmplt_ps(aVal, fzeroes));
358 condition3 = _mm_cmpneq_ps(
359 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
/* Branchless swap and sign correction, then tangent = sine/cosine. */
361 __m128 temp = cosine;
362 cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
363 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
365 _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
367 cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
368 tangent = _mm_div_ps(sine, cosine);
369 _mm_store_ps(bPtr, tangent);
/* Scalar tail. */
374 number = quarterPoints * 4;
375 for (; number < num_points; number++) {
376 *bPtr++ = tanf(*aPtr++);
385#ifndef INCLUDED_volk_32f_tan_32f_u_H
386#define INCLUDED_volk_32f_tan_32f_u_H
388#if LV_HAVE_AVX2 && LV_HAVE_FMA
389#include <immintrin.h>
/* volk_32f_tan_32f_u_avx2_fma: unaligned-memory twin of the aligned
 * AVX2+FMA kernel -- identical math, but _mm256_loadu_ps/_mm256_storeu_ps
 * so the buffers need no 32-byte alignment.  8 floats per iteration,
 * scalar libm tail for the remainder.
 * NOTE(review): extraction artifact -- return type, braces, and several
 * statements are missing.  Code kept byte-identical; comments only. */
392volk_32f_tan_32f_u_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
394 float* bPtr = bVector;
395 const float* aPtr = aVector;
397 unsigned int number = 0;
398 unsigned int eighthPoints = num_points / 8;
401 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
403 __m256 sine, cosine, tangent, condition1, condition2, condition3;
404 __m256i q, r, ones, twos, fours;
/* 4/pi and split pi/4 reduction constants. */
406 m4pi = _mm256_set1_ps(1.273239545);
407 pio4A = _mm256_set1_ps(0.78515625);
408 pio4B = _mm256_set1_ps(0.241876e-3);
409 ffours = _mm256_set1_ps(4.0);
410 ftwos = _mm256_set1_ps(2.0);
411 fones = _mm256_set1_ps(1.0);
412 fzeroes = _mm256_setzero_ps();
413 ones = _mm256_set1_epi32(1);
414 twos = _mm256_set1_epi32(2);
415 fours = _mm256_set1_epi32(4);
/* Sine/cosine polynomial coefficients. */
417 cp1 = _mm256_set1_ps(1.0);
418 cp2 = _mm256_set1_ps(0.83333333e-1);
419 cp3 = _mm256_set1_ps(0.2777778e-2);
420 cp4 = _mm256_set1_ps(0.49603e-4);
421 cp5 = _mm256_set1_ps(0.551e-6);
423 for (; number < eighthPoints; number++) {
/* Unaligned load is the only difference from the aligned path. */
424 aVal = _mm256_loadu_ps(aPtr);
425 s = _mm256_sub_ps(aVal,
426 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
427 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
/* Octant index q; r rounded to even; FMA-based reduction. */
428 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
429 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
431 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
432 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
/* NOTE(review): 1/8 scaling of s partially lost in extraction. */
436 _mm256_set1_ps(8.0));
437 s = _mm256_mul_ps(s, s);
/* Polynomial evaluation fragment. */
442 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
/* Three double-angle steps undo the 1/8 scaling. */
447 for (
i = 0;
i < 3;
i++) {
448 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
450 s = _mm256_div_ps(s, ftwos);
452 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
453 cosine = _mm256_sub_ps(fones, s);
/* Quadrant masks for swap (condition1) and sign flips (condition2/3). */
455 condition1 = _mm256_cmp_ps(
456 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
459 condition2 = _mm256_cmp_ps(
461 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
462 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
464 condition3 = _mm256_cmp_ps(
465 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
/* Branchless swap and sign correction, then tangent = sine/cosine. */
469 __m256 temp = cosine;
471 _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
472 sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
473 sine = _mm256_sub_ps(
474 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
475 cosine = _mm256_sub_ps(
477 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
478 tangent = _mm256_div_ps(sine, cosine);
479 _mm256_storeu_ps(bPtr, tangent);
/* Scalar tail. */
484 number = eighthPoints * 8;
485 for (; number < num_points; number++) {
486 *bPtr++ = tan(*aPtr++);
493#include <immintrin.h>
/* volk_32f_tan_32f_u_avx2: unaligned twin of the aligned AVX2 (non-FMA)
 * kernel -- identical math with _mm256_loadu_ps/_mm256_storeu_ps.  8 floats
 * per iteration, scalar libm tail for the remainder.
 * NOTE(review): extraction artifact -- return type, braces, and several
 * statements are missing.  Code kept byte-identical; comments only. */
496volk_32f_tan_32f_u_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
498 float* bPtr = bVector;
499 const float* aPtr = aVector;
501 unsigned int number = 0;
502 unsigned int eighthPoints = num_points / 8;
505 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
507 __m256 sine, cosine, tangent, condition1, condition2, condition3;
508 __m256i q, r, ones, twos, fours;
/* 4/pi and split pi/4 reduction constants. */
510 m4pi = _mm256_set1_ps(1.273239545);
511 pio4A = _mm256_set1_ps(0.78515625);
512 pio4B = _mm256_set1_ps(0.241876e-3);
513 ffours = _mm256_set1_ps(4.0);
514 ftwos = _mm256_set1_ps(2.0);
515 fones = _mm256_set1_ps(1.0);
516 fzeroes = _mm256_setzero_ps();
517 ones = _mm256_set1_epi32(1);
518 twos = _mm256_set1_epi32(2);
519 fours = _mm256_set1_epi32(4);
/* Sine/cosine polynomial coefficients. */
521 cp1 = _mm256_set1_ps(1.0);
522 cp2 = _mm256_set1_ps(0.83333333e-1);
523 cp3 = _mm256_set1_ps(0.2777778e-2);
524 cp4 = _mm256_set1_ps(0.49603e-4);
525 cp5 = _mm256_set1_ps(0.551e-6);
527 for (; number < eighthPoints; number++) {
/* Unaligned load. */
528 aVal = _mm256_loadu_ps(aPtr);
529 s = _mm256_sub_ps(aVal,
530 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
531 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
/* Octant index q; r rounded to even; mul+sub reduction (no FMA). */
532 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
533 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
535 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
536 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
/* NOTE(review): 1/8 scaling of s partially lost in extraction. */
540 _mm256_set1_ps(8.0));
541 s = _mm256_mul_ps(s, s);
/* Polynomial evaluation fragment. */
549 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
/* Three double-angle steps undo the 1/8 scaling. */
558 for (
i = 0;
i < 3;
i++) {
559 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
561 s = _mm256_div_ps(s, ftwos);
563 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
564 cosine = _mm256_sub_ps(fones, s);
/* Quadrant masks for swap (condition1) and sign flips (condition2/3). */
566 condition1 = _mm256_cmp_ps(
567 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
570 condition2 = _mm256_cmp_ps(
572 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
573 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
575 condition3 = _mm256_cmp_ps(
576 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
/* Branchless swap and sign correction, then tangent = sine/cosine. */
580 __m256 temp = cosine;
582 _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
583 sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
584 sine = _mm256_sub_ps(
585 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
586 cosine = _mm256_sub_ps(
588 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
589 tangent = _mm256_div_ps(sine, cosine);
590 _mm256_storeu_ps(bPtr, tangent);
/* Scalar tail. */
595 number = eighthPoints * 8;
596 for (; number < num_points; number++) {
597 *bPtr++ = tan(*aPtr++);
605#include <smmintrin.h>
/* volk_32f_tan_32f_u_sse4_1: unaligned twin of the aligned SSE4.1 kernel --
 * identical math with _mm_loadu_ps/_mm_storeu_ps.  4 floats per iteration,
 * scalar tanf() tail for the remainder.
 * NOTE(review): extraction artifact -- return type, braces, and several
 * statements are missing.  Code kept byte-identical; comments only. */
608volk_32f_tan_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
610 float* bPtr = bVector;
611 const float* aPtr = aVector;
613 unsigned int number = 0;
614 unsigned int quarterPoints = num_points / 4;
617 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
619 __m128 sine, cosine, tangent, condition1, condition2, condition3;
620 __m128i q, r, ones, twos, fours;
/* 4/pi and split pi/4 reduction constants. */
622 m4pi = _mm_set1_ps(1.273239545);
623 pio4A = _mm_set1_ps(0.78515625);
624 pio4B = _mm_set1_ps(0.241876e-3);
625 ffours = _mm_set1_ps(4.0);
626 ftwos = _mm_set1_ps(2.0);
627 fones = _mm_set1_ps(1.0);
628 fzeroes = _mm_setzero_ps();
629 ones = _mm_set1_epi32(1);
630 twos = _mm_set1_epi32(2);
631 fours = _mm_set1_epi32(4);
/* Sine/cosine polynomial coefficients. */
633 cp1 = _mm_set1_ps(1.0);
634 cp2 = _mm_set1_ps(0.83333333e-1);
635 cp3 = _mm_set1_ps(0.2777778e-2);
636 cp4 = _mm_set1_ps(0.49603e-4);
637 cp5 = _mm_set1_ps(0.551e-6);
639 for (; number < quarterPoints; number++) {
/* Unaligned load. */
640 aVal = _mm_loadu_ps(aPtr);
/* Branchless |aVal| (assignment head lost in extraction). */
642 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
/* Octant index q; r rounded to even; mul+sub reduction. */
643 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
644 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
646 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
647 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
/* NOTE(review): 1/8 scaling of s partially lost in extraction. */
650 s, _mm_set1_ps(8.0));
651 s = _mm_mul_ps(s, s);
/* Polynomial evaluation fragment. */
658 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
/* Three double-angle steps undo the 1/8 scaling. */
666 for (
i = 0;
i < 3;
i++) {
667 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
669 s = _mm_div_ps(s, ftwos);
671 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
672 cosine = _mm_sub_ps(fones, s);
/* Quadrant masks for swap (condition1) and sign flips (condition2/3). */
674 condition1 = _mm_cmpneq_ps(
675 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
676 condition2 = _mm_cmpneq_ps(
677 _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
678 _mm_cmplt_ps(aVal, fzeroes));
679 condition3 = _mm_cmpneq_ps(
680 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
/* Branchless swap and sign correction, then tangent = sine/cosine. */
682 __m128 temp = cosine;
683 cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
684 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
686 _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
688 cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
689 tangent = _mm_div_ps(sine, cosine);
690 _mm_storeu_ps(bPtr, tangent);
/* Scalar tail. */
695 number = quarterPoints * 4;
696 for (; number < num_points; number++) {
697 *bPtr++ = tanf(*aPtr++);
704#ifdef LV_HAVE_GENERIC
/* Body of volk_32f_tan_32f_generic: portable scalar fallback, one tanf()
 * call per element.
 * NOTE(review): the function signature line was lost in extraction (the
 * trailing doxygen residue identifies it as volk_32f_tan_32f_generic). */
709 float* bPtr = bVector;
710 const float* aPtr = aVector;
711 unsigned int number = 0;
713 for (; number < num_points; number++) {
714 *bPtr++ = tanf(*aPtr++);
/* Body of volk_32f_tan_32f_neon: 4 floats per iteration via vld1q_f32 /
 * vst1q_f32, with a scalar tanf() tail for the remainder.
 * NOTE(review): the signature and the lines declaring a_vec/b_vec and
 * computing b_vec (presumably via _vtanq_f32, per the trailing doxygen
 * residue) were lost in extraction -- confirm against upstream. */
727 unsigned int number = 0;
728 unsigned int quarter_points = num_points / 4;
729 float* bVectorPtr = bVector;
730 const float* aVectorPtr = aVector;
735 for (number = 0; number < quarter_points; number++) {
736 a_vec = vld1q_f32(aVectorPtr);
740 vst1q_f32(bVectorPtr, b_vec);
747 for (number = quarter_points * 4; number < num_points; number++) {
748 *bVectorPtr++ = tanf(*aVectorPtr++);
754#include <riscv_vector.h>
/* volk_32f_tan_32f_rvv: RISC-V Vector (RVV, LMUL=2) tan() kernel using a
 * strip-mining loop (vl elements per pass, vl chosen by vsetvl each
 * iteration, so no scalar tail is needed).  Same overall scheme as the x86
 * paths: reduce |x| via 4/pi with a three-part pi/4 constant, evaluate the
 * sin/cos polynomial, unwind with three double-angle steps, then fix signs
 * with mask registers and divide.
 * NOTE(review): extraction artifact -- return type, braces, and a few
 * statements (e.g. the declarations of t and sine) are missing.  Code kept
 * byte-identical; comments only. */
757volk_32f_tan_32f_rvv(
float* bVector,
const float* aVector,
unsigned int num_points)
759 size_t vlmax = __riscv_vsetvlmax_e32m2();
/* Reduction constants: 4/pi, and pi/4 split into three parts (coarse +
 * two corrections) for extended precision. */
761 const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax);
762 const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax);
763 const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax);
764 const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax);
766 const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
767 const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);
/* Polynomial coefficients (same values as cp2..cp5 in the x86 paths). */
769 const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax);
770 const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax);
771 const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05f, vlmax);
772 const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07f, vlmax);
/* Strip-mining loop: process vl elements per pass until n reaches 0. */
774 size_t n = num_points;
775 for (
size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
776 vl = __riscv_vsetvl_e32m2(n);
777 vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
/* s = |v|; q = round(s * 4/pi) is the octant; r = q rounded to even. */
778 vfloat32m2_t s = __riscv_vfabs(v, vl);
779 vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl);
780 vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl);
/* Three-step reduction: s -= r*(cPio4a + cPio4b + cPio4c). */
782 s = __riscv_vfnmsac(s, cPio4a, r, vl);
783 s = __riscv_vfnmsac(s, cPio4b, r, vl);
784 s = __riscv_vfnmsac(s, cPio4c, r, vl);
/* Scale by 1/8 and square before the polynomial; the double-angle steps
 * below undo the scaling. */
786 s = __riscv_vfmul(s, 1 / 8.0f, vl);
787 s = __riscv_vfmul(s, s, vl);
/* Horner evaluation in t (NOTE(review): t's declaration, presumably
 * t = s saved before this chain, was lost in extraction -- confirm). */
789 s = __riscv_vfmsub(s, c5, c4, vl);
790 s = __riscv_vfmadd(s, t, c3, vl);
791 s = __riscv_vfmsub(s, t, c2, vl);
792 s = __riscv_vfmadd(s, t, cf1, vl);
793 s = __riscv_vfmul(s, t, vl);
/* Three double-angle steps: s = s * (4 - s), then halve. */
794 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
795 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
796 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
797 s = __riscv_vfmul(s, 1 / 2.0f, vl);
/* Recover sine = sqrt((2 - s) * s) and cosine = 1 - s
 * (NOTE(review): the sine declaration head was lost in extraction). */
800 __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl);
801 vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl);
/* Mask registers: m1 swaps sin/cos, m2 negates cosine, m3 negates sine
 * (m3 also folds in the sign bit of the original input v). */
803 vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl);
804 vbool16_t m2 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 2, vl), 4, vl), 0, vl);
805 vbool16_t m3 = __riscv_vmxor(__riscv_vmslt(__riscv_vreinterpret_i32m2(v), 0, vl),
806 __riscv_vmsne(__riscv_vand(q, 4, vl), 0, vl),
/* Masked merge/negate, then tan = sine / cosine, stored to bVector. */
809 vfloat32m2_t sine0 = sine;
810 sine = __riscv_vmerge(sine, cosine, m1, vl);
811 sine = __riscv_vfneg_mu(m3, sine, sine, vl);
813 cosine = __riscv_vmerge(cosine, sine0, m1, vl);
814 cosine = __riscv_vfneg_mu(m2, cosine, cosine, vl);
816 __riscv_vse32(bVector, __riscv_vfdiv(sine, cosine, vl), vl);
static void volk_32f_tan_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_tan_32f.h:725
static void volk_32f_tan_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_tan_32f.h:707
#define __VOLK_PREFETCH(addr)
Definition volk_common.h:68
for i
Definition volk_config_fixed.tmpl.h:13
static float32x4_t _vtanq_f32(float32x4_t x)
Definition volk_neon_intrinsics.h:261