63#ifndef INCLUDED_volk_32f_sin_32f_a_H
64#define INCLUDED_volk_32f_sin_32f_a_H
/*
 * AVX-512F sine kernel using aligned 64-byte loads/stores: computes the sine of the
 * floats in inVector and stores the results to sinVector. num_points / 16 iterations
 * run on 16-float vectors; a scalar loop using sinf() follows for the leftover points.
 *
 * NOTE(review): this text is a scraped listing. Upstream line numbers are fused onto
 * the start of each line and several lines (braces, statement heads, declaration
 * continuations, pointer increments) are absent. Comments describe only what is
 * visible here.
 */
68static inline void volk_32f_sin_32f_a_avx512f(
float* sinVector,
69 const float* inVector,
70 unsigned int num_points)
72 float* sinPtr = sinVector;
73 const float* inPtr = inVector;
75 unsigned int number = 0;
/* Number of complete 16-float groups. */
76 unsigned int sixteenPoints = num_points / 16;
/* NOTE(review): this declaration ends on a comma; its continuation (fones, sine,
 * cosine are used below) and the declaration of `i` are not visible. */
79 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
82 __m512i q, zeros, ones, twos, fours;
/* m4pi is 4/pi; pio4A + pio4B + pio4C is pi/4 split into three terms of decreasing
 * magnitude, subtracted one after another during range reduction. */
84 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
85 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
86 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
87 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
88 ffours = _mm512_set1_ps(4.0);
89 ftwos = _mm512_set1_ps(2.0);
90 fones = _mm512_set1_ps(1.0);
91 zeros = _mm512_setzero_epi32();
92 ones = _mm512_set1_epi32(1);
93 twos = _mm512_set1_epi32(2);
94 fours = _mm512_set1_epi32(4);
/* Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400. */
96 cp1 = _mm512_set1_ps(1.0);
97 cp2 = _mm512_set1_ps(0.08333333333333333);
98 cp3 = _mm512_set1_ps(0.002777777777777778);
99 cp4 = _mm512_set1_ps(4.96031746031746e-05);
100 cp5 = _mm512_set1_ps(5.511463844797178e-07);
101 __mmask16 condition1, condition2, ltZero;
103 for (; number < sixteenPoints; number++) {
104 aVal = _mm512_load_ps(inPtr);
/* s = |aVal|: clear the sign bit of every lane. */
106 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
/* q = (int)floor(s * 4/pi). */
109 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
/* r = (float)(q + (q & 1)): q rounded up to the next even integer. */
111 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
/* Range reduction: s -= r * pi/4, one term of the split constant at a time. */
113 s = _mm512_fnmadd_ps(r, pio4A, s);
114 s = _mm512_fnmadd_ps(r, pio4B, s);
115 s = _mm512_fnmadd_ps(r, pio4C, s);
/* NOTE(review): only the tail of this statement is visible (second operand 8.0f);
 * the head is missing from the listing. */
119 _mm512_set1_ps(8.0f));
120 s = _mm512_mul_ps(s, s);
/* NOTE(review): fragment of a nested fmadd/fmsub polynomial in s using cp5..cp2;
 * the enclosing calls and the assignment are not visible. */
125 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
/* Apply s = s * (4 - s) three times. */
130 for (
i = 0;
i < 3;
i++) {
131 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
133 s = _mm512_div_ps(s, ftwos);
/* sine = sqrt((2 - s) * s), cosine = 1 - s. */
135 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
136 cosine = _mm512_sub_ps(fones, s);
/* condition1: lanes where ((q + 1) & 2) != 0 -- these take the cosine value. */
138 condition1 = _mm512_cmpneq_epi32_mask(
139 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
/* ltZero: lanes whose original input is negative. */
140 ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
/* condition2: ((q & 4) != 0) XOR (input < 0) -- these lanes get their sign flipped. */
141 condition2 = _mm512_kxor(
142 _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
/* Select cosine where condition1 is set, then multiply by -1 where condition2 is set. */
144 sine = _mm512_mask_blend_ps(condition1, sine, cosine);
145 sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
146 _mm512_store_ps(sinPtr, sine);
/* NOTE(review): the per-iteration pointer advances and the loop's closing brace are
 * not visible in this listing. */
/* Scalar tail for the points beyond the last full group of 16. */
151 number = sixteenPoints * 16;
152 for (; number < num_points; number++) {
153 *sinPtr++ = sinf(*inPtr++);
157#if LV_HAVE_AVX2 && LV_HAVE_FMA
158#include <immintrin.h>
/*
 * AVX2+FMA sine kernel using aligned 32-byte loads/stores: computes the sine of the
 * floats in aVector and stores the results to bVector. num_points / 8 iterations run
 * on 8-float vectors; a scalar loop follows for the leftover points.
 *
 * NOTE(review): scraped listing -- the return-type line, braces, some statement heads,
 * comparison predicates and pointer increments are missing. Comments cover only the
 * visible text.
 */
161volk_32f_sin_32f_a_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
163 float* bPtr = bVector;
164 const float* aPtr = aVector;
166 unsigned int number = 0;
/* Number of complete 8-float groups. */
167 unsigned int eighthPoints = num_points / 8;
/* NOTE(review): this declaration ends on a comma; the continuation (fzeroes is used
 * below) and the declaration of `i` are not visible. */
170 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
172 __m256 sine, cosine, condition1, condition2;
173 __m256i q, r, ones, twos, fours;
/* m4pi approximates 4/pi; pio4A + pio4B is a two-term split approximating pi/4. */
175 m4pi = _mm256_set1_ps(1.273239545);
176 pio4A = _mm256_set1_ps(0.78515625);
177 pio4B = _mm256_set1_ps(0.241876e-3);
178 ffours = _mm256_set1_ps(4.0);
179 ftwos = _mm256_set1_ps(2.0);
180 fones = _mm256_set1_ps(1.0);
181 fzeroes = _mm256_setzero_ps();
182 ones = _mm256_set1_epi32(1);
183 twos = _mm256_set1_epi32(2);
184 fours = _mm256_set1_epi32(4);
/* Polynomial coefficients, approximately 1, 1/12, 1/360, 1/20160, 1/1814400. */
186 cp1 = _mm256_set1_ps(1.0);
187 cp2 = _mm256_set1_ps(0.83333333e-1);
188 cp3 = _mm256_set1_ps(0.2777778e-2);
189 cp4 = _mm256_set1_ps(0.49603e-4);
190 cp5 = _mm256_set1_ps(0.551e-6);
192 for (; number < eighthPoints; number++) {
193 aVal = _mm256_load_ps(aPtr);
/* s = |aVal|: subtract 2*aVal in the lanes where aVal < 0. */
194 s = _mm256_sub_ps(aVal,
195 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
196 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
/* q = (int)floor(s * 4/pi); r = q + (q & 1), i.e. q rounded up to an even integer. */
197 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
198 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
/* Range reduction: s -= r * pi/4, one term of the split constant at a time. */
200 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
201 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
/* NOTE(review): only the tail of this statement is visible (second operand 8.0). */
205 _mm256_set1_ps(8.0));
206 s = _mm256_mul_ps(s, s);
/* NOTE(review): fragment of a nested fmadd/fmsub polynomial in s using cp5..cp2;
 * the enclosing calls and the assignment are not visible. */
211 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
/* Apply s = s * (4 - s) three times. */
216 for (
i = 0;
i < 3;
i++) {
217 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
219 s = _mm256_div_ps(s, ftwos);
/* sine = sqrt((2 - s) * s), cosine = 1 - s. */
221 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
222 cosine = _mm256_sub_ps(fones, s);
/* condition1 compares (float)((q + 1) & 2) against something; the second operand
 * and the predicate are on a line missing from this listing. */
224 condition1 = _mm256_cmp_ps(
225 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
/* condition2 combines the lane masks ((q & 4) != 0) and (aVal < 0); the outer
 * predicate is not visible. */
228 condition2 = _mm256_cmp_ps(
230 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
231 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
/* sine + ((cosine - sine) & condition1): yields cosine where condition1 is set.
 * NOTE(review): the assignment target for this expression is not visible. */
238 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
/* sine - (2 * sine & condition2): flips the sign where condition2 is set. */
239 sine = _mm256_sub_ps(
240 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
241 _mm256_store_ps(bPtr, sine);
/* NOTE(review): the per-iteration pointer advances and the loop's closing brace are
 * not visible in this listing. */
/* Scalar tail for the points beyond the last full group of 8. */
246 number = eighthPoints * 8;
247 for (; number < num_points; number++) {
/* NOTE(review): calls sin() rather than sinf(); presumably the double-precision libm
 * routine, with the result narrowed back to float on store -- confirm whether sinf()
 * was intended. */
248 *bPtr++ = sin(*aPtr++);
255#include <immintrin.h>
/*
 * AVX2 (no FMA) sine kernel using aligned 32-byte loads/stores: computes the sine of
 * the floats in aVector and stores the results to bVector. num_points / 8 iterations
 * run on 8-float vectors; a scalar loop follows for the leftover points.
 *
 * NOTE(review): scraped listing -- the return-type line, braces, some statement heads,
 * comparison predicates and pointer increments are missing. Comments cover only the
 * visible text.
 */
258volk_32f_sin_32f_a_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
260 float* bPtr = bVector;
261 const float* aPtr = aVector;
263 unsigned int number = 0;
/* Number of complete 8-float groups. */
264 unsigned int eighthPoints = num_points / 8;
/* NOTE(review): this declaration ends on a comma; the continuation (fzeroes is used
 * below) and the declaration of `i` are not visible. */
267 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
269 __m256 sine, cosine, condition1, condition2;
270 __m256i q, r, ones, twos, fours;
/* m4pi approximates 4/pi; pio4A + pio4B is a two-term split approximating pi/4. */
272 m4pi = _mm256_set1_ps(1.273239545);
273 pio4A = _mm256_set1_ps(0.78515625);
274 pio4B = _mm256_set1_ps(0.241876e-3);
275 ffours = _mm256_set1_ps(4.0);
276 ftwos = _mm256_set1_ps(2.0);
277 fones = _mm256_set1_ps(1.0);
278 fzeroes = _mm256_setzero_ps();
279 ones = _mm256_set1_epi32(1);
280 twos = _mm256_set1_epi32(2);
281 fours = _mm256_set1_epi32(4);
/* Polynomial coefficients, approximately 1, 1/12, 1/360, 1/20160, 1/1814400. */
283 cp1 = _mm256_set1_ps(1.0);
284 cp2 = _mm256_set1_ps(0.83333333e-1);
285 cp3 = _mm256_set1_ps(0.2777778e-2);
286 cp4 = _mm256_set1_ps(0.49603e-4);
287 cp5 = _mm256_set1_ps(0.551e-6);
289 for (; number < eighthPoints; number++) {
290 aVal = _mm256_load_ps(aPtr);
/* s = |aVal|: subtract 2*aVal in the lanes where aVal < 0. */
291 s = _mm256_sub_ps(aVal,
292 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
293 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
/* q = (int)floor(s * 4/pi); r = q + (q & 1), i.e. q rounded up to an even integer. */
294 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
295 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
/* Range reduction with separate multiply and subtract: s -= r * pio4A, then r * pio4B. */
297 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
298 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
/* NOTE(review): only the tail of this statement is visible (second operand 8.0). */
302 _mm256_set1_ps(8.0));
303 s = _mm256_mul_ps(s, s);
/* NOTE(review): innermost fragment of the polynomial evaluation, (s * cp5 - cp4)
 * as the first factor of a multiply; the rest of the expression is not visible. */
311 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
/* Apply s = s * (4 - s) three times. */
320 for (
i = 0;
i < 3;
i++) {
321 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
323 s = _mm256_div_ps(s, ftwos);
/* sine = sqrt((2 - s) * s), cosine = 1 - s. */
325 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
326 cosine = _mm256_sub_ps(fones, s);
/* condition1 compares (float)((q + 1) & 2) against something; the second operand
 * and the predicate are on a line missing from this listing. */
328 condition1 = _mm256_cmp_ps(
329 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
/* condition2 combines the lane masks ((q & 4) != 0) and (aVal < 0); the outer
 * predicate is not visible. */
332 condition2 = _mm256_cmp_ps(
334 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
335 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
/* sine + ((cosine - sine) & condition1): yields cosine where condition1 is set.
 * NOTE(review): the assignment target for this expression is not visible. */
342 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
/* sine - (2 * sine & condition2): flips the sign where condition2 is set. */
343 sine = _mm256_sub_ps(
344 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
345 _mm256_store_ps(bPtr, sine);
/* NOTE(review): the per-iteration pointer advances and the loop's closing brace are
 * not visible in this listing. */
/* Scalar tail for the points beyond the last full group of 8. */
350 number = eighthPoints * 8;
351 for (; number < num_points; number++) {
/* NOTE(review): calls sin() rather than sinf(); presumably the double-precision libm
 * routine, with the result narrowed back to float on store -- confirm whether sinf()
 * was intended. */
352 *bPtr++ = sin(*aPtr++);
359#include <smmintrin.h>
/*
 * SSE4.1 sine kernel using aligned 16-byte loads/stores: computes the sine of the
 * floats in aVector and stores the results to bVector. num_points / 4 iterations run
 * on 4-float vectors; a scalar loop using sinf() follows for the leftover points.
 *
 * NOTE(review): scraped listing -- the return-type line, braces, some statement heads
 * and pointer increments are missing. Comments cover only the visible text.
 */
362volk_32f_sin_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
364 float* bPtr = bVector;
365 const float* aPtr = aVector;
367 unsigned int number = 0;
/* Number of complete 4-float groups. */
368 unsigned int quarterPoints = num_points / 4;
/* NOTE(review): this declaration ends on a comma; the continuation (fzeroes is used
 * below) and the declaration of `i` are not visible. */
371 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
373 __m128 sine, cosine, condition1, condition2;
374 __m128i q, r, ones, twos, fours;
/* m4pi approximates 4/pi; pio4A + pio4B is a two-term split approximating pi/4. */
376 m4pi = _mm_set1_ps(1.273239545);
377 pio4A = _mm_set1_ps(0.78515625);
378 pio4B = _mm_set1_ps(0.241876e-3);
379 ffours = _mm_set1_ps(4.0);
380 ftwos = _mm_set1_ps(2.0);
381 fones = _mm_set1_ps(1.0);
382 fzeroes = _mm_setzero_ps();
383 ones = _mm_set1_epi32(1);
384 twos = _mm_set1_epi32(2);
385 fours = _mm_set1_epi32(4);
/* Polynomial coefficients, approximately 1, 1/12, 1/360, 1/20160, 1/1814400. */
387 cp1 = _mm_set1_ps(1.0);
388 cp2 = _mm_set1_ps(0.83333333e-1);
389 cp3 = _mm_set1_ps(0.2777778e-2);
390 cp4 = _mm_set1_ps(0.49603e-4);
391 cp5 = _mm_set1_ps(0.551e-6);
393 for (; number < quarterPoints; number++) {
394 aVal = _mm_load_ps(aPtr);
/* NOTE(review): tail of a statement whose head is missing; the visible operand is
 * (2 * aVal) masked to the lanes where aVal < 0. */
396 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
/* q = (int)floor(s * 4/pi); r = q + (q & 1), i.e. q rounded up to an even integer. */
397 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
398 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
/* Range reduction: s -= r * pio4A, then s -= r * pio4B. */
400 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
401 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
/* NOTE(review): only the argument list of this statement is visible (s and 8.0). */
404 s, _mm_set1_ps(8.0));
405 s = _mm_mul_ps(s, s);
/* NOTE(review): fragment of the polynomial evaluation, (s * cp5 - cp4) * s as the
 * first addend; the rest of the expression is not visible. */
412 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
/* Apply s = s * (4 - s) three times. */
420 for (
i = 0;
i < 3;
i++) {
421 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
423 s = _mm_div_ps(s, ftwos);
/* sine = sqrt((2 - s) * s), cosine = 1 - s. */
425 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
426 cosine = _mm_sub_ps(fones, s);
/* condition1: lanes where (float)((q + 1) & 2) != 0. */
428 condition1 = _mm_cmpneq_ps(
429 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
/* condition2: not-equal comparison of the two lane masks ((q & 4) != 0) and
 * (aVal < 0). */
430 condition2 = _mm_cmpneq_ps(
431 _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
432 _mm_cmplt_ps(aVal, fzeroes));
/* sine + ((cosine - sine) & condition1): yields cosine where condition1 is set. */
437 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
/* sine - (2 * sine & condition2): flips the sign where condition2 is set.
 * NOTE(review): the assignment target for this expression is not visible. */
439 _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
440 _mm_store_ps(bPtr, sine);
/* NOTE(review): the per-iteration pointer advances and the loop's closing brace are
 * not visible in this listing. */
/* Scalar tail for the points beyond the last full group of 4. */
445 number = quarterPoints * 4;
446 for (; number < num_points; number++) {
447 *bPtr++ = sinf(*aPtr++);
456#ifndef INCLUDED_volk_32f_sin_32f_u_H
457#define INCLUDED_volk_32f_sin_32f_u_H
459#ifdef LV_HAVE_AVX512F
461#include <immintrin.h>
/*
 * AVX-512F sine kernel using unaligned loads/stores: computes the sine of the floats
 * in inVector and stores the results to sinVector. num_points / 16 iterations run on
 * 16-float vectors; a scalar loop using sinf() follows for the leftover points.
 *
 * NOTE(review): this text is a scraped listing. Upstream line numbers are fused onto
 * the start of each line and several lines (braces, statement heads, declaration
 * continuations, pointer increments) are absent. Comments describe only what is
 * visible here.
 */
462static inline void volk_32f_sin_32f_u_avx512f(
float* sinVector,
463 const float* inVector,
464 unsigned int num_points)
466 float* sinPtr = sinVector;
467 const float* inPtr = inVector;
469 unsigned int number = 0;
/* Number of complete 16-float groups. */
470 unsigned int sixteenPoints = num_points / 16;
/* NOTE(review): this declaration ends on a comma; its continuation (fones, sine,
 * cosine are used below) and the declaration of `i` are not visible. */
473 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
476 __m512i q, zeros, ones, twos, fours;
/* m4pi is 4/pi; pio4A + pio4B + pio4C is pi/4 split into three terms of decreasing
 * magnitude, subtracted one after another during range reduction. */
478 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
479 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
480 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
481 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
482 ffours = _mm512_set1_ps(4.0);
483 ftwos = _mm512_set1_ps(2.0);
484 fones = _mm512_set1_ps(1.0);
485 zeros = _mm512_setzero_epi32();
486 ones = _mm512_set1_epi32(1);
487 twos = _mm512_set1_epi32(2);
488 fours = _mm512_set1_epi32(4);
/* Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400. */
490 cp1 = _mm512_set1_ps(1.0);
491 cp2 = _mm512_set1_ps(0.08333333333333333);
492 cp3 = _mm512_set1_ps(0.002777777777777778);
493 cp4 = _mm512_set1_ps(4.96031746031746e-05);
494 cp5 = _mm512_set1_ps(5.511463844797178e-07);
495 __mmask16 condition1, condition2, ltZero;
497 for (; number < sixteenPoints; number++) {
498 aVal = _mm512_loadu_ps(inPtr);
/* s = |aVal|: clear the sign bit of every lane. */
500 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
/* q = (int)floor(s * 4/pi). */
503 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
/* r = (float)(q + (q & 1)): q rounded up to the next even integer. */
505 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
/* Range reduction: s -= r * pi/4, one term of the split constant at a time. */
507 s = _mm512_fnmadd_ps(r, pio4A, s);
508 s = _mm512_fnmadd_ps(r, pio4B, s);
509 s = _mm512_fnmadd_ps(r, pio4C, s);
/* NOTE(review): only the tail of this statement is visible (second operand 8.0f);
 * the head is missing from the listing. */
513 _mm512_set1_ps(8.0f));
514 s = _mm512_mul_ps(s, s);
/* NOTE(review): fragment of a nested fmadd/fmsub polynomial in s using cp5..cp2;
 * the enclosing calls and the assignment are not visible. */
519 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
/* Apply s = s * (4 - s) three times. */
524 for (
i = 0;
i < 3;
i++) {
525 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
527 s = _mm512_div_ps(s, ftwos);
/* sine = sqrt((2 - s) * s), cosine = 1 - s. */
529 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
530 cosine = _mm512_sub_ps(fones, s);
/* condition1: lanes where ((q + 1) & 2) != 0 -- these take the cosine value. */
532 condition1 = _mm512_cmpneq_epi32_mask(
533 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
/* ltZero: lanes whose original input is negative. */
534 ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
/* condition2: ((q & 4) != 0) XOR (input < 0) -- these lanes get their sign flipped. */
535 condition2 = _mm512_kxor(
536 _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
/* Select cosine where condition1 is set, then multiply by -1 where condition2 is set. */
538 sine = _mm512_mask_blend_ps(condition1, sine, cosine);
539 sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
540 _mm512_storeu_ps(sinPtr, sine);
/* NOTE(review): the per-iteration pointer advances and the loop's closing brace are
 * not visible in this listing. */
/* Scalar tail for the points beyond the last full group of 16. */
545 number = sixteenPoints * 16;
546 for (; number < num_points; number++) {
547 *sinPtr++ = sinf(*inPtr++);
552#if LV_HAVE_AVX2 && LV_HAVE_FMA
553#include <immintrin.h>
/*
 * AVX2+FMA sine kernel using unaligned loads/stores: computes the sine of the floats
 * in aVector and stores the results to bVector. num_points / 8 iterations run on
 * 8-float vectors; a scalar loop follows for the leftover points.
 *
 * NOTE(review): scraped listing -- the return-type line, braces, some statement heads,
 * comparison predicates and pointer increments are missing. Comments cover only the
 * visible text.
 */
556volk_32f_sin_32f_u_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
558 float* bPtr = bVector;
559 const float* aPtr = aVector;
561 unsigned int number = 0;
/* Number of complete 8-float groups. */
562 unsigned int eighthPoints = num_points / 8;
/* NOTE(review): this declaration ends on a comma; the continuation (fzeroes is used
 * below) and the declaration of `i` are not visible. */
565 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
567 __m256 sine, cosine, condition1, condition2;
568 __m256i q, r, ones, twos, fours;
/* m4pi approximates 4/pi; pio4A + pio4B is a two-term split approximating pi/4. */
570 m4pi = _mm256_set1_ps(1.273239545);
571 pio4A = _mm256_set1_ps(0.78515625);
572 pio4B = _mm256_set1_ps(0.241876e-3);
573 ffours = _mm256_set1_ps(4.0);
574 ftwos = _mm256_set1_ps(2.0);
575 fones = _mm256_set1_ps(1.0);
576 fzeroes = _mm256_setzero_ps();
577 ones = _mm256_set1_epi32(1);
578 twos = _mm256_set1_epi32(2);
579 fours = _mm256_set1_epi32(4);
/* Polynomial coefficients, approximately 1, 1/12, 1/360, 1/20160, 1/1814400. */
581 cp1 = _mm256_set1_ps(1.0);
582 cp2 = _mm256_set1_ps(0.83333333e-1);
583 cp3 = _mm256_set1_ps(0.2777778e-2);
584 cp4 = _mm256_set1_ps(0.49603e-4);
585 cp5 = _mm256_set1_ps(0.551e-6);
587 for (; number < eighthPoints; number++) {
588 aVal = _mm256_loadu_ps(aPtr);
/* s = |aVal|: subtract 2*aVal in the lanes where aVal < 0. */
589 s = _mm256_sub_ps(aVal,
590 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
591 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
/* q = (int)floor(s * 4/pi); r = q + (q & 1), i.e. q rounded up to an even integer. */
592 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
593 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
/* Range reduction: s -= r * pi/4, one term of the split constant at a time. */
595 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
596 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
/* NOTE(review): only the tail of this statement is visible (second operand 8.0). */
600 _mm256_set1_ps(8.0));
601 s = _mm256_mul_ps(s, s);
/* NOTE(review): fragment of a nested fmadd/fmsub polynomial in s using cp5..cp2;
 * the enclosing calls and the assignment are not visible. */
606 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
/* Apply s = s * (4 - s) three times. */
611 for (
i = 0;
i < 3;
i++) {
612 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
614 s = _mm256_div_ps(s, ftwos);
/* sine = sqrt((2 - s) * s), cosine = 1 - s. */
616 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
617 cosine = _mm256_sub_ps(fones, s);
/* condition1 compares (float)((q + 1) & 2) against something; the second operand
 * and the predicate are on a line missing from this listing. */
619 condition1 = _mm256_cmp_ps(
620 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
/* condition2 combines the lane masks ((q & 4) != 0) and (aVal < 0); the outer
 * predicate is not visible. */
623 condition2 = _mm256_cmp_ps(
625 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
626 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
/* sine + ((cosine - sine) & condition1): yields cosine where condition1 is set.
 * NOTE(review): the assignment target for this expression is not visible. */
633 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
/* sine - (2 * sine & condition2): flips the sign where condition2 is set. */
634 sine = _mm256_sub_ps(
635 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
636 _mm256_storeu_ps(bPtr, sine);
/* NOTE(review): the per-iteration pointer advances and the loop's closing brace are
 * not visible in this listing. */
/* Scalar tail for the points beyond the last full group of 8. */
641 number = eighthPoints * 8;
642 for (; number < num_points; number++) {
/* NOTE(review): calls sin() rather than sinf(); presumably the double-precision libm
 * routine, with the result narrowed back to float on store -- confirm whether sinf()
 * was intended. */
643 *bPtr++ = sin(*aPtr++);
650#include <immintrin.h>
/*
 * AVX2 (no FMA) sine kernel using unaligned loads/stores: computes the sine of the
 * floats in aVector and stores the results to bVector. num_points / 8 iterations run
 * on 8-float vectors; a scalar loop follows for the leftover points.
 *
 * NOTE(review): scraped listing -- the return-type line, braces, some statement heads,
 * comparison predicates and pointer increments are missing. Comments cover only the
 * visible text.
 */
653volk_32f_sin_32f_u_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
655 float* bPtr = bVector;
656 const float* aPtr = aVector;
658 unsigned int number = 0;
/* Number of complete 8-float groups. */
659 unsigned int eighthPoints = num_points / 8;
/* NOTE(review): this declaration ends on a comma; the continuation (fzeroes is used
 * below) and the declaration of `i` are not visible. */
662 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
664 __m256 sine, cosine, condition1, condition2;
665 __m256i q, r, ones, twos, fours;
/* m4pi approximates 4/pi; pio4A + pio4B is a two-term split approximating pi/4. */
667 m4pi = _mm256_set1_ps(1.273239545);
668 pio4A = _mm256_set1_ps(0.78515625);
669 pio4B = _mm256_set1_ps(0.241876e-3);
670 ffours = _mm256_set1_ps(4.0);
671 ftwos = _mm256_set1_ps(2.0);
672 fones = _mm256_set1_ps(1.0);
673 fzeroes = _mm256_setzero_ps();
674 ones = _mm256_set1_epi32(1);
675 twos = _mm256_set1_epi32(2);
676 fours = _mm256_set1_epi32(4);
/* Polynomial coefficients, approximately 1, 1/12, 1/360, 1/20160, 1/1814400. */
678 cp1 = _mm256_set1_ps(1.0);
679 cp2 = _mm256_set1_ps(0.83333333e-1);
680 cp3 = _mm256_set1_ps(0.2777778e-2);
681 cp4 = _mm256_set1_ps(0.49603e-4);
682 cp5 = _mm256_set1_ps(0.551e-6);
684 for (; number < eighthPoints; number++) {
685 aVal = _mm256_loadu_ps(aPtr);
/* s = |aVal|: subtract 2*aVal in the lanes where aVal < 0. */
686 s = _mm256_sub_ps(aVal,
687 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
688 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
/* q = (int)floor(s * 4/pi); r = q + (q & 1), i.e. q rounded up to an even integer. */
689 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
690 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
/* Range reduction with separate multiply and subtract: s -= r * pio4A, then r * pio4B. */
692 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
693 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
/* NOTE(review): only the tail of this statement is visible (second operand 8.0). */
697 _mm256_set1_ps(8.0));
698 s = _mm256_mul_ps(s, s);
/* NOTE(review): innermost fragment of the polynomial evaluation, (s * cp5 - cp4)
 * as the first factor of a multiply; the rest of the expression is not visible. */
706 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
/* Apply s = s * (4 - s) three times. */
715 for (
i = 0;
i < 3;
i++) {
716 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
718 s = _mm256_div_ps(s, ftwos);
/* sine = sqrt((2 - s) * s), cosine = 1 - s. */
720 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
721 cosine = _mm256_sub_ps(fones, s);
/* condition1 compares (float)((q + 1) & 2) against something; the second operand
 * and the predicate are on a line missing from this listing. */
723 condition1 = _mm256_cmp_ps(
724 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
/* condition2 combines the lane masks ((q & 4) != 0) and (aVal < 0); the outer
 * predicate is not visible. */
727 condition2 = _mm256_cmp_ps(
729 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
730 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
/* sine + ((cosine - sine) & condition1): yields cosine where condition1 is set.
 * NOTE(review): the assignment target for this expression is not visible. */
737 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
/* sine - (2 * sine & condition2): flips the sign where condition2 is set. */
738 sine = _mm256_sub_ps(
739 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
740 _mm256_storeu_ps(bPtr, sine);
/* NOTE(review): the per-iteration pointer advances and the loop's closing brace are
 * not visible in this listing. */
/* Scalar tail for the points beyond the last full group of 8. */
745 number = eighthPoints * 8;
746 for (; number < num_points; number++) {
/* NOTE(review): calls sin() rather than sinf(); presumably the double-precision libm
 * routine, with the result narrowed back to float on store -- confirm whether sinf()
 * was intended. */
747 *bPtr++ = sin(*aPtr++);
755#include <smmintrin.h>
/*
 * SSE4.1 sine kernel using unaligned loads/stores: computes the sine of the floats in
 * aVector and stores the results to bVector. num_points / 4 iterations run on 4-float
 * vectors; a scalar loop using sinf() follows for the leftover points.
 *
 * NOTE(review): scraped listing -- the return-type line, braces, some statement heads
 * and pointer increments are missing. Comments cover only the visible text.
 */
758volk_32f_sin_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
760 float* bPtr = bVector;
761 const float* aPtr = aVector;
763 unsigned int number = 0;
/* Number of complete 4-float groups. */
764 unsigned int quarterPoints = num_points / 4;
/* NOTE(review): this declaration ends on a comma; the continuation (fzeroes is used
 * below) and the declaration of `i` are not visible. */
767 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
769 __m128 sine, cosine, condition1, condition2;
770 __m128i q, r, ones, twos, fours;
/* m4pi approximates 4/pi; pio4A + pio4B is a two-term split approximating pi/4. */
772 m4pi = _mm_set1_ps(1.273239545);
773 pio4A = _mm_set1_ps(0.78515625);
774 pio4B = _mm_set1_ps(0.241876e-3);
775 ffours = _mm_set1_ps(4.0);
776 ftwos = _mm_set1_ps(2.0);
777 fones = _mm_set1_ps(1.0);
778 fzeroes = _mm_setzero_ps();
779 ones = _mm_set1_epi32(1);
780 twos = _mm_set1_epi32(2);
781 fours = _mm_set1_epi32(4);
/* Polynomial coefficients, approximately 1, 1/12, 1/360, 1/20160, 1/1814400. */
783 cp1 = _mm_set1_ps(1.0);
784 cp2 = _mm_set1_ps(0.83333333e-1);
785 cp3 = _mm_set1_ps(0.2777778e-2);
786 cp4 = _mm_set1_ps(0.49603e-4);
787 cp5 = _mm_set1_ps(0.551e-6);
789 for (; number < quarterPoints; number++) {
790 aVal = _mm_loadu_ps(aPtr);
/* NOTE(review): tail of a statement whose head is missing; the visible operand is
 * (2 * aVal) masked to the lanes where aVal < 0. */
792 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
/* q = (int)floor(s * 4/pi); r = q + (q & 1), i.e. q rounded up to an even integer. */
793 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
794 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
/* Range reduction: s -= r * pio4A, then s -= r * pio4B. */
796 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
797 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
/* NOTE(review): only the argument list of this statement is visible (s and 8.0). */
800 s, _mm_set1_ps(8.0));
801 s = _mm_mul_ps(s, s);
/* NOTE(review): fragment of the polynomial evaluation, (s * cp5 - cp4) * s as the
 * first addend; the rest of the expression is not visible. */
808 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
/* Apply s = s * (4 - s) three times. */
816 for (
i = 0;
i < 3;
i++) {
817 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
819 s = _mm_div_ps(s, ftwos);
/* sine = sqrt((2 - s) * s), cosine = 1 - s. */
821 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
822 cosine = _mm_sub_ps(fones, s);
/* condition1: lanes where (float)((q + 1) & 2) != 0. */
824 condition1 = _mm_cmpneq_ps(
825 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
/* condition2: not-equal comparison of the two lane masks ((q & 4) != 0) and
 * (aVal < 0). */
826 condition2 = _mm_cmpneq_ps(
827 _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
828 _mm_cmplt_ps(aVal, fzeroes));
/* sine + ((cosine - sine) & condition1): yields cosine where condition1 is set. */
830 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
/* sine - (2 * sine & condition2): flips the sign where condition2 is set.
 * NOTE(review): the assignment target for this expression is not visible. */
832 _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
833 _mm_storeu_ps(bPtr, sine);
/* NOTE(review): the per-iteration pointer advances and the loop's closing brace are
 * not visible in this listing. */
/* Scalar tail for the points beyond the last full group of 4. */
838 number = quarterPoints * 4;
839 for (; number < num_points; number++) {
840 *bPtr++ = sinf(*aPtr++);
847#ifdef LV_HAVE_GENERIC
/*
 * Scalar reference path: writes sinf(aVector[k]) to bVector[k] for each of the
 * num_points elements, one element at a time.
 * NOTE(review): the function signature and braces are missing from this listing;
 * presumably this is the body of the generic kernel -- confirm against upstream.
 */
852 float* bPtr = bVector;
853 const float* aPtr = aVector;
854 unsigned int number = 0;
856 for (number = 0; number < num_points; number++) {
857 *bPtr++ = sinf(*aPtr++);
/*
 * NEON path: processes num_points / 4 groups of four floats with vector load/store,
 * then finishes the remaining points with sinf().
 * NOTE(review): the function signature, the declarations of a_vec and b_vec, the line
 * that computes b_vec from a_vec, the pointer advances and the braces are all missing
 * from this listing; presumably this is the body of the NEON kernel -- confirm against
 * upstream.
 */
871 unsigned int number = 0;
/* Number of complete 4-float groups. */
872 unsigned int quarter_points = num_points / 4;
873 float* bVectorPtr = bVector;
874 const float* aVectorPtr = aVector;
879 for (number = 0; number < quarter_points; number++) {
/* Load four input floats. */
880 a_vec = vld1q_f32(aVectorPtr);
/* Store four results (b_vec is produced on a line not visible here). */
884 vst1q_f32(bVectorPtr, b_vec);
/* Scalar tail for the points beyond the last full group of 4. */
891 for (number = quarter_points * 4; number < num_points; number++) {
892 *bVectorPtr++ = sinf(*aVectorPtr++);
899#include <riscv_vector.h>
/*
 * RISC-V Vector sine kernel: computes the sine of the floats in aVector and stores
 * the results to bVector, processing vl elements per iteration (e32, LMUL=2) until
 * all num_points elements are consumed. There is no separate scalar tail; the final
 * iteration simply runs with a shorter vl.
 *
 * NOTE(review): scraped listing -- the return-type line, braces, and the declarations
 * of `t` and `sine` are missing. Comments cover only the visible text.
 */
902volk_32f_sin_32f_rvv(
float* bVector,
const float* aVector,
unsigned int num_points)
904 size_t vlmax = __riscv_vsetvlmax_e32m2();
/* 4/pi, and pi/4 split into three terms of decreasing magnitude. */
906 const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax);
907 const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax);
908 const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax);
909 const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax);
911 const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
912 const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);
/* Polynomial coefficients: 1/12, 1/360, 1/20160, 1/1814400. */
914 const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax);
915 const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax);
916 const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05, vlmax);
917 const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07, vlmax);
919 size_t n = num_points;
/* Strip-mined loop: vl is chosen from the remaining count each iteration, and both
 * pointers advance by vl. */
920 for (
size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
921 vl = __riscv_vsetvl_e32m2(n);
922 vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
/* s = |v|. */
923 vfloat32m2_t s = __riscv_vfabs(v, vl);
/* q = s * 4/pi converted to int32; r = (float)(q + (q & 1)). */
924 vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl);
925 vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl);
/* Range reduction: s -= r * pi/4, one term of the split constant at a time. */
927 s = __riscv_vfnmsac(s, cPio4a, r, vl);
928 s = __riscv_vfnmsac(s, cPio4b, r, vl);
929 s = __riscv_vfnmsac(s, cPio4c, r, vl);
/* s = (s / 8)^2. */
931 s = __riscv_vfmul(s, 1 / 8.0f, vl);
932 s = __riscv_vfmul(s, s, vl);
/* Polynomial evaluation in t with alternating signs, ending with a multiply by t.
 * NOTE(review): `t` is defined on a line missing from this listing (presumably a
 * copy of s taken at this point -- confirm against upstream). */
934 s = __riscv_vfmsub(s, c5, c4, vl);
935 s = __riscv_vfmadd(s, t, c3, vl);
936 s = __riscv_vfmsub(s, t, c2, vl);
937 s = __riscv_vfmadd(s, t, cf1, vl);
938 s = __riscv_vfmul(s, t, vl);
/* Apply s = s * (4 - s) three times, then halve. */
939 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
940 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
941 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
942 s = __riscv_vfmul(s, 1 / 2.0f, vl);
/* sqrt((2 - s) * s).
 * NOTE(review): the declaration this initializes (presumably `sine`) is on a missing
 * line. */
945 __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl);
/* cosine = 1 - s. */
946 vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl);
/* m1: lanes where ((q + 1) & 2) != 0 -- these take the cosine value. */
948 vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl);
/* m2: (input bits reinterpreted as int32 < 0, i.e. sign bit set) XOR ((q & 4) != 0);
 * these lanes are negated. The closing arguments of this call are not visible. */
949 vbool16_t m2 = __riscv_vmxor(__riscv_vmslt(__riscv_vreinterpret_i32m2(v), 0, vl),
950 __riscv_vmsne(__riscv_vand(q, 4, vl), 0, vl),
/* Pick cosine where m1 is set, then negate where m2 is set (other lanes undisturbed). */
953 sine = __riscv_vmerge(sine, cosine, m1, vl);
954 sine = __riscv_vfneg_mu(m2, sine, sine, vl);
956 __riscv_vse32(bVector, sine, vl);
static void volk_32f_sin_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_sin_32f.h:850
static void volk_32f_sin_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_sin_32f.h:869
#define __VOLK_PREFETCH(addr)
Definition volk_common.h:68
for i
Definition volk_config_fixed.tmpl.h:13
static float32x4_t _vsinq_f32(float32x4_t x)
Definition volk_neon_intrinsics.h:249