63#ifndef INCLUDED_volk_32f_cos_32f_a_H
64#define INCLUDED_volk_32f_cos_32f_a_H
// Aligned AVX-512F cosine kernel: for each input float in inVector an
// approximation of its cosine is written to cosVector. Full groups of sixteen
// floats go through the __m512 loop (aligned load/store intrinsics); the
// remaining elements go through a scalar cosf() loop.
//
// NOTE(review): this listing is missing lines (braces, the ends of some
// declarations, the first lines of some multi-line statements, and any
// pointer-advance statements inside the vector loop). The comments below
// describe only the statements that are visible.
69static inline void volk_32f_cos_32f_a_avx512f(
float* cosVector,
70 const float* inVector,
71 unsigned int num_points)
73 float* cosPtr = cosVector;
74 const float* inPtr = inVector;
76 unsigned int number = 0;
// Number of complete 16-float groups.
77 unsigned int sixteenPoints = num_points / 16;
80 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
82 __m512i q, zeros, ones, twos, fours;
// m4pi is 4/pi; pio4A + pio4B + pio4C is pi/4 split into three terms of
// decreasing magnitude.
84 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
85 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
86 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
87 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
// Broadcast float and integer helper constants.
88 ffours = _mm512_set1_ps(4.0);
89 ftwos = _mm512_set1_ps(2.0);
90 fones = _mm512_set1_ps(1.0);
91 zeros = _mm512_setzero_epi32();
92 ones = _mm512_set1_epi32(1);
93 twos = _mm512_set1_epi32(2);
94 fours = _mm512_set1_epi32(4);
// Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400.
96 cp1 = _mm512_set1_ps(1.0);
97 cp2 = _mm512_set1_ps(0.08333333333333333);
98 cp3 = _mm512_set1_ps(0.002777777777777778);
99 cp4 = _mm512_set1_ps(4.96031746031746e-05);
100 cp5 = _mm512_set1_ps(5.511463844797178e-07);
101 __mmask16 condition1, condition2;
103 for (; number < sixteenPoints; number++) {
104 aVal = _mm512_load_ps(inPtr);
// s = |aVal|, obtained by clearing the sign bit of every lane.
106 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
// q = floor(s * 4/pi) as a 32-bit integer.
109 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
// r = q + (q & 1): odd q is bumped to the next even value, then converted to float.
111 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
// s -= r * pi/4, subtracted term by term with fused negated multiply-add.
113 s = _mm512_fnmadd_ps(r, pio4A, s);
114 s = _mm512_fnmadd_ps(r, pio4B, s);
115 s = _mm512_fnmadd_ps(r, pio4C, s);
// NOTE(review): the line that opens the next statement is not visible;
// presumably s is scaled by the constant 8.0f here -- confirm in the full file.
119 _mm512_set1_ps(8.0f));
// s = s * s
120 s = _mm512_mul_ps(s, s);
// Visible middle of a nested fused multiply-add/sub expression in s over
// cp5, cp4, cp3, cp2 (the enclosing lines are not visible).
125 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
// Apply s <- s * (4 - s) three times.
130 for (
i = 0;
i < 3;
i++) {
131 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
// s = s / 2
133 s = _mm512_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s); cosine = 1 - s
135 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
136 cosine = _mm512_sub_ps(fones, s);
// condition1: lanes where ((q + 1) & 2) != 0.
139 condition1 = _mm512_cmpneq_epi32_mask(
140 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
// condition2: lanes where ((q + 2) & 4) != 0.
143 condition2 = _mm512_cmpneq_epi32_mask(
144 _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
// Lanes selected by condition1 take the sine value; the others keep cosine.
145 cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
// Lanes selected by condition2 are multiplied by -1.
146 cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
147 _mm512_store_ps(cosPtr, cosine);
// Scalar loop for the elements left over after the 16-wide groups.
152 number = sixteenPoints * 16;
153 for (; number < num_points; number++) {
154 *cosPtr++ = cosf(*inPtr++);
159#if LV_HAVE_AVX2 && LV_HAVE_FMA
160#include <immintrin.h>
// Aligned AVX2 + FMA cosine kernel: for each float in aVector an approximation
// of its cosine is written to bVector. Full groups of eight floats go through
// the __m256 loop (aligned load/store intrinsics); the remaining elements are
// handled by a scalar loop.
//
// NOTE(review): this listing is missing lines (return type, braces, the ends
// of some declarations, the first lines of some multi-line statements, and any
// pointer-advance statements inside the vector loop). The comments below
// describe only the statements that are visible.
163volk_32f_cos_32f_a_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
165 float* bPtr = bVector;
166 const float* aPtr = aVector;
168 unsigned int number = 0;
// Number of complete 8-float groups.
169 unsigned int eighthPoints = num_points / 8;
172 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
175 __m256i q, ones, twos, fours;
// m4pi is 4/pi; pio4A + pio4B + pio4C is pi/4 split into three terms of
// decreasing magnitude.
177 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
178 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
179 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
180 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
// Broadcast float and integer helper constants; allones has every bit set.
181 ffours = _mm256_set1_ps(4.0);
182 ftwos = _mm256_set1_ps(2.0);
183 fones = _mm256_set1_ps(1.0);
184 fzeroes = _mm256_setzero_ps();
185 __m256i zeroes = _mm256_set1_epi32(0);
186 ones = _mm256_set1_epi32(1);
187 __m256i allones = _mm256_set1_epi32(0xffffffff);
188 twos = _mm256_set1_epi32(2);
189 fours = _mm256_set1_epi32(4);
// Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400.
191 cp1 = _mm256_set1_ps(1.0);
192 cp2 = _mm256_set1_ps(0.08333333333333333);
193 cp3 = _mm256_set1_ps(0.002777777777777778);
194 cp4 = _mm256_set1_ps(4.96031746031746e-05);
195 cp5 = _mm256_set1_ps(5.511463844797178e-07);
199 for (; number < eighthPoints; number++) {
201 aVal = _mm256_load_ps(aPtr);
// s = |aVal|: lanes with aVal < 0 have 2 * aVal subtracted, giving -aVal.
203 s = _mm256_sub_ps(aVal,
204 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
205 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
// q = floor(s * 4/pi) as a 32-bit integer.
207 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
// r = q + (q & 1): odd q is bumped to the next even value, then converted to float.
209 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
// s -= r * pi/4, subtracted term by term with fused negated multiply-add.
211 s = _mm256_fnmadd_ps(r, pio4A, s);
212 s = _mm256_fnmadd_ps(r, pio4B, s);
213 s = _mm256_fnmadd_ps(r, pio4C, s);
// NOTE(review): the line that opens the next statement is not visible;
// presumably s is scaled by the constant 8.0 here -- confirm in the full file.
217 _mm256_set1_ps(8.0));
// s = s * s
218 s = _mm256_mul_ps(s, s);
// Visible middle of a nested fused multiply-add/sub expression in s over
// cp5, cp4, cp3, cp2 (the enclosing lines are not visible).
223 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
// Apply s <- s * (4 - s) three times.
228 for (
i = 0;
i < 3;
i++) {
229 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
// s = s / 2
231 s = _mm256_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s); cosine = 1 - s
233 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
234 cosine = _mm256_sub_ps(fones, s);
// Compares ((q + 1) & 2) with zero. NOTE(review): the left-hand side of this
// statement is not visible; presumably it assigns condition1.int_vec, which
// the next line inverts so that it flags lanes where the value is non-zero.
238 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
239 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
// condition3: equality of ((q + 2) & 4) with zero, then inverted, so it
// flags lanes where ((q + 2) & 4) != 0.
242 condition3.int_vec = _mm256_cmpeq_epi32(
243 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
244 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
// cosine += (sine - cosine) masked by condition1: flagged lanes become sine.
246 cosine = _mm256_add_ps(
247 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
// cosine -= (2 * cosine) masked by condition3: flagged lanes are negated.
248 cosine = _mm256_sub_ps(cosine,
249 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
250 condition3.float_vec));
251 _mm256_store_ps(bPtr, cosine);
// Scalar loop for the elements left over after the 8-wide groups.
// NOTE(review): this calls the double-precision cos(), whereas the AVX-512 and
// SSE4.1 kernels in this file call cosf() in their tail loops.
256 number = eighthPoints * 8;
257 for (; number < num_points; number++) {
258 *bPtr++ = cos(*aPtr++);
265#include <immintrin.h>
// Aligned AVX2 cosine kernel (no FMA): for each float in aVector an
// approximation of its cosine is written to bVector. Full groups of eight
// floats go through the __m256 loop (aligned load/store intrinsics); the
// remaining elements are handled by a scalar loop.
//
// NOTE(review): this listing is missing lines (return type, braces, the ends
// of some declarations, the first lines of some multi-line statements, and any
// pointer-advance statements inside the vector loop). The comments below
// describe only the statements that are visible.
268volk_32f_cos_32f_a_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
270 float* bPtr = bVector;
271 const float* aPtr = aVector;
273 unsigned int number = 0;
// Number of complete 8-float groups.
274 unsigned int eighthPoints = num_points / 8;
277 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
280 __m256i q, ones, twos, fours;
// m4pi is 4/pi; pio4A + pio4B + pio4C is pi/4 split into three terms of
// decreasing magnitude.
282 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
283 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
284 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
285 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
// Broadcast float and integer helper constants; allones has every bit set.
286 ffours = _mm256_set1_ps(4.0);
287 ftwos = _mm256_set1_ps(2.0);
288 fones = _mm256_set1_ps(1.0);
289 fzeroes = _mm256_setzero_ps();
290 __m256i zeroes = _mm256_set1_epi32(0);
291 ones = _mm256_set1_epi32(1);
292 __m256i allones = _mm256_set1_epi32(0xffffffff);
293 twos = _mm256_set1_epi32(2);
294 fours = _mm256_set1_epi32(4);
// Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400.
296 cp1 = _mm256_set1_ps(1.0);
297 cp2 = _mm256_set1_ps(0.08333333333333333);
298 cp3 = _mm256_set1_ps(0.002777777777777778);
299 cp4 = _mm256_set1_ps(4.96031746031746e-05);
300 cp5 = _mm256_set1_ps(5.511463844797178e-07);
304 for (; number < eighthPoints; number++) {
306 aVal = _mm256_load_ps(aPtr);
// s = |aVal|: lanes with aVal < 0 have 2 * aVal subtracted, giving -aVal.
308 s = _mm256_sub_ps(aVal,
309 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
310 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
// q = floor(s * 4/pi) as a 32-bit integer.
312 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
// r = q + (q & 1): odd q is bumped to the next even value, then converted to float.
314 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
// s -= r * pi/4, subtracted term by term with separate multiply and subtract.
316 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
317 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
318 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
// NOTE(review): the line that opens the next statement is not visible;
// presumably s is scaled by the constant 8.0 here -- confirm in the full file.
322 _mm256_set1_ps(8.0));
// s = s * s
323 s = _mm256_mul_ps(s, s);
// Visible fragment of a nested multiply/add/subtract expression in s that
// starts from (s * cp5 - cp4); the enclosing lines are not visible.
331 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
// Apply s <- s * (4 - s) three times.
340 for (
i = 0;
i < 3;
i++) {
341 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
// s = s / 2
343 s = _mm256_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s); cosine = 1 - s
345 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
346 cosine = _mm256_sub_ps(fones, s);
// Compares ((q + 1) & 2) with zero. NOTE(review): the left-hand side of this
// statement is not visible; presumably it assigns condition1.int_vec, which
// the next line inverts so that it flags lanes where the value is non-zero.
350 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
351 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
// condition3: equality of ((q + 2) & 4) with zero, then inverted, so it
// flags lanes where ((q + 2) & 4) != 0.
354 condition3.int_vec = _mm256_cmpeq_epi32(
355 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
356 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
// cosine += (sine - cosine) masked by condition1: flagged lanes become sine.
358 cosine = _mm256_add_ps(
359 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
// cosine -= (2 * cosine) masked by condition3: flagged lanes are negated.
360 cosine = _mm256_sub_ps(cosine,
361 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
362 condition3.float_vec));
363 _mm256_store_ps(bPtr, cosine);
// Scalar loop for the elements left over after the 8-wide groups.
// NOTE(review): this calls the double-precision cos(), whereas the AVX-512 and
// SSE4.1 kernels in this file call cosf() in their tail loops.
368 number = eighthPoints * 8;
369 for (; number < num_points; number++) {
370 *bPtr++ = cos(*aPtr++);
377#include <smmintrin.h>
// Aligned SSE4.1 cosine kernel: for each float in aVector an approximation of
// its cosine is written to bVector. Full groups of four floats go through the
// __m128 loop (aligned load/store intrinsics); the remaining elements go
// through a scalar cosf() loop.
//
// NOTE(review): this listing is missing lines (return type, braces, the ends
// of some declarations, the first lines of several multi-line statements, and
// any pointer-advance statements inside the vector loop). The comments below
// describe only the statements that are visible.
380volk_32f_cos_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
382 float* bPtr = bVector;
383 const float* aPtr = aVector;
385 unsigned int number = 0;
// Number of complete 4-float groups.
386 unsigned int quarterPoints = num_points / 4;
389 __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
392 __m128i q, ones, twos, fours;
// m4pi is 4/pi; pio4A + pio4B + pio4C is pi/4 split into three terms of
// decreasing magnitude.
394 m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
395 pio4A = _mm_set1_ps(0.7853981554508209228515625);
396 pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
397 pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
// Broadcast float and integer helper constants; allones has every bit set.
398 ffours = _mm_set1_ps(4.0);
399 ftwos = _mm_set1_ps(2.0);
400 fones = _mm_set1_ps(1.0);
401 fzeroes = _mm_setzero_ps();
402 __m128i zeroes = _mm_set1_epi32(0);
403 ones = _mm_set1_epi32(1);
404 __m128i allones = _mm_set1_epi32(0xffffffff);
405 twos = _mm_set1_epi32(2);
406 fours = _mm_set1_epi32(4);
// Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400.
408 cp1 = _mm_set1_ps(1.0);
409 cp2 = _mm_set1_ps(0.08333333333333333);
410 cp3 = _mm_set1_ps(0.002777777777777778);
411 cp4 = _mm_set1_ps(4.96031746031746e-05);
412 cp5 = _mm_set1_ps(5.511463844797178e-07);
416 for (; number < quarterPoints; number++) {
418 aVal = _mm_load_ps(aPtr);
// Masks 2 * aVal to the lanes where aVal < 0. NOTE(review): the first line
// of this statement is not visible; presumably the masked value is subtracted
// from aVal to form s = |aVal| -- confirm in the full file.
421 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
// q = floor(s * 4/pi) as a 32-bit integer.
423 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
// r = q + (q & 1): odd q is bumped to the next even value, then converted to float.
425 r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones)));
// s -= r * pi/4, subtracted term by term.
427 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
428 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
429 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));
// NOTE(review): the line that opens the next statement is not visible;
// presumably s is scaled by the constant 8.0 here -- confirm in the full file.
432 s, _mm_set1_ps(8.0));
// s = s * s
433 s = _mm_mul_ps(s, s);
// Visible fragment of a nested multiply/add/subtract expression in s that
// starts from ((s * cp5 - cp4) * s + ...); the enclosing lines are not visible.
440 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
// Apply s <- s * (4 - s) three times.
448 for (
i = 0;
i < 3;
i++) {
449 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
// s = s / 2
451 s = _mm_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s); cosine = 1 - s
453 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
454 cosine = _mm_sub_ps(fones, s);
// Compares ((q + 1) & 2) with zero. NOTE(review): the left-hand side is not
// visible; presumably it assigns condition1.int_vec, which the next line inverts.
458 _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
459 condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);
// Compares ((q + 2) & 4) with zero. NOTE(review): the left-hand side is not
// visible; presumably it assigns condition3.int_vec, which the next line inverts.
463 _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
464 condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);
// cosine += (sine - cosine) masked by condition1: flagged lanes become sine.
466 cosine = _mm_add_ps(cosine,
467 _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
// Masks 2 * cosine by condition3. NOTE(review): the first line of this
// statement is not visible; presumably the masked value is subtracted from
// cosine, negating the flagged lanes.
470 _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec));
471 _mm_store_ps(bPtr, cosine);
// Scalar cosf() loop for the elements left over after the 4-wide groups.
476 number = quarterPoints * 4;
477 for (; number < num_points; number++) {
478 *bPtr++ = cosf(*aPtr++);
487#ifndef INCLUDED_volk_32f_cos_32f_u_H
488#define INCLUDED_volk_32f_cos_32f_u_H
490#ifdef LV_HAVE_AVX512F
492#include <immintrin.h>
// Unaligned AVX-512F cosine kernel: for each input float in inVector an
// approximation of its cosine is written to cosVector. Full groups of sixteen
// floats go through the __m512 loop using the unaligned load/store intrinsics
// (_mm512_loadu_ps / _mm512_storeu_ps); the remaining elements go through a
// scalar cosf() loop.
//
// NOTE(review): this listing is missing lines (braces, the ends of some
// declarations, the first lines of some multi-line statements, and any
// pointer-advance statements inside the vector loop). The comments below
// describe only the statements that are visible.
493static inline void volk_32f_cos_32f_u_avx512f(
float* cosVector,
494 const float* inVector,
495 unsigned int num_points)
497 float* cosPtr = cosVector;
498 const float* inPtr = inVector;
500 unsigned int number = 0;
// Number of complete 16-float groups.
501 unsigned int sixteenPoints = num_points / 16;
504 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
506 __m512i q, zeros, ones, twos, fours;
// m4pi is 4/pi; pio4A + pio4B + pio4C is pi/4 split into three terms of
// decreasing magnitude.
508 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
509 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
510 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
511 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
// Broadcast float and integer helper constants.
512 ffours = _mm512_set1_ps(4.0);
513 ftwos = _mm512_set1_ps(2.0);
514 fones = _mm512_set1_ps(1.0);
515 zeros = _mm512_setzero_epi32();
516 ones = _mm512_set1_epi32(1);
517 twos = _mm512_set1_epi32(2);
518 fours = _mm512_set1_epi32(4);
// Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400.
520 cp1 = _mm512_set1_ps(1.0);
521 cp2 = _mm512_set1_ps(0.08333333333333333);
522 cp3 = _mm512_set1_ps(0.002777777777777778);
523 cp4 = _mm512_set1_ps(4.96031746031746e-05);
524 cp5 = _mm512_set1_ps(5.511463844797178e-07);
525 __mmask16 condition1, condition2;
526 for (; number < sixteenPoints; number++) {
527 aVal = _mm512_loadu_ps(inPtr);
// s = |aVal|, obtained by clearing the sign bit of every lane.
529 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
// q = floor(s * 4/pi) as a 32-bit integer.
532 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
// r = q + (q & 1): odd q is bumped to the next even value, then converted to float.
534 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
// s -= r * pi/4, subtracted term by term with fused negated multiply-add.
536 s = _mm512_fnmadd_ps(r, pio4A, s);
537 s = _mm512_fnmadd_ps(r, pio4B, s);
538 s = _mm512_fnmadd_ps(r, pio4C, s);
// NOTE(review): the line that opens the next statement is not visible;
// presumably s is scaled by the constant 8.0f here -- confirm in the full file.
542 _mm512_set1_ps(8.0f));
// s = s * s
543 s = _mm512_mul_ps(s, s);
// Visible middle of a nested fused multiply-add/sub expression in s over
// cp5, cp4, cp3, cp2 (the enclosing lines are not visible).
548 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
// Apply s <- s * (4 - s) three times.
553 for (
i = 0;
i < 3;
i++) {
554 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
// s = s / 2
556 s = _mm512_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s); cosine = 1 - s
558 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
559 cosine = _mm512_sub_ps(fones, s);
// condition1: lanes where ((q + 1) & 2) != 0.
562 condition1 = _mm512_cmpneq_epi32_mask(
563 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
// condition2: lanes where ((q + 2) & 4) != 0.
566 condition2 = _mm512_cmpneq_epi32_mask(
567 _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
// Lanes selected by condition1 take the sine value; the others keep cosine.
569 cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
// Lanes selected by condition2 are multiplied by -1.
570 cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
571 _mm512_storeu_ps(cosPtr, cosine);
// Scalar loop for the elements left over after the 16-wide groups.
576 number = sixteenPoints * 16;
577 for (; number < num_points; number++) {
578 *cosPtr++ = cosf(*inPtr++);
583#if LV_HAVE_AVX2 && LV_HAVE_FMA
584#include <immintrin.h>
// Unaligned AVX2 + FMA cosine kernel: for each float in aVector an
// approximation of its cosine is written to bVector. Full groups of eight
// floats go through the __m256 loop using the unaligned load/store intrinsics
// (_mm256_loadu_ps / _mm256_storeu_ps); the remaining elements are handled by
// a scalar loop.
//
// NOTE(review): this listing is missing lines (return type, braces, the ends
// of some declarations, the first lines of some multi-line statements, and any
// pointer-advance statements inside the vector loop). The comments below
// describe only the statements that are visible.
587volk_32f_cos_32f_u_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
589 float* bPtr = bVector;
590 const float* aPtr = aVector;
592 unsigned int number = 0;
// Number of complete 8-float groups.
593 unsigned int eighthPoints = num_points / 8;
596 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
599 __m256i q, ones, twos, fours;
// m4pi is 4/pi; pio4A + pio4B + pio4C is pi/4 split into three terms of
// decreasing magnitude.
601 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
602 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
603 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
604 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
// Broadcast float and integer helper constants; allones has every bit set.
605 ffours = _mm256_set1_ps(4.0);
606 ftwos = _mm256_set1_ps(2.0);
607 fones = _mm256_set1_ps(1.0);
608 fzeroes = _mm256_setzero_ps();
609 __m256i zeroes = _mm256_set1_epi32(0);
610 ones = _mm256_set1_epi32(1);
611 __m256i allones = _mm256_set1_epi32(0xffffffff);
612 twos = _mm256_set1_epi32(2);
613 fours = _mm256_set1_epi32(4);
// Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400.
615 cp1 = _mm256_set1_ps(1.0);
616 cp2 = _mm256_set1_ps(0.08333333333333333);
617 cp3 = _mm256_set1_ps(0.002777777777777778);
618 cp4 = _mm256_set1_ps(4.96031746031746e-05);
619 cp5 = _mm256_set1_ps(5.511463844797178e-07);
623 for (; number < eighthPoints; number++) {
625 aVal = _mm256_loadu_ps(aPtr);
// s = |aVal|: lanes with aVal < 0 have 2 * aVal subtracted, giving -aVal.
627 s = _mm256_sub_ps(aVal,
628 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
629 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
// q = floor(s * 4/pi) as a 32-bit integer.
631 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
// r = q + (q & 1): odd q is bumped to the next even value, then converted to float.
633 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
// s -= r * pi/4, subtracted term by term with fused negated multiply-add.
635 s = _mm256_fnmadd_ps(r, pio4A, s);
636 s = _mm256_fnmadd_ps(r, pio4B, s);
637 s = _mm256_fnmadd_ps(r, pio4C, s);
// NOTE(review): the line that opens the next statement is not visible;
// presumably s is scaled by the constant 8.0 here -- confirm in the full file.
641 _mm256_set1_ps(8.0));
// s = s * s
642 s = _mm256_mul_ps(s, s);
// Visible middle of a nested fused multiply-add/sub expression in s over
// cp5, cp4, cp3, cp2 (the enclosing lines are not visible).
647 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
// Apply s <- s * (4 - s) three times.
652 for (
i = 0;
i < 3;
i++) {
653 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
// s = s / 2
655 s = _mm256_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s); cosine = 1 - s
657 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
658 cosine = _mm256_sub_ps(fones, s);
// Compares ((q + 1) & 2) with zero. NOTE(review): the left-hand side of this
// statement is not visible; presumably it assigns condition1.int_vec, which
// the next line inverts so that it flags lanes where the value is non-zero.
662 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
663 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
// condition3: equality of ((q + 2) & 4) with zero, then inverted, so it
// flags lanes where ((q + 2) & 4) != 0.
666 condition3.int_vec = _mm256_cmpeq_epi32(
667 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
668 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
// cosine += (sine - cosine) masked by condition1: flagged lanes become sine.
670 cosine = _mm256_add_ps(
671 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
// cosine -= (2 * cosine) masked by condition3: flagged lanes are negated.
672 cosine = _mm256_sub_ps(cosine,
673 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
674 condition3.float_vec));
675 _mm256_storeu_ps(bPtr, cosine);
// Scalar loop for the elements left over after the 8-wide groups.
// NOTE(review): this calls the double-precision cos(), whereas the AVX-512 and
// SSE4.1 kernels in this file call cosf() in their tail loops.
680 number = eighthPoints * 8;
681 for (; number < num_points; number++) {
682 *bPtr++ = cos(*aPtr++);
689#include <immintrin.h>
// Unaligned AVX2 cosine kernel (no FMA): for each float in aVector an
// approximation of its cosine is written to bVector. Full groups of eight
// floats go through the __m256 loop using the unaligned load/store intrinsics
// (_mm256_loadu_ps / _mm256_storeu_ps); the remaining elements are handled by
// a scalar loop.
//
// NOTE(review): this listing is missing lines (return type, braces, the ends
// of some declarations, the first lines of some multi-line statements, and any
// pointer-advance statements inside the vector loop). The comments below
// describe only the statements that are visible.
692volk_32f_cos_32f_u_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
694 float* bPtr = bVector;
695 const float* aPtr = aVector;
697 unsigned int number = 0;
// Number of complete 8-float groups.
698 unsigned int eighthPoints = num_points / 8;
701 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
704 __m256i q, ones, twos, fours;
// m4pi is 4/pi; pio4A + pio4B + pio4C is pi/4 split into three terms of
// decreasing magnitude.
706 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
707 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
708 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
709 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
// Broadcast float and integer helper constants; allones has every bit set.
710 ffours = _mm256_set1_ps(4.0);
711 ftwos = _mm256_set1_ps(2.0);
712 fones = _mm256_set1_ps(1.0);
713 fzeroes = _mm256_setzero_ps();
714 __m256i zeroes = _mm256_set1_epi32(0);
715 ones = _mm256_set1_epi32(1);
716 __m256i allones = _mm256_set1_epi32(0xffffffff);
717 twos = _mm256_set1_epi32(2);
718 fours = _mm256_set1_epi32(4);
// Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400.
720 cp1 = _mm256_set1_ps(1.0);
721 cp2 = _mm256_set1_ps(0.08333333333333333);
722 cp3 = _mm256_set1_ps(0.002777777777777778);
723 cp4 = _mm256_set1_ps(4.96031746031746e-05);
724 cp5 = _mm256_set1_ps(5.511463844797178e-07);
728 for (; number < eighthPoints; number++) {
730 aVal = _mm256_loadu_ps(aPtr);
// s = |aVal|: lanes with aVal < 0 have 2 * aVal subtracted, giving -aVal.
732 s = _mm256_sub_ps(aVal,
733 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
734 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
// q = floor(s * 4/pi) as a 32-bit integer.
736 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
// r = q + (q & 1): odd q is bumped to the next even value, then converted to float.
738 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
// s -= r * pi/4, subtracted term by term with separate multiply and subtract.
740 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
741 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
742 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
// NOTE(review): the line that opens the next statement is not visible;
// presumably s is scaled by the constant 8.0 here -- confirm in the full file.
746 _mm256_set1_ps(8.0));
// s = s * s
747 s = _mm256_mul_ps(s, s);
// Visible fragment of a nested multiply/add/subtract expression in s that
// starts from (s * cp5 - cp4); the enclosing lines are not visible.
755 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
// Apply s <- s * (4 - s) three times.
764 for (
i = 0;
i < 3;
i++) {
765 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
// s = s / 2
767 s = _mm256_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s); cosine = 1 - s
769 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
770 cosine = _mm256_sub_ps(fones, s);
// Compares ((q + 1) & 2) with zero. NOTE(review): the left-hand side of this
// statement is not visible; presumably it assigns condition1.int_vec, which
// the next line inverts so that it flags lanes where the value is non-zero.
774 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
775 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
// condition3: equality of ((q + 2) & 4) with zero, then inverted, so it
// flags lanes where ((q + 2) & 4) != 0.
778 condition3.int_vec = _mm256_cmpeq_epi32(
779 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
780 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
// cosine += (sine - cosine) masked by condition1: flagged lanes become sine.
782 cosine = _mm256_add_ps(
783 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
// cosine -= (2 * cosine) masked by condition3: flagged lanes are negated.
784 cosine = _mm256_sub_ps(cosine,
785 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
786 condition3.float_vec));
787 _mm256_storeu_ps(bPtr, cosine);
// Scalar loop for the elements left over after the 8-wide groups.
// NOTE(review): this calls the double-precision cos(), whereas the AVX-512 and
// SSE4.1 kernels in this file call cosf() in their tail loops.
792 number = eighthPoints * 8;
793 for (; number < num_points; number++) {
794 *bPtr++ = cos(*aPtr++);
801#include <smmintrin.h>
// Unaligned SSE4.1 cosine kernel: for each float in aVector an approximation
// of its cosine is written to bVector. Full groups of four floats go through
// the __m128 loop using the unaligned load/store intrinsics (_mm_loadu_ps /
// _mm_storeu_ps); the remaining elements go through a scalar cosf() loop.
//
// NOTE(review): this listing is missing lines (return type, braces, the ends
// of some declarations, the first lines of several multi-line statements, and
// any pointer-advance statements inside the vector loop). The comments below
// describe only the statements that are visible.
804volk_32f_cos_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
806 float* bPtr = bVector;
807 const float* aPtr = aVector;
809 unsigned int number = 0;
// Number of complete 4-float groups.
810 unsigned int quarterPoints = num_points / 4;
813 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
815 __m128 sine, cosine, condition1, condition3;
// Unlike the other x86 kernels in this file, r is kept as an integer vector here.
816 __m128i q, r, ones, twos, fours;
// m4pi is 4/pi; pio4A + pio4B approximates pi/4 using two terms.
// NOTE(review): these constants (and cp2..cp5 below) carry fewer digits than
// the ones in volk_32f_cos_32f_a_sse4_1, which also uses a third pio4C term;
// the two SSE4.1 kernels may therefore differ in accuracy -- confirm intent.
818 m4pi = _mm_set1_ps(1.273239545);
819 pio4A = _mm_set1_ps(0.78515625);
820 pio4B = _mm_set1_ps(0.241876e-3);
// Broadcast float and integer helper constants.
821 ffours = _mm_set1_ps(4.0);
822 ftwos = _mm_set1_ps(2.0);
823 fones = _mm_set1_ps(1.0);
824 fzeroes = _mm_setzero_ps();
825 ones = _mm_set1_epi32(1);
826 twos = _mm_set1_epi32(2);
827 fours = _mm_set1_epi32(4);
// Polynomial coefficients, approximately 1, 1/12, 1/360, 1/20160, 1/1814400.
829 cp1 = _mm_set1_ps(1.0);
830 cp2 = _mm_set1_ps(0.83333333e-1);
831 cp3 = _mm_set1_ps(0.2777778e-2);
832 cp4 = _mm_set1_ps(0.49603e-4);
833 cp5 = _mm_set1_ps(0.551e-6);
835 for (; number < quarterPoints; number++) {
836 aVal = _mm_loadu_ps(aPtr);
// Masks 2 * aVal to the lanes where aVal < 0. NOTE(review): the first line
// of this statement is not visible; presumably the masked value is subtracted
// from aVal to form s = |aVal| -- confirm in the full file.
838 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
// q = floor(s * 4/pi) as a 32-bit integer; r = q + (q & 1) (odd q bumped to even).
839 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
840 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
// s -= r * pi/4, subtracted in two steps; r is converted to float each time.
842 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
843 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
// NOTE(review): the line that opens the next statement is not visible;
// presumably s is scaled by the constant 8.0 here -- confirm in the full file.
846 s, _mm_set1_ps(8.0));
// s = s * s
847 s = _mm_mul_ps(s, s);
// Visible fragment of a nested multiply/add/subtract expression in s that
// starts from ((s * cp5 - cp4) * s + ...); the enclosing lines are not visible.
854 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
// Apply s <- s * (4 - s) three times.
862 for (
i = 0;
i < 3;
i++) {
863 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
// s = s / 2
865 s = _mm_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s); cosine = 1 - s
867 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
868 cosine = _mm_sub_ps(fones, s);
// condition1: lanes where ((q + 1) & 2), converted to float, is not equal to 0.
870 condition1 = _mm_cmpneq_ps(
871 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
// condition3: lanes where ((q + 2) & 4), converted to float, is not equal to 0.
873 condition3 = _mm_cmpneq_ps(
874 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
// cosine += (sine - cosine) masked by condition1: flagged lanes become sine.
876 cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
// Masks 2 * cosine by condition3. NOTE(review): the first line of this
// statement is not visible; presumably the masked value is subtracted from
// cosine, negating the flagged lanes.
878 cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
879 _mm_storeu_ps(bPtr, cosine);
// Scalar cosf() loop for the elements left over after the 4-wide groups.
884 number = quarterPoints * 4;
885 for (; number < num_points; number++) {
886 *bPtr++ = cosf(*aPtr++);
893#ifdef LV_HAVE_GENERIC
// Scalar polynomial cosine kernel. NOTE(review): the first line of the
// signature is not visible in this listing; the cross-reference entries at the
// end of the listing place volk_32f_cos_32f_generic_fast(float* bVector,
// const float* aVector, unsigned int num_points) at header line 900, which
// matches this position -- confirm against the full file. Several other lines
// (braces, some declarations, parts of statements) are also missing, so the
// comments below describe only the statements that are visible.
901 const float* aVector,
902 unsigned int num_points)
904 float* bPtr = bVector;
905 const float* aPtr = aVector;
// m4pi is 4/pi; pio4A + pio4B + pio4C is pi/4 split into three terms of
// decreasing magnitude.
907 float m4pi = 1.273239544735162542821171882678754627704620361328125;
908 float pio4A = 0.7853981554508209228515625;
909 float pio4B = 0.794662735614792836713604629039764404296875e-8;
910 float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
914 for (number = 0; number < num_points; number++) {
// s = |input|. NOTE(review): fabs() is the double-precision variant; the
// result is narrowed back to float on assignment.
915 float s = fabs(*aPtr);
// q = s * 4/pi truncated to int (s is non-negative here).
916 int q = (int)(s * m4pi);
// First line of a polynomial in s with coefficients 1/1814400, 1/20160,
// 1/360 and 1/12 (alternating signs); the continuation line is not visible.
924 s = ((((s / 1814400. - 1.0 / 20160.0) * s + 1.0 / 360.0) * s - 1.0 / 12.0) * s +
// Loop running N times; N's definition and the loop body are not visible here.
929 for (
i = 0;
i < N; ++
i) {
// sine = sqrt((2 - s) * s), cosine = 1 - s.
// NOTE(review): sqrt() with double constants evaluates in double precision.
934 float sine = sqrt((2.0 - s) * s);
935 float cosine = 1 - s;
// Branches on ((q + 1) & 2) and ((q + 2) & 4) being non-zero; the branch
// bodies are not visible in this listing.
937 if (((q + 1) & 2) != 0) {
942 if (((q + 2) & 4) != 0) {
954#ifdef LV_HAVE_GENERIC
// Scalar reference kernel: writes cosf() of each of the num_points input
// floats to the output buffer, one element per iteration.
// NOTE(review): the signature line is not visible in this listing; the
// cross-reference entries at the end of the listing place
// volk_32f_cos_32f_generic(float* bVector, const float* aVector,
// unsigned int num_points) at header line 957 -- confirm against the full file.
959 float* bPtr = bVector;
960 const float* aPtr = aVector;
961 unsigned int number = 0;
// Both pointers advance by one element on every iteration.
963 for (; number < num_points; number++) {
964 *bPtr++ = cosf(*aPtr++);
// NEON kernel body: processes the input in groups of four floats with
// vld1q_f32 / vst1q_f32, then finishes the remaining elements with cosf().
// NOTE(review): the signature line, the declarations of a_vec / b_vec, the
// statement that computes b_vec from a_vec, and any pointer-advance statements
// are not visible in this listing. The cross-reference entries at the end of
// the listing place volk_32f_cos_32f_neon(float* bVector, const float* aVector,
// unsigned int num_points) at header line 976 -- confirm against the full file.
978 unsigned int number = 0;
// Number of complete 4-float groups.
979 unsigned int quarter_points = num_points / 4;
980 float* bVectorPtr = bVector;
981 const float* aVectorPtr = aVector;
986 for (number = 0; number < quarter_points; number++) {
// Load four input floats.
987 a_vec = vld1q_f32(aVectorPtr);
// Store four result floats (b_vec is produced on a line not shown here).
991 vst1q_f32(bVectorPtr, b_vec);
// Scalar cosf() loop for the elements left over after the 4-wide groups.
998 for (number = quarter_points * 4; number < num_points; number++) {
999 *bVectorPtr++ = cosf(*aVectorPtr++);
1006#include <riscv_vector.h>
// RISC-V Vector cosine kernel: for each float in aVector an approximation of
// its cosine is written to bVector, using e32/m2 vector operations. The loop
// asks for a vector length vl for the n remaining elements on each pass and
// advances both pointers by vl, so no separate scalar tail loop appears here.
//
// NOTE(review): this listing is missing lines (return type, braces, the
// declaration of t, and the first line of the sine statement). The comments
// below describe only the statements that are visible.
1009volk_32f_cos_32f_rvv(
float* bVector,
const float* aVector,
unsigned int num_points)
// Maximum vector length for e32/m2, used to splat the constants below.
1011 size_t vlmax = __riscv_vsetvlmax_e32m2();
// c4oPi is 4/pi; cPio4a + cPio4b + cPio4c is pi/4 split into three terms of
// decreasing magnitude.
1013 const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax);
1014 const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax);
1015 const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax);
1016 const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax);
1018 const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
1019 const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);
// Polynomial coefficients: 1/12, 1/360, 1/20160, 1/1814400.
1021 const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax);
1022 const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax);
1023 const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05f, vlmax);
1024 const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07f, vlmax);
1026 size_t n = num_points;
1027 for (
size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
1028 vl = __riscv_vsetvl_e32m2(n);
1029 vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
// s = |v|
1030 vfloat32m2_t s = __riscv_vfabs(v, vl);
// q = (s * 4/pi) converted to integer; r = q + (q & 1) converted back to float.
1031 vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl);
1032 vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl);
// s -= r * pi/4, subtracted term by term.
1034 s = __riscv_vfnmsac(s, cPio4a, r, vl);
1035 s = __riscv_vfnmsac(s, cPio4b, r, vl);
1036 s = __riscv_vfnmsac(s, cPio4c, r, vl);
// s = (s / 8)^2
1038 s = __riscv_vfmul(s, 1 / 8.0f, vl);
1039 s = __riscv_vfmul(s, s, vl);
// Polynomial evaluation in t with coefficients c5, c4, c3, c2, 1, followed
// by a final multiply by t. NOTE(review): the declaration of t is not visible
// in this listing; presumably it holds the squared value computed just above.
1041 s = __riscv_vfmsub(s, c5, c4, vl);
1042 s = __riscv_vfmadd(s, t, c3, vl);
1043 s = __riscv_vfmsub(s, t, c2, vl);
1044 s = __riscv_vfmadd(s, t, cf1, vl);
1045 s = __riscv_vfmul(s, t, vl);
// Apply s <- s * (4 - s) three times, then halve.
1046 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
1047 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
1048 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
1049 s = __riscv_vfmul(s, 1 / 2.0f, vl);
// sqrt((2 - s) * s). NOTE(review): the left-hand side of this statement is
// not visible; presumably it declares the `sine` vector used below.
1052 __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl);
// cosine = 1 - s
1053 vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl);
// m1: lanes where ((q + 1) & 2) != 0; m2: lanes where ((q + 2) & 4) != 0.
1055 vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl);
1056 vbool16_t m2 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 2, vl), 4, vl), 0, vl);
// Lanes in m1 take sine; lanes in m2 are then negated, the others are left as is.
1058 cosine = __riscv_vmerge(cosine, sine, m1, vl);
1059 cosine = __riscv_vfneg_mu(m2, cosine, cosine, vl);
1061 __riscv_vse32(bVector, cosine, vl);
Definition volk_common.h:116
Definition volk_common.h:133
static void volk_32f_cos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_cos_32f.h:957
static void volk_32f_cos_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_cos_32f.h:976
static void volk_32f_cos_32f_generic_fast(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_cos_32f.h:900
#define __VOLK_PREFETCH(addr)
Definition volk_common.h:68
for i
Definition volk_config_fixed.tmpl.h:13
static float32x4_t _vcosq_f32(float32x4_t x)
Definition volk_neon_intrinsics.h:255