Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
volk_32f_sin_32f.h
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
58
59#include <inttypes.h>
60#include <math.h>
61#include <stdio.h>
62
63#ifndef INCLUDED_volk_32f_sin_32f_a_H
64#define INCLUDED_volk_32f_sin_32f_a_H
65#ifdef LV_HAVE_AVX512F
66
67#include <immintrin.h>
68static inline void volk_32f_sin_32f_a_avx512f(float* sinVector,
69 const float* inVector,
70 unsigned int num_points)
71{
72 float* sinPtr = sinVector;
73 const float* inPtr = inVector;
74
75 unsigned int number = 0;
76 unsigned int sixteenPoints = num_points / 16;
77 unsigned int i = 0;
78
79 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
80 fones;
81 __m512 sine, cosine;
82 __m512i q, zeros, ones, twos, fours;
83
84 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
85 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
86 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
87 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
88 ffours = _mm512_set1_ps(4.0);
89 ftwos = _mm512_set1_ps(2.0);
90 fones = _mm512_set1_ps(1.0);
91 zeros = _mm512_setzero_epi32();
92 ones = _mm512_set1_epi32(1);
93 twos = _mm512_set1_epi32(2);
94 fours = _mm512_set1_epi32(4);
95
96 cp1 = _mm512_set1_ps(1.0);
97 cp2 = _mm512_set1_ps(0.08333333333333333);
98 cp3 = _mm512_set1_ps(0.002777777777777778);
99 cp4 = _mm512_set1_ps(4.96031746031746e-05);
100 cp5 = _mm512_set1_ps(5.511463844797178e-07);
101 __mmask16 condition1, condition2, ltZero;
102
103 for (; number < sixteenPoints; number++) {
104 aVal = _mm512_load_ps(inPtr);
105 // s = fabs(aVal)
106 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
107
108        // q = (int)floor(s * (4/pi)): quadrant index of |aVal| in units of pi/4
109 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
110        // r = q + q&1: round q up to an even count; r*(pi/4) is subtracted below
111 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
112
113 s = _mm512_fnmadd_ps(r, pio4A, s);
114 s = _mm512_fnmadd_ps(r, pio4B, s);
115 s = _mm512_fnmadd_ps(r, pio4C, s);
116
117 s = _mm512_div_ps(
118 s,
119 _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
120 s = _mm512_mul_ps(s, s);
121 // Evaluate Taylor series
122 s = _mm512_mul_ps(
123 _mm512_fmadd_ps(
124 _mm512_fmsub_ps(
125 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
126 s,
127 cp1),
128 s);
129
130 for (i = 0; i < 3; i++) {
131 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
132 }
133 s = _mm512_div_ps(s, ftwos);
134
135 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
136 cosine = _mm512_sub_ps(fones, s);
137
138 condition1 = _mm512_cmpneq_epi32_mask(
139 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
140 ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
141 condition2 = _mm512_kxor(
142 _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
143
144 sine = _mm512_mask_blend_ps(condition1, sine, cosine);
145 sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
146 _mm512_store_ps(sinPtr, sine);
147 inPtr += 16;
148 sinPtr += 16;
149 }
150
151 number = sixteenPoints * 16;
152 for (; number < num_points; number++) {
153 *sinPtr++ = sinf(*inPtr++);
154 }
155}
156#endif
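
/*
 * Editor's note: every SIMD kernel in this file follows the same scheme:
 * take |x|, find its quadrant index q in units of pi/4, subtract an even
 * multiple of pi/4 (split over two or three constants for accuracy), shrink
 * the residual by 2^3, evaluate a short series for 2*(1 - cos(y)), apply the
 * double-angle identity three times to undo the shrink, and finally choose
 * sine vs. cosine and the sign from q and the sign of the input. The plain-C
 * sketch below mirrors the AVX-512 kernel above, for reference only; the
 * function name is hypothetical and it is not part of the VOLK API.
 */
static inline float volk_32f_sin_32f_scalar_sketch(float x)
{
    const float m4pi = 1.2732395447351627f;  /* 4/pi */
    const float pio4A = 0.7853981554508209f; /* pi/4, split into three parts */
    const float pio4B = 0.794662735614793e-8f;
    const float pio4C = 0.306161699786838e-16f;

    float s = fabsf(x);
    int q = (int)floorf(s * m4pi);  /* quadrant index: |x| in units of pi/4 */
    float r = (float)(q + (q & 1)); /* round q up to an even multiple */

    s -= r * pio4A; /* residual angle, now in [-pi/4, pi/4] */
    s -= r * pio4B;
    s -= r * pio4C;

    s /= 8.0f; /* 2^3: the angle is doubled back three times below */
    s = s * s;
    /* series for 2*(1 - cos(y)) in powers of y^2, with y = residual / 8 */
    s = s * (1.0f -
             s * (1.0f / 12 - s * (1.0f / 360 - s * (1.0f / 20160 - s / 1814400.0f))));
    for (int i = 0; i < 3; i++) {
        s = s * (4.0f - s); /* 2*(1-cos(2t)) = [2*(1-cos t)] * [4 - 2*(1-cos t)] */
    }
    s /= 2.0f; /* s is now 1 - cos(reduced angle) */

    float sine = sqrtf((2.0f - s) * s); /* |sin|, since sin^2 = (1-cos)(1+cos) */
    float cosine = 1.0f - s;

    if ((q + 1) & 2)
        sine = cosine; /* quadrants where the cosine branch applies */
    if (((q & 4) != 0) ^ (x < 0.0f))
        sine = -sine; /* sign from the half-period parity and the input sign */
    return sine;
}
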
157#if LV_HAVE_AVX2 && LV_HAVE_FMA
158#include <immintrin.h>
159
160static inline void
161volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
162{
163 float* bPtr = bVector;
164 const float* aPtr = aVector;
165
166 unsigned int number = 0;
167 unsigned int eighthPoints = num_points / 8;
168 unsigned int i = 0;
169
170 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
171 fzeroes;
172 __m256 sine, cosine, condition1, condition2;
173 __m256i q, r, ones, twos, fours;
174
175 m4pi = _mm256_set1_ps(1.273239545);
176 pio4A = _mm256_set1_ps(0.78515625);
177 pio4B = _mm256_set1_ps(0.241876e-3);
178 ffours = _mm256_set1_ps(4.0);
179 ftwos = _mm256_set1_ps(2.0);
180 fones = _mm256_set1_ps(1.0);
181 fzeroes = _mm256_setzero_ps();
182 ones = _mm256_set1_epi32(1);
183 twos = _mm256_set1_epi32(2);
184 fours = _mm256_set1_epi32(4);
185
186 cp1 = _mm256_set1_ps(1.0);
187 cp2 = _mm256_set1_ps(0.83333333e-1);
188 cp3 = _mm256_set1_ps(0.2777778e-2);
189 cp4 = _mm256_set1_ps(0.49603e-4);
190 cp5 = _mm256_set1_ps(0.551e-6);
191
192 for (; number < eighthPoints; number++) {
193 aVal = _mm256_load_ps(aPtr);
194 s = _mm256_sub_ps(aVal,
195 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
196 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
197 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
198 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
199
200 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
201 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
202
203 s = _mm256_div_ps(
204 s,
205 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
206 s = _mm256_mul_ps(s, s);
207 // Evaluate Taylor series
208 s = _mm256_mul_ps(
209 _mm256_fmadd_ps(
210 _mm256_fmsub_ps(
211 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
212 s,
213 cp1),
214 s);
215
216 for (i = 0; i < 3; i++) {
217 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
218 }
219 s = _mm256_div_ps(s, ftwos);
220
221 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
222 cosine = _mm256_sub_ps(fones, s);
223
224 condition1 = _mm256_cmp_ps(
225 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
226 fzeroes,
227 _CMP_NEQ_UQ);
228 condition2 = _mm256_cmp_ps(
229 _mm256_cmp_ps(
230 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
231 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
232 _CMP_NEQ_UQ);
233 // Need this condition only for cos
234 // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
235 // twos), fours)), fzeroes);
236
237 sine =
238 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
239 sine = _mm256_sub_ps(
240 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
241 _mm256_store_ps(bPtr, sine);
242 aPtr += 8;
243 bPtr += 8;
244 }
245
246 number = eighthPoints * 8;
247 for (; number < num_points; number++) {
248        *bPtr++ = sinf(*aPtr++);
249 }
250}
251
252#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
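
/*
 * Editor's note: the AVX2 and SSE kernels in this file have no mask
 * registers, so the quadrant fix-up is done with the all-ones/all-zeros bit
 * masks returned by the compare intrinsics:
 *     sine + ((cosine - sine) & mask)   picks cosine where the mask is set,
 *     sine - ((2 * sine) & mask)        negates sine where the mask is set.
 * The two helpers below are a plain-C sketch of that idiom, for illustration
 * only; their names are hypothetical and they are not part of the VOLK API
 * (uint32_t comes from <inttypes.h>, included at the top of this file).
 */
static inline float volk_sin_mask_select_sketch(float a, float b, int take_b)
{
    union {
        float f;
        uint32_t u;
    } d = { b - a };
    d.u &= take_b ? 0xFFFFFFFFu : 0x0u; /* emulate _mm_and_ps on a compare result */
    return a + d.f;                     /* selects b when take_b is set, a otherwise */
}

static inline float volk_sin_mask_negate_sketch(float a, int negate)
{
    union {
        float f;
        uint32_t u;
    } d = { 2.0f * a };
    d.u &= negate ? 0xFFFFFFFFu : 0x0u;
    return a - d.f; /* -a when negate is set, a otherwise */
}
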
253
254#ifdef LV_HAVE_AVX2
255#include <immintrin.h>
256
257static inline void
258volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
259{
260 float* bPtr = bVector;
261 const float* aPtr = aVector;
262
263 unsigned int number = 0;
264 unsigned int eighthPoints = num_points / 8;
265 unsigned int i = 0;
266
267 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
268 fzeroes;
269 __m256 sine, cosine, condition1, condition2;
270 __m256i q, r, ones, twos, fours;
271
272 m4pi = _mm256_set1_ps(1.273239545);
273 pio4A = _mm256_set1_ps(0.78515625);
274 pio4B = _mm256_set1_ps(0.241876e-3);
275 ffours = _mm256_set1_ps(4.0);
276 ftwos = _mm256_set1_ps(2.0);
277 fones = _mm256_set1_ps(1.0);
278 fzeroes = _mm256_setzero_ps();
279 ones = _mm256_set1_epi32(1);
280 twos = _mm256_set1_epi32(2);
281 fours = _mm256_set1_epi32(4);
282
283 cp1 = _mm256_set1_ps(1.0);
284 cp2 = _mm256_set1_ps(0.83333333e-1);
285 cp3 = _mm256_set1_ps(0.2777778e-2);
286 cp4 = _mm256_set1_ps(0.49603e-4);
287 cp5 = _mm256_set1_ps(0.551e-6);
288
289 for (; number < eighthPoints; number++) {
290 aVal = _mm256_load_ps(aPtr);
291 s = _mm256_sub_ps(aVal,
292 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
293 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
294 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
295 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
296
297 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
298 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
299
300 s = _mm256_div_ps(
301 s,
302 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
303 s = _mm256_mul_ps(s, s);
304 // Evaluate Taylor series
305 s = _mm256_mul_ps(
306 _mm256_add_ps(
307 _mm256_mul_ps(
308 _mm256_sub_ps(
309 _mm256_mul_ps(
310 _mm256_add_ps(
311 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
312 s),
313 cp3),
314 s),
315 cp2),
316 s),
317 cp1),
318 s);
319
320 for (i = 0; i < 3; i++) {
321 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
322 }
323 s = _mm256_div_ps(s, ftwos);
324
325 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
326 cosine = _mm256_sub_ps(fones, s);
327
328 condition1 = _mm256_cmp_ps(
329 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
330 fzeroes,
331 _CMP_NEQ_UQ);
332 condition2 = _mm256_cmp_ps(
333 _mm256_cmp_ps(
334 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
335 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
336 _CMP_NEQ_UQ);
337 // Need this condition only for cos
338 // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
339 // twos), fours)), fzeroes);
340
341 sine =
342 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
343 sine = _mm256_sub_ps(
344 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
345 _mm256_store_ps(bPtr, sine);
346 aPtr += 8;
347 bPtr += 8;
348 }
349
350 number = eighthPoints * 8;
351 for (; number < num_points; number++) {
352        *bPtr++ = sinf(*aPtr++);
353 }
354}
355
356#endif /* LV_HAVE_AVX2 for aligned */
357
358#ifdef LV_HAVE_SSE4_1
359#include <smmintrin.h>
360
361static inline void
362volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
363{
364 float* bPtr = bVector;
365 const float* aPtr = aVector;
366
367 unsigned int number = 0;
368 unsigned int quarterPoints = num_points / 4;
369 unsigned int i = 0;
370
371 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
372 fzeroes;
373 __m128 sine, cosine, condition1, condition2;
374 __m128i q, r, ones, twos, fours;
375
376 m4pi = _mm_set1_ps(1.273239545);
377 pio4A = _mm_set1_ps(0.78515625);
378 pio4B = _mm_set1_ps(0.241876e-3);
379 ffours = _mm_set1_ps(4.0);
380 ftwos = _mm_set1_ps(2.0);
381 fones = _mm_set1_ps(1.0);
382 fzeroes = _mm_setzero_ps();
383 ones = _mm_set1_epi32(1);
384 twos = _mm_set1_epi32(2);
385 fours = _mm_set1_epi32(4);
386
387 cp1 = _mm_set1_ps(1.0);
388 cp2 = _mm_set1_ps(0.83333333e-1);
389 cp3 = _mm_set1_ps(0.2777778e-2);
390 cp4 = _mm_set1_ps(0.49603e-4);
391 cp5 = _mm_set1_ps(0.551e-6);
392
393 for (; number < quarterPoints; number++) {
394 aVal = _mm_load_ps(aPtr);
395 s = _mm_sub_ps(aVal,
396 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
397 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
398 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
399
400 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
401 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
402
403 s = _mm_div_ps(
404 s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
405 s = _mm_mul_ps(s, s);
406 // Evaluate Taylor series
407 s = _mm_mul_ps(
408 _mm_add_ps(
409 _mm_mul_ps(
410 _mm_sub_ps(
411 _mm_mul_ps(
412 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
413 cp3),
414 s),
415 cp2),
416 s),
417 cp1),
418 s);
419
420 for (i = 0; i < 3; i++) {
421 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
422 }
423 s = _mm_div_ps(s, ftwos);
424
425 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
426 cosine = _mm_sub_ps(fones, s);
427
428 condition1 = _mm_cmpneq_ps(
429 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
430 condition2 = _mm_cmpneq_ps(
431 _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
432 _mm_cmplt_ps(aVal, fzeroes));
433 // Need this condition only for cos
434 // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
435 // twos), fours)), fzeroes);
436
437 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
438 sine =
439 _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
440 _mm_store_ps(bPtr, sine);
441 aPtr += 4;
442 bPtr += 4;
443 }
444
445 number = quarterPoints * 4;
446 for (; number < num_points; number++) {
447 *bPtr++ = sinf(*aPtr++);
448 }
449}
450
451#endif /* LV_HAVE_SSE4_1 for aligned */
452
453
454#endif /* INCLUDED_volk_32f_sin_32f_a_H */
455
456#ifndef INCLUDED_volk_32f_sin_32f_u_H
457#define INCLUDED_volk_32f_sin_32f_u_H
458
459#ifdef LV_HAVE_AVX512F
460
461#include <immintrin.h>
462static inline void volk_32f_sin_32f_u_avx512f(float* sinVector,
463 const float* inVector,
464 unsigned int num_points)
465{
466 float* sinPtr = sinVector;
467 const float* inPtr = inVector;
468
469 unsigned int number = 0;
470 unsigned int sixteenPoints = num_points / 16;
471 unsigned int i = 0;
472
473 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
474 fones;
475 __m512 sine, cosine;
476 __m512i q, zeros, ones, twos, fours;
477
478 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
479 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
480 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
481 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
482 ffours = _mm512_set1_ps(4.0);
483 ftwos = _mm512_set1_ps(2.0);
484 fones = _mm512_set1_ps(1.0);
485 zeros = _mm512_setzero_epi32();
486 ones = _mm512_set1_epi32(1);
487 twos = _mm512_set1_epi32(2);
488 fours = _mm512_set1_epi32(4);
489
490 cp1 = _mm512_set1_ps(1.0);
491 cp2 = _mm512_set1_ps(0.08333333333333333);
492 cp3 = _mm512_set1_ps(0.002777777777777778);
493 cp4 = _mm512_set1_ps(4.96031746031746e-05);
494 cp5 = _mm512_set1_ps(5.511463844797178e-07);
495 __mmask16 condition1, condition2, ltZero;
496
497 for (; number < sixteenPoints; number++) {
498 aVal = _mm512_loadu_ps(inPtr);
499 // s = fabs(aVal)
500 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
501
502        // q = (int)floor(s * (4/pi)): quadrant index of |aVal| in units of pi/4
503 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
504        // r = q + q&1: round q up to an even count; r*(pi/4) is subtracted below
505 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
506
507 s = _mm512_fnmadd_ps(r, pio4A, s);
508 s = _mm512_fnmadd_ps(r, pio4B, s);
509 s = _mm512_fnmadd_ps(r, pio4C, s);
510
511 s = _mm512_div_ps(
512 s,
513 _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
514 s = _mm512_mul_ps(s, s);
515 // Evaluate Taylor series
516 s = _mm512_mul_ps(
517 _mm512_fmadd_ps(
518 _mm512_fmsub_ps(
519 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
520 s,
521 cp1),
522 s);
523
524 for (i = 0; i < 3; i++) {
525 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
526 }
527 s = _mm512_div_ps(s, ftwos);
528
529 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
530 cosine = _mm512_sub_ps(fones, s);
531
532 condition1 = _mm512_cmpneq_epi32_mask(
533 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
534 ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
535 condition2 = _mm512_kxor(
536 _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
537
538 sine = _mm512_mask_blend_ps(condition1, sine, cosine);
539 sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
540 _mm512_storeu_ps(sinPtr, sine);
541 inPtr += 16;
542 sinPtr += 16;
543 }
544
545 number = sixteenPoints * 16;
546 for (; number < num_points; number++) {
547 *sinPtr++ = sinf(*inPtr++);
548 }
549}
550#endif
551
552#if LV_HAVE_AVX2 && LV_HAVE_FMA
553#include <immintrin.h>
554
555static inline void
556volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
557{
558 float* bPtr = bVector;
559 const float* aPtr = aVector;
560
561 unsigned int number = 0;
562 unsigned int eighthPoints = num_points / 8;
563 unsigned int i = 0;
564
565 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
566 fzeroes;
567 __m256 sine, cosine, condition1, condition2;
568 __m256i q, r, ones, twos, fours;
569
570 m4pi = _mm256_set1_ps(1.273239545);
571 pio4A = _mm256_set1_ps(0.78515625);
572 pio4B = _mm256_set1_ps(0.241876e-3);
573 ffours = _mm256_set1_ps(4.0);
574 ftwos = _mm256_set1_ps(2.0);
575 fones = _mm256_set1_ps(1.0);
576 fzeroes = _mm256_setzero_ps();
577 ones = _mm256_set1_epi32(1);
578 twos = _mm256_set1_epi32(2);
579 fours = _mm256_set1_epi32(4);
580
581 cp1 = _mm256_set1_ps(1.0);
582 cp2 = _mm256_set1_ps(0.83333333e-1);
583 cp3 = _mm256_set1_ps(0.2777778e-2);
584 cp4 = _mm256_set1_ps(0.49603e-4);
585 cp5 = _mm256_set1_ps(0.551e-6);
586
587 for (; number < eighthPoints; number++) {
588 aVal = _mm256_loadu_ps(aPtr);
589 s = _mm256_sub_ps(aVal,
590 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
591 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
592 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
593 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
594
595 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
596 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
597
598 s = _mm256_div_ps(
599 s,
600 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
601 s = _mm256_mul_ps(s, s);
602 // Evaluate Taylor series
603 s = _mm256_mul_ps(
604 _mm256_fmadd_ps(
605 _mm256_fmsub_ps(
606 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
607 s,
608 cp1),
609 s);
610
611 for (i = 0; i < 3; i++) {
612 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
613 }
614 s = _mm256_div_ps(s, ftwos);
615
616 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
617 cosine = _mm256_sub_ps(fones, s);
618
619 condition1 = _mm256_cmp_ps(
620 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
621 fzeroes,
622 _CMP_NEQ_UQ);
623 condition2 = _mm256_cmp_ps(
624 _mm256_cmp_ps(
625 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
626 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
627 _CMP_NEQ_UQ);
628 // Need this condition only for cos
629 // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
630 // twos), fours)), fzeroes);
631
632 sine =
633 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
634 sine = _mm256_sub_ps(
635 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
636 _mm256_storeu_ps(bPtr, sine);
637 aPtr += 8;
638 bPtr += 8;
639 }
640
641 number = eighthPoints * 8;
642 for (; number < num_points; number++) {
643        *bPtr++ = sinf(*aPtr++);
644 }
645}
646
647#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
648
649#ifdef LV_HAVE_AVX2
650#include <immintrin.h>
651
652static inline void
653volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
654{
655 float* bPtr = bVector;
656 const float* aPtr = aVector;
657
658 unsigned int number = 0;
659 unsigned int eighthPoints = num_points / 8;
660 unsigned int i = 0;
661
662 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
663 fzeroes;
664 __m256 sine, cosine, condition1, condition2;
665 __m256i q, r, ones, twos, fours;
666
667 m4pi = _mm256_set1_ps(1.273239545);
668 pio4A = _mm256_set1_ps(0.78515625);
669 pio4B = _mm256_set1_ps(0.241876e-3);
670 ffours = _mm256_set1_ps(4.0);
671 ftwos = _mm256_set1_ps(2.0);
672 fones = _mm256_set1_ps(1.0);
673 fzeroes = _mm256_setzero_ps();
674 ones = _mm256_set1_epi32(1);
675 twos = _mm256_set1_epi32(2);
676 fours = _mm256_set1_epi32(4);
677
678 cp1 = _mm256_set1_ps(1.0);
679 cp2 = _mm256_set1_ps(0.83333333e-1);
680 cp3 = _mm256_set1_ps(0.2777778e-2);
681 cp4 = _mm256_set1_ps(0.49603e-4);
682 cp5 = _mm256_set1_ps(0.551e-6);
683
684 for (; number < eighthPoints; number++) {
685 aVal = _mm256_loadu_ps(aPtr);
686 s = _mm256_sub_ps(aVal,
687 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
688 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
689 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
690 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
691
692 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
693 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
694
695 s = _mm256_div_ps(
696 s,
697 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
698 s = _mm256_mul_ps(s, s);
699 // Evaluate Taylor series
700 s = _mm256_mul_ps(
701 _mm256_add_ps(
702 _mm256_mul_ps(
703 _mm256_sub_ps(
704 _mm256_mul_ps(
705 _mm256_add_ps(
706 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
707 s),
708 cp3),
709 s),
710 cp2),
711 s),
712 cp1),
713 s);
714
715 for (i = 0; i < 3; i++) {
716 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
717 }
718 s = _mm256_div_ps(s, ftwos);
719
720 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
721 cosine = _mm256_sub_ps(fones, s);
722
723 condition1 = _mm256_cmp_ps(
724 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
725 fzeroes,
726 _CMP_NEQ_UQ);
727 condition2 = _mm256_cmp_ps(
728 _mm256_cmp_ps(
729 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
730 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
731 _CMP_NEQ_UQ);
732 // Need this condition only for cos
733 // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
734 // twos), fours)), fzeroes);
735
736 sine =
737 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
738 sine = _mm256_sub_ps(
739 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
740 _mm256_storeu_ps(bPtr, sine);
741 aPtr += 8;
742 bPtr += 8;
743 }
744
745 number = eighthPoints * 8;
746 for (; number < num_points; number++) {
747        *bPtr++ = sinf(*aPtr++);
748 }
749}
750
751#endif /* LV_HAVE_AVX2 for unaligned */
752
753
754#ifdef LV_HAVE_SSE4_1
755#include <smmintrin.h>
756
757static inline void
758volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
759{
760 float* bPtr = bVector;
761 const float* aPtr = aVector;
762
763 unsigned int number = 0;
764 unsigned int quarterPoints = num_points / 4;
765 unsigned int i = 0;
766
767 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
768 fzeroes;
769 __m128 sine, cosine, condition1, condition2;
770 __m128i q, r, ones, twos, fours;
771
772 m4pi = _mm_set1_ps(1.273239545);
773 pio4A = _mm_set1_ps(0.78515625);
774 pio4B = _mm_set1_ps(0.241876e-3);
775 ffours = _mm_set1_ps(4.0);
776 ftwos = _mm_set1_ps(2.0);
777 fones = _mm_set1_ps(1.0);
778 fzeroes = _mm_setzero_ps();
779 ones = _mm_set1_epi32(1);
780 twos = _mm_set1_epi32(2);
781 fours = _mm_set1_epi32(4);
782
783 cp1 = _mm_set1_ps(1.0);
784 cp2 = _mm_set1_ps(0.83333333e-1);
785 cp3 = _mm_set1_ps(0.2777778e-2);
786 cp4 = _mm_set1_ps(0.49603e-4);
787 cp5 = _mm_set1_ps(0.551e-6);
788
789 for (; number < quarterPoints; number++) {
790 aVal = _mm_loadu_ps(aPtr);
791 s = _mm_sub_ps(aVal,
792 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
793 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
794 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
795
796 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
797 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
798
799 s = _mm_div_ps(
800 s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
801 s = _mm_mul_ps(s, s);
802 // Evaluate Taylor series
803 s = _mm_mul_ps(
804 _mm_add_ps(
805 _mm_mul_ps(
806 _mm_sub_ps(
807 _mm_mul_ps(
808 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
809 cp3),
810 s),
811 cp2),
812 s),
813 cp1),
814 s);
815
816 for (i = 0; i < 3; i++) {
817 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
818 }
819 s = _mm_div_ps(s, ftwos);
820
821 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
822 cosine = _mm_sub_ps(fones, s);
823
824 condition1 = _mm_cmpneq_ps(
825 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
826 condition2 = _mm_cmpneq_ps(
827 _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
828 _mm_cmplt_ps(aVal, fzeroes));
829
830 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
831 sine =
832 _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
833 _mm_storeu_ps(bPtr, sine);
834 aPtr += 4;
835 bPtr += 4;
836 }
837
838 number = quarterPoints * 4;
839 for (; number < num_points; number++) {
840 *bPtr++ = sinf(*aPtr++);
841 }
842}
843
844#endif /* LV_HAVE_SSE4_1 for unaligned */
845
846
847#ifdef LV_HAVE_GENERIC
848
849static inline void
850volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
851{
852 float* bPtr = bVector;
853 const float* aPtr = aVector;
854 unsigned int number = 0;
855
856 for (number = 0; number < num_points; number++) {
857 *bPtr++ = sinf(*aPtr++);
858 }
859}
860
861#endif /* LV_HAVE_GENERIC */
862
863
864#ifdef LV_HAVE_NEON
865#include <arm_neon.h>
866#include <volk/volk_neon_intrinsics.h>
867
868static inline void
869volk_32f_sin_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
870{
871 unsigned int number = 0;
872 unsigned int quarter_points = num_points / 4;
873 float* bVectorPtr = bVector;
874 const float* aVectorPtr = aVector;
875
876 float32x4_t b_vec;
877 float32x4_t a_vec;
878
879 for (number = 0; number < quarter_points; number++) {
880 a_vec = vld1q_f32(aVectorPtr);
881 // Prefetch next one, speeds things up
882 __VOLK_PREFETCH(aVectorPtr + 4);
883 b_vec = _vsinq_f32(a_vec);
884 vst1q_f32(bVectorPtr, b_vec);
885 // move pointers ahead
886 bVectorPtr += 4;
887 aVectorPtr += 4;
888 }
889
890 // Deal with the rest
891 for (number = quarter_points * 4; number < num_points; number++) {
892 *bVectorPtr++ = sinf(*aVectorPtr++);
893 }
894}
895
896#endif /* LV_HAVE_NEON */
897
898#ifdef LV_HAVE_RVV
899#include <riscv_vector.h>
900
901static inline void
902volk_32f_sin_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
903{
904 size_t vlmax = __riscv_vsetvlmax_e32m2();
905
906 const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax);
907 const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax);
908 const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax);
909 const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax);
910
911 const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
912 const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);
913
914 const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax);
915 const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax);
916 const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05, vlmax);
917 const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07, vlmax);
918
919 size_t n = num_points;
920 for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
921 vl = __riscv_vsetvl_e32m2(n);
922 vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
923 vfloat32m2_t s = __riscv_vfabs(v, vl);
924 vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl);
925 vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl);
926
927 s = __riscv_vfnmsac(s, cPio4a, r, vl);
928 s = __riscv_vfnmsac(s, cPio4b, r, vl);
929 s = __riscv_vfnmsac(s, cPio4c, r, vl);
930
931 s = __riscv_vfmul(s, 1 / 8.0f, vl);
932 s = __riscv_vfmul(s, s, vl);
933 vfloat32m2_t t = s;
934 s = __riscv_vfmsub(s, c5, c4, vl);
935 s = __riscv_vfmadd(s, t, c3, vl);
936 s = __riscv_vfmsub(s, t, c2, vl);
937 s = __riscv_vfmadd(s, t, cf1, vl);
938 s = __riscv_vfmul(s, t, vl);
939 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
940 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
941 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
942 s = __riscv_vfmul(s, 1 / 2.0f, vl);
943
944 vfloat32m2_t sine =
945 __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl);
946 vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl);
947
948 vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl);
949 vbool16_t m2 = __riscv_vmxor(__riscv_vmslt(__riscv_vreinterpret_i32m2(v), 0, vl),
950 __riscv_vmsne(__riscv_vand(q, 4, vl), 0, vl),
951 vl);
952
953 sine = __riscv_vmerge(sine, cosine, m1, vl);
954 sine = __riscv_vfneg_mu(m2, sine, sine, vl);
955
956 __riscv_vse32(bVector, sine, vl);
957 }
958}
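
/*
 * Editor's note: unlike the fixed-width kernels above, the RVV kernel needs
 * no scalar tail loop: __riscv_vsetvl_e32m2(n) returns how many elements
 * (vl <= n) the hardware will process this pass, so the final pass simply
 * runs with a shorter vl. A minimal sketch of that strip-mining pattern on a
 * plain copy loop, for illustration only (the function name is hypothetical):
 */
static inline void
volk_rvv_stripmine_copy_sketch(float* dst, const float* src, unsigned int num_points)
{
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, src += vl, dst += vl) {
        vl = __riscv_vsetvl_e32m2(n);                    /* elements this pass */
        vfloat32m2_t v = __riscv_vle32_v_f32m2(src, vl); /* unit-stride load */
        __riscv_vse32(dst, v, vl);                       /* store the same vl lanes */
    }
}
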
959#endif /*LV_HAVE_RVV*/
960
961#endif /* INCLUDED_volk_32f_sin_32f_u_H */
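
For reference, application code normally calls this kernel through the generated
VOLK dispatcher rather than one of the architecture-specific functions listed
above. Below is a minimal usage sketch, assuming the standard volk_32f_sin_32f()
dispatch entry point and the volk_malloc()/volk_get_alignment()/volk_free()
helpers from <volk/volk.h>; the buffer length and the phase ramp are arbitrary
values chosen for illustration.

#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    unsigned int N = 32;
    size_t alignment = volk_get_alignment();
    float* in = (float*)volk_malloc(sizeof(float) * N, alignment);
    float* out = (float*)volk_malloc(sizeof(float) * N, alignment);

    for (unsigned int i = 0; i < N; i++) {
        in[i] = 2.f * 3.14159265f * (float)i / (float)N; /* one full turn of phase */
    }

    volk_32f_sin_32f(out, in, N); /* the dispatcher picks the best kernel for this CPU */

    for (unsigned int i = 0; i < N; i++) {
        printf("sin(%1.3f) = %1.3f\n", in[i], out[i]);
    }

    volk_free(in);
    volk_free(out);
    return 0;
}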