Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_cos_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
58
59#include <inttypes.h>
60#include <math.h>
61#include <stdio.h>
62
63#ifndef INCLUDED_volk_32f_cos_32f_a_H
64#define INCLUDED_volk_32f_cos_32f_a_H
65
66#ifdef LV_HAVE_AVX512F
67
68#include <immintrin.h>
69static inline void volk_32f_cos_32f_a_avx512f(float* cosVector,
70 const float* inVector,
71 unsigned int num_points)
72{
73 float* cosPtr = cosVector;
74 const float* inPtr = inVector;
75
76 unsigned int number = 0;
77 unsigned int sixteenPoints = num_points / 16;
78 unsigned int i = 0;
79
80 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
81 fones, sine, cosine;
82 __m512i q, zeros, ones, twos, fours;
83
84 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
85 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
86 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
87 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
88 ffours = _mm512_set1_ps(4.0);
89 ftwos = _mm512_set1_ps(2.0);
90 fones = _mm512_set1_ps(1.0);
91 zeros = _mm512_setzero_epi32();
92 ones = _mm512_set1_epi32(1);
93 twos = _mm512_set1_epi32(2);
94 fours = _mm512_set1_epi32(4);
95
96 cp1 = _mm512_set1_ps(1.0);
97 cp2 = _mm512_set1_ps(0.08333333333333333);
98 cp3 = _mm512_set1_ps(0.002777777777777778);
99 cp4 = _mm512_set1_ps(4.96031746031746e-05);
100 cp5 = _mm512_set1_ps(5.511463844797178e-07);
101 __mmask16 condition1, condition2;
102
103 for (; number < sixteenPoints; number++) {
104 aVal = _mm512_load_ps(inPtr);
105 // s = fabs(aVal)
106 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
107
108 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
109 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
110 // r = q + q&1, q indicates quadrant, r gives
111 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
112
113 s = _mm512_fnmadd_ps(r, pio4A, s);
114 s = _mm512_fnmadd_ps(r, pio4B, s);
115 s = _mm512_fnmadd_ps(r, pio4C, s);
116
117 s = _mm512_div_ps(
118 s,
119 _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
120 s = _mm512_mul_ps(s, s);
121 // Evaluate Taylor series
122 s = _mm512_mul_ps(
123 _mm512_fmadd_ps(
124 _mm512_fmsub_ps(
125 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
126 s,
127 cp1),
128 s);
129
130 for (i = 0; i < 3; i++) {
131 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
132 }
133 s = _mm512_div_ps(s, ftwos);
134
135 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
136 cosine = _mm512_sub_ps(fones, s);
137
138 // if(((q+1)&2) != 0) { cosine=sine;}
139 condition1 = _mm512_cmpneq_epi32_mask(
140 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
141
142 // if(((q+2)&4) != 0) { cosine = -cosine;}
143 condition2 = _mm512_cmpneq_epi32_mask(
144 _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
145 cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
146 cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
147 _mm512_store_ps(cosPtr, cosine);
148 inPtr += 16;
149 cosPtr += 16;
150 }
151
152 number = sixteenPoints * 16;
153 for (; number < num_points; number++) {
154 *cosPtr++ = cosf(*inPtr++);
155 }
156}
157#endif
158
159#if LV_HAVE_AVX2 && LV_HAVE_FMA
160#include <immintrin.h>
161
162static inline void
163volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
164{
165 float* bPtr = bVector;
166 const float* aPtr = aVector;
167
168 unsigned int number = 0;
169 unsigned int eighthPoints = num_points / 8;
170 unsigned int i = 0;
171
172 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
173 fones, fzeroes;
174 __m256 sine, cosine;
175 __m256i q, ones, twos, fours;
176
177 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
178 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
179 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
180 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
181 ffours = _mm256_set1_ps(4.0);
182 ftwos = _mm256_set1_ps(2.0);
183 fones = _mm256_set1_ps(1.0);
184 fzeroes = _mm256_setzero_ps();
185 __m256i zeroes = _mm256_set1_epi32(0);
186 ones = _mm256_set1_epi32(1);
187 __m256i allones = _mm256_set1_epi32(0xffffffff);
188 twos = _mm256_set1_epi32(2);
189 fours = _mm256_set1_epi32(4);
190
191 cp1 = _mm256_set1_ps(1.0);
192 cp2 = _mm256_set1_ps(0.08333333333333333);
193 cp3 = _mm256_set1_ps(0.002777777777777778);
194 cp4 = _mm256_set1_ps(4.96031746031746e-05);
195 cp5 = _mm256_set1_ps(5.511463844797178e-07);
196 union bit256 condition1;
197 union bit256 condition3;
198
199 for (; number < eighthPoints; number++) {
200
201 aVal = _mm256_load_ps(aPtr);
202 // s = fabs(aVal)
203 s = _mm256_sub_ps(aVal,
204 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
205 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
206 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
207 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
208 // r = q + q&1, q indicates quadrant, r gives
209 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
210
211 s = _mm256_fnmadd_ps(r, pio4A, s);
212 s = _mm256_fnmadd_ps(r, pio4B, s);
213 s = _mm256_fnmadd_ps(r, pio4C, s);
214
215 s = _mm256_div_ps(
216 s,
217 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
218 s = _mm256_mul_ps(s, s);
219 // Evaluate Taylor series
220 s = _mm256_mul_ps(
221 _mm256_fmadd_ps(
222 _mm256_fmsub_ps(
223 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
224 s,
225 cp1),
226 s);
227
228 for (i = 0; i < 3; i++) {
229 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
230 }
231 s = _mm256_div_ps(s, ftwos);
232
233 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
234 cosine = _mm256_sub_ps(fones, s);
235
236 // if(((q+1)&2) != 0) { cosine=sine;}
237 condition1.int_vec =
238 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
239 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
240
241 // if(((q+2)&4) != 0) { cosine = -cosine;}
242 condition3.int_vec = _mm256_cmpeq_epi32(
243 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
244 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
245
246 cosine = _mm256_add_ps(
247 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
248 cosine = _mm256_sub_ps(cosine,
249 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
250 condition3.float_vec));
251 _mm256_store_ps(bPtr, cosine);
252 aPtr += 8;
253 bPtr += 8;
254 }
255
256 number = eighthPoints * 8;
257 for (; number < num_points; number++) {
258 *bPtr++ = cos(*aPtr++);
259 }
260}
261
262#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
263
264#ifdef LV_HAVE_AVX2
265#include <immintrin.h>
266
267static inline void
268volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
269{
270 float* bPtr = bVector;
271 const float* aPtr = aVector;
272
273 unsigned int number = 0;
274 unsigned int eighthPoints = num_points / 8;
275 unsigned int i = 0;
276
277 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
278 fones, fzeroes;
279 __m256 sine, cosine;
280 __m256i q, ones, twos, fours;
281
282 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
283 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
284 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
285 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
286 ffours = _mm256_set1_ps(4.0);
287 ftwos = _mm256_set1_ps(2.0);
288 fones = _mm256_set1_ps(1.0);
289 fzeroes = _mm256_setzero_ps();
290 __m256i zeroes = _mm256_set1_epi32(0);
291 ones = _mm256_set1_epi32(1);
292 __m256i allones = _mm256_set1_epi32(0xffffffff);
293 twos = _mm256_set1_epi32(2);
294 fours = _mm256_set1_epi32(4);
295
296 cp1 = _mm256_set1_ps(1.0);
297 cp2 = _mm256_set1_ps(0.08333333333333333);
298 cp3 = _mm256_set1_ps(0.002777777777777778);
299 cp4 = _mm256_set1_ps(4.96031746031746e-05);
300 cp5 = _mm256_set1_ps(5.511463844797178e-07);
301 union bit256 condition1;
302 union bit256 condition3;
303
304 for (; number < eighthPoints; number++) {
305
306 aVal = _mm256_load_ps(aPtr);
307 // s = fabs(aVal)
308 s = _mm256_sub_ps(aVal,
309 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
310 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
311 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
312 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
313 // r = q + q&1, q indicates quadrant, r gives
314 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
315
316 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
317 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
318 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
319
320 s = _mm256_div_ps(
321 s,
322 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
323 s = _mm256_mul_ps(s, s);
324 // Evaluate Taylor series
325 s = _mm256_mul_ps(
326 _mm256_add_ps(
327 _mm256_mul_ps(
328 _mm256_sub_ps(
329 _mm256_mul_ps(
330 _mm256_add_ps(
331 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
332 s),
333 cp3),
334 s),
335 cp2),
336 s),
337 cp1),
338 s);
339
340 for (i = 0; i < 3; i++) {
341 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
342 }
343 s = _mm256_div_ps(s, ftwos);
344
345 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
346 cosine = _mm256_sub_ps(fones, s);
347
348 // if(((q+1)&2) != 0) { cosine=sine;}
349 condition1.int_vec =
350 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
351 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
352
353 // if(((q+2)&4) != 0) { cosine = -cosine;}
354 condition3.int_vec = _mm256_cmpeq_epi32(
355 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
356 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
357
358 cosine = _mm256_add_ps(
359 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
360 cosine = _mm256_sub_ps(cosine,
361 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
362 condition3.float_vec));
363 _mm256_store_ps(bPtr, cosine);
364 aPtr += 8;
365 bPtr += 8;
366 }
367
368 number = eighthPoints * 8;
369 for (; number < num_points; number++) {
370 *bPtr++ = cos(*aPtr++);
371 }
372}
373
374#endif /* LV_HAVE_AVX2 for aligned */
375
376#ifdef LV_HAVE_SSE4_1
377#include <smmintrin.h>
378
379static inline void
380volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
381{
382 float* bPtr = bVector;
383 const float* aPtr = aVector;
384
385 unsigned int number = 0;
386 unsigned int quarterPoints = num_points / 4;
387 unsigned int i = 0;
388
389 __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
390 fones, fzeroes;
391 __m128 sine, cosine;
392 __m128i q, ones, twos, fours;
393
394 m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
395 pio4A = _mm_set1_ps(0.7853981554508209228515625);
396 pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
397 pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
398 ffours = _mm_set1_ps(4.0);
399 ftwos = _mm_set1_ps(2.0);
400 fones = _mm_set1_ps(1.0);
401 fzeroes = _mm_setzero_ps();
402 __m128i zeroes = _mm_set1_epi32(0);
403 ones = _mm_set1_epi32(1);
404 __m128i allones = _mm_set1_epi32(0xffffffff);
405 twos = _mm_set1_epi32(2);
406 fours = _mm_set1_epi32(4);
407
408 cp1 = _mm_set1_ps(1.0);
409 cp2 = _mm_set1_ps(0.08333333333333333);
410 cp3 = _mm_set1_ps(0.002777777777777778);
411 cp4 = _mm_set1_ps(4.96031746031746e-05);
412 cp5 = _mm_set1_ps(5.511463844797178e-07);
413 union bit128 condition1;
414 union bit128 condition3;
415
416 for (; number < quarterPoints; number++) {
417
418 aVal = _mm_load_ps(aPtr);
419 // s = fabs(aVal)
420 s = _mm_sub_ps(aVal,
421 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
422 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
423 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
424 // r = q + q&1, q indicates quadrant, r gives
425 r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones)));
426
427 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
428 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
429 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));
430
431 s = _mm_div_ps(
432 s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
433 s = _mm_mul_ps(s, s);
434 // Evaluate Taylor series
435 s = _mm_mul_ps(
436 _mm_add_ps(
437 _mm_mul_ps(
438 _mm_sub_ps(
439 _mm_mul_ps(
440 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
441 cp3),
442 s),
443 cp2),
444 s),
445 cp1),
446 s);
447
448 for (i = 0; i < 3; i++) {
449 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
450 }
451 s = _mm_div_ps(s, ftwos);
452
453 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
454 cosine = _mm_sub_ps(fones, s);
455
456 // if(((q+1)&2) != 0) { cosine=sine;}
457 condition1.int_vec =
458 _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
459 condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);
460
461 // if(((q+2)&4) != 0) { cosine = -cosine;}
462 condition3.int_vec =
463 _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
464 condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);
465
466 cosine = _mm_add_ps(cosine,
467 _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
468 cosine = _mm_sub_ps(
469 cosine,
470 _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec));
471 _mm_store_ps(bPtr, cosine);
472 aPtr += 4;
473 bPtr += 4;
474 }
475
476 number = quarterPoints * 4;
477 for (; number < num_points; number++) {
478 *bPtr++ = cosf(*aPtr++);
479 }
480}
481
482#endif /* LV_HAVE_SSE4_1 for aligned */
483
484#endif /* INCLUDED_volk_32f_cos_32f_a_H */
485
486
487#ifndef INCLUDED_volk_32f_cos_32f_u_H
488#define INCLUDED_volk_32f_cos_32f_u_H
489
490#ifdef LV_HAVE_AVX512F
491
492#include <immintrin.h>
493static inline void volk_32f_cos_32f_u_avx512f(float* cosVector,
494 const float* inVector,
495 unsigned int num_points)
496{
497 float* cosPtr = cosVector;
498 const float* inPtr = inVector;
499
500 unsigned int number = 0;
501 unsigned int sixteenPoints = num_points / 16;
502 unsigned int i = 0;
503
504 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
505 fones, sine, cosine;
506 __m512i q, zeros, ones, twos, fours;
507
508 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
509 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
510 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
511 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
512 ffours = _mm512_set1_ps(4.0);
513 ftwos = _mm512_set1_ps(2.0);
514 fones = _mm512_set1_ps(1.0);
515 zeros = _mm512_setzero_epi32();
516 ones = _mm512_set1_epi32(1);
517 twos = _mm512_set1_epi32(2);
518 fours = _mm512_set1_epi32(4);
519
520 cp1 = _mm512_set1_ps(1.0);
521 cp2 = _mm512_set1_ps(0.08333333333333333);
522 cp3 = _mm512_set1_ps(0.002777777777777778);
523 cp4 = _mm512_set1_ps(4.96031746031746e-05);
524 cp5 = _mm512_set1_ps(5.511463844797178e-07);
525 __mmask16 condition1, condition2;
526 for (; number < sixteenPoints; number++) {
527 aVal = _mm512_loadu_ps(inPtr);
528 // s = fabs(aVal)
529 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
530
531 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
532 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
533 // r = q + q&1, q indicates quadrant, r gives
534 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
535
536 s = _mm512_fnmadd_ps(r, pio4A, s);
537 s = _mm512_fnmadd_ps(r, pio4B, s);
538 s = _mm512_fnmadd_ps(r, pio4C, s);
539
540 s = _mm512_div_ps(
541 s,
542 _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
543 s = _mm512_mul_ps(s, s);
544 // Evaluate Taylor series
545 s = _mm512_mul_ps(
546 _mm512_fmadd_ps(
547 _mm512_fmsub_ps(
548 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
549 s,
550 cp1),
551 s);
552
553 for (i = 0; i < 3; i++) {
554 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
555 }
556 s = _mm512_div_ps(s, ftwos);
557
558 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
559 cosine = _mm512_sub_ps(fones, s);
560
561 // if(((q+1)&2) != 0) { cosine=sine;}
562 condition1 = _mm512_cmpneq_epi32_mask(
563 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
564
565 // if(((q+2)&4) != 0) { cosine = -cosine;}
566 condition2 = _mm512_cmpneq_epi32_mask(
567 _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
568
569 cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
570 cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
571 _mm512_storeu_ps(cosPtr, cosine);
572 inPtr += 16;
573 cosPtr += 16;
574 }
575
576 number = sixteenPoints * 16;
577 for (; number < num_points; number++) {
578 *cosPtr++ = cosf(*inPtr++);
579 }
580}
581#endif
582
583#if LV_HAVE_AVX2 && LV_HAVE_FMA
584#include <immintrin.h>
585
586static inline void
587volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
588{
589 float* bPtr = bVector;
590 const float* aPtr = aVector;
591
592 unsigned int number = 0;
593 unsigned int eighthPoints = num_points / 8;
594 unsigned int i = 0;
595
596 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
597 fones, fzeroes;
598 __m256 sine, cosine;
599 __m256i q, ones, twos, fours;
600
601 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
602 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
603 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
604 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
605 ffours = _mm256_set1_ps(4.0);
606 ftwos = _mm256_set1_ps(2.0);
607 fones = _mm256_set1_ps(1.0);
608 fzeroes = _mm256_setzero_ps();
609 __m256i zeroes = _mm256_set1_epi32(0);
610 ones = _mm256_set1_epi32(1);
611 __m256i allones = _mm256_set1_epi32(0xffffffff);
612 twos = _mm256_set1_epi32(2);
613 fours = _mm256_set1_epi32(4);
614
615 cp1 = _mm256_set1_ps(1.0);
616 cp2 = _mm256_set1_ps(0.08333333333333333);
617 cp3 = _mm256_set1_ps(0.002777777777777778);
618 cp4 = _mm256_set1_ps(4.96031746031746e-05);
619 cp5 = _mm256_set1_ps(5.511463844797178e-07);
620 union bit256 condition1;
621 union bit256 condition3;
622
623 for (; number < eighthPoints; number++) {
624
625 aVal = _mm256_loadu_ps(aPtr);
626 // s = fabs(aVal)
627 s = _mm256_sub_ps(aVal,
628 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
629 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
630 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
631 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
632 // r = q + q&1, q indicates quadrant, r gives
633 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
634
635 s = _mm256_fnmadd_ps(r, pio4A, s);
636 s = _mm256_fnmadd_ps(r, pio4B, s);
637 s = _mm256_fnmadd_ps(r, pio4C, s);
638
639 s = _mm256_div_ps(
640 s,
641 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
642 s = _mm256_mul_ps(s, s);
643 // Evaluate Taylor series
644 s = _mm256_mul_ps(
645 _mm256_fmadd_ps(
646 _mm256_fmsub_ps(
647 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
648 s,
649 cp1),
650 s);
651
652 for (i = 0; i < 3; i++) {
653 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
654 }
655 s = _mm256_div_ps(s, ftwos);
656
657 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
658 cosine = _mm256_sub_ps(fones, s);
659
660 // if(((q+1)&2) != 0) { cosine=sine;}
661 condition1.int_vec =
662 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
663 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
664
665 // if(((q+2)&4) != 0) { cosine = -cosine;}
666 condition3.int_vec = _mm256_cmpeq_epi32(
667 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
668 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
669
670 cosine = _mm256_add_ps(
671 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
672 cosine = _mm256_sub_ps(cosine,
673 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
674 condition3.float_vec));
675 _mm256_storeu_ps(bPtr, cosine);
676 aPtr += 8;
677 bPtr += 8;
678 }
679
680 number = eighthPoints * 8;
681 for (; number < num_points; number++) {
682 *bPtr++ = cos(*aPtr++);
683 }
684}
685
686#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
687
688#ifdef LV_HAVE_AVX2
689#include <immintrin.h>
690
691static inline void
692volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
693{
694 float* bPtr = bVector;
695 const float* aPtr = aVector;
696
697 unsigned int number = 0;
698 unsigned int eighthPoints = num_points / 8;
699 unsigned int i = 0;
700
701 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
702 fones, fzeroes;
703 __m256 sine, cosine;
704 __m256i q, ones, twos, fours;
705
706 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
707 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
708 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
709 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
710 ffours = _mm256_set1_ps(4.0);
711 ftwos = _mm256_set1_ps(2.0);
712 fones = _mm256_set1_ps(1.0);
713 fzeroes = _mm256_setzero_ps();
714 __m256i zeroes = _mm256_set1_epi32(0);
715 ones = _mm256_set1_epi32(1);
716 __m256i allones = _mm256_set1_epi32(0xffffffff);
717 twos = _mm256_set1_epi32(2);
718 fours = _mm256_set1_epi32(4);
719
720 cp1 = _mm256_set1_ps(1.0);
721 cp2 = _mm256_set1_ps(0.08333333333333333);
722 cp3 = _mm256_set1_ps(0.002777777777777778);
723 cp4 = _mm256_set1_ps(4.96031746031746e-05);
724 cp5 = _mm256_set1_ps(5.511463844797178e-07);
725 union bit256 condition1;
726 union bit256 condition3;
727
728 for (; number < eighthPoints; number++) {
729
730 aVal = _mm256_loadu_ps(aPtr);
731 // s = fabs(aVal)
732 s = _mm256_sub_ps(aVal,
733 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
734 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
735 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
736 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
737 // r = q + q&1, q indicates quadrant, r gives
738 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
739
740 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
741 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
742 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
743
744 s = _mm256_div_ps(
745 s,
746 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
747 s = _mm256_mul_ps(s, s);
748 // Evaluate Taylor series
749 s = _mm256_mul_ps(
750 _mm256_add_ps(
751 _mm256_mul_ps(
752 _mm256_sub_ps(
753 _mm256_mul_ps(
754 _mm256_add_ps(
755 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
756 s),
757 cp3),
758 s),
759 cp2),
760 s),
761 cp1),
762 s);
763
764 for (i = 0; i < 3; i++) {
765 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
766 }
767 s = _mm256_div_ps(s, ftwos);
768
769 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
770 cosine = _mm256_sub_ps(fones, s);
771
772 // if(((q+1)&2) != 0) { cosine=sine;}
773 condition1.int_vec =
774 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
775 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
776
777 // if(((q+2)&4) != 0) { cosine = -cosine;}
778 condition3.int_vec = _mm256_cmpeq_epi32(
779 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
780 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
781
782 cosine = _mm256_add_ps(
783 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
784 cosine = _mm256_sub_ps(cosine,
785 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
786 condition3.float_vec));
787 _mm256_storeu_ps(bPtr, cosine);
788 aPtr += 8;
789 bPtr += 8;
790 }
791
792 number = eighthPoints * 8;
793 for (; number < num_points; number++) {
794 *bPtr++ = cos(*aPtr++);
795 }
796}
797
798#endif /* LV_HAVE_AVX2 for unaligned */
799
800#ifdef LV_HAVE_SSE4_1
801#include <smmintrin.h>
802
803static inline void
804volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
805{
806 float* bPtr = bVector;
807 const float* aPtr = aVector;
808
809 unsigned int number = 0;
810 unsigned int quarterPoints = num_points / 4;
811 unsigned int i = 0;
812
813 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
814 fzeroes;
815 __m128 sine, cosine, condition1, condition3;
816 __m128i q, r, ones, twos, fours;
817
818 m4pi = _mm_set1_ps(1.273239545);
819 pio4A = _mm_set1_ps(0.78515625);
820 pio4B = _mm_set1_ps(0.241876e-3);
821 ffours = _mm_set1_ps(4.0);
822 ftwos = _mm_set1_ps(2.0);
823 fones = _mm_set1_ps(1.0);
824 fzeroes = _mm_setzero_ps();
825 ones = _mm_set1_epi32(1);
826 twos = _mm_set1_epi32(2);
827 fours = _mm_set1_epi32(4);
828
829 cp1 = _mm_set1_ps(1.0);
830 cp2 = _mm_set1_ps(0.83333333e-1);
831 cp3 = _mm_set1_ps(0.2777778e-2);
832 cp4 = _mm_set1_ps(0.49603e-4);
833 cp5 = _mm_set1_ps(0.551e-6);
834
835 for (; number < quarterPoints; number++) {
836 aVal = _mm_loadu_ps(aPtr);
837 s = _mm_sub_ps(aVal,
838 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
839 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
840 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
841
842 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
843 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
844
845 s = _mm_div_ps(
846 s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
847 s = _mm_mul_ps(s, s);
848 // Evaluate Taylor series
849 s = _mm_mul_ps(
850 _mm_add_ps(
851 _mm_mul_ps(
852 _mm_sub_ps(
853 _mm_mul_ps(
854 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
855 cp3),
856 s),
857 cp2),
858 s),
859 cp1),
860 s);
861
862 for (i = 0; i < 3; i++) {
863 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
864 }
865 s = _mm_div_ps(s, ftwos);
866
867 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
868 cosine = _mm_sub_ps(fones, s);
869
870 condition1 = _mm_cmpneq_ps(
871 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
872
873 condition3 = _mm_cmpneq_ps(
874 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
875
876 cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
877 cosine = _mm_sub_ps(
878 cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
879 _mm_storeu_ps(bPtr, cosine);
880 aPtr += 4;
881 bPtr += 4;
882 }
883
884 number = quarterPoints * 4;
885 for (; number < num_points; number++) {
886 *bPtr++ = cosf(*aPtr++);
887 }
888}
889
890#endif /* LV_HAVE_SSE4_1 for unaligned */
891
892
893#ifdef LV_HAVE_GENERIC
894
895/*
896 * For derivation see
897 * Shibata, Naoki, "Efficient evaluation methods of elementary functions
898 * suitable for SIMD computation," in Springer-Verlag 2010
899 */
900static inline void volk_32f_cos_32f_generic_fast(float* bVector,
901 const float* aVector,
902 unsigned int num_points)
903{
904 float* bPtr = bVector;
905 const float* aPtr = aVector;
906
907 float m4pi = 1.273239544735162542821171882678754627704620361328125;
908 float pio4A = 0.7853981554508209228515625;
909 float pio4B = 0.794662735614792836713604629039764404296875e-8;
910 float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
911 int N = 3; // order of argument reduction
912
913 unsigned int number;
914 for (number = 0; number < num_points; number++) {
915 float s = fabs(*aPtr);
916 int q = (int)(s * m4pi);
917 int r = q + (q & 1);
918 s -= r * pio4A;
919 s -= r * pio4B;
920 s -= r * pio4C;
921
922 s = s * 0.125; // 2^-N (<--3)
923 s = s * s;
924 s = ((((s / 1814400. - 1.0 / 20160.0) * s + 1.0 / 360.0) * s - 1.0 / 12.0) * s +
925 1.0) *
926 s;
927
928 int i;
929 for (i = 0; i < N; ++i) {
930 s = (4.0 - s) * s;
931 }
932 s = s / 2.0;
933
934 float sine = sqrt((2.0 - s) * s);
935 float cosine = 1 - s;
936
937 if (((q + 1) & 2) != 0) {
938 s = cosine;
939 cosine = sine;
940 sine = s;
941 }
942 if (((q + 2) & 4) != 0) {
943 cosine = -cosine;
944 }
945 *bPtr = cosine;
946 bPtr++;
947 aPtr++;
948 }
949}
950
951#endif /* LV_HAVE_GENERIC */
952
953
954#ifdef LV_HAVE_GENERIC
955
956static inline void
957volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
958{
959 float* bPtr = bVector;
960 const float* aPtr = aVector;
961 unsigned int number = 0;
962
963 for (; number < num_points; number++) {
964 *bPtr++ = cosf(*aPtr++);
965 }
966}
967
968#endif /* LV_HAVE_GENERIC */
969
970
971#ifdef LV_HAVE_NEON
972#include <arm_neon.h>
974
975static inline void
976volk_32f_cos_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
977{
978 unsigned int number = 0;
979 unsigned int quarter_points = num_points / 4;
980 float* bVectorPtr = bVector;
981 const float* aVectorPtr = aVector;
982
983 float32x4_t b_vec;
984 float32x4_t a_vec;
985
986 for (number = 0; number < quarter_points; number++) {
987 a_vec = vld1q_f32(aVectorPtr);
988 // Prefetch next one, speeds things up
989 __VOLK_PREFETCH(aVectorPtr + 4);
990 b_vec = _vcosq_f32(a_vec);
991 vst1q_f32(bVectorPtr, b_vec);
992 // move pointers ahead
993 bVectorPtr += 4;
994 aVectorPtr += 4;
995 }
996
997 // Deal with the rest
998 for (number = quarter_points * 4; number < num_points; number++) {
999 *bVectorPtr++ = cosf(*aVectorPtr++);
1000 }
1001}
1002
1003#endif /* LV_HAVE_NEON */
1004
1005#ifdef LV_HAVE_RVV
1006#include <riscv_vector.h>
1007
1008static inline void
1009volk_32f_cos_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
1010{
1011 size_t vlmax = __riscv_vsetvlmax_e32m2();
1012
1013 const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax);
1014 const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax);
1015 const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax);
1016 const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax);
1017
1018 const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
1019 const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);
1020
1021 const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax);
1022 const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax);
1023 const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05f, vlmax);
1024 const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07f, vlmax);
1025
1026 size_t n = num_points;
1027 for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
1028 vl = __riscv_vsetvl_e32m2(n);
1029 vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
1030 vfloat32m2_t s = __riscv_vfabs(v, vl);
1031 vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl);
1032 vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl);
1033
1034 s = __riscv_vfnmsac(s, cPio4a, r, vl);
1035 s = __riscv_vfnmsac(s, cPio4b, r, vl);
1036 s = __riscv_vfnmsac(s, cPio4c, r, vl);
1037
1038 s = __riscv_vfmul(s, 1 / 8.0f, vl);
1039 s = __riscv_vfmul(s, s, vl);
1040 vfloat32m2_t t = s;
1041 s = __riscv_vfmsub(s, c5, c4, vl);
1042 s = __riscv_vfmadd(s, t, c3, vl);
1043 s = __riscv_vfmsub(s, t, c2, vl);
1044 s = __riscv_vfmadd(s, t, cf1, vl);
1045 s = __riscv_vfmul(s, t, vl);
1046 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
1047 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
1048 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
1049 s = __riscv_vfmul(s, 1 / 2.0f, vl);
1050
1051 vfloat32m2_t sine =
1052 __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl);
1053 vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl);
1054
1055 vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl);
1056 vbool16_t m2 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 2, vl), 4, vl), 0, vl);
1057
1058 cosine = __riscv_vmerge(cosine, sine, m1, vl);
1059 cosine = __riscv_vfneg_mu(m2, cosine, cosine, vl);
1060
1061 __riscv_vse32(bVector, cosine, vl);
1062 }
1063}
1064#endif /*LV_HAVE_RVV*/
1065
1066#endif /* INCLUDED_volk_32f_cos_32f_u_H */
Definition volk_common.h:116
Definition volk_common.h:133
static void volk_32f_cos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_cos_32f.h:957
static void volk_32f_cos_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_cos_32f.h:976
static void volk_32f_cos_32f_generic_fast(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_cos_32f.h:900
#define __VOLK_PREFETCH(addr)
Definition volk_common.h:68
for i
Definition volk_config_fixed.tmpl.h:13
static float32x4_t _vcosq_f32(float32x4_t x)
Definition volk_neon_intrinsics.h:255