#ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
#define INCLUDED_volk_32f_x2_pow_32f_a_H

#include <math.h>
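
/*!
 * \page volk_32f_x2_pow_32f
 *
 * \b Overview
 *
 * Computes c[i] = pow(a[i], b[i]) for each point. The SIMD kernels use
 * pow(a, b) = exp(b * ln(a)): ln(a) is recovered from the IEEE-754 exponent
 * bits plus a polynomial fit of the mantissa (degree selected by
 * POW_POLY_DEGREE), and exp() uses a Cephes-style range reduction. Because
 * the sign bit is masked off, the SIMD kernels effectively compute
 * pow(|a|, b); inputs are expected to be positive and finite.
 *
 * A minimal usage sketch, assuming the standard VOLK dispatcher and the
 * volk_malloc()/volk_free() API:
 *
 * \code
 * unsigned int N = 1024;
 * size_t alignment = volk_get_alignment();
 * float* a = (float*)volk_malloc(sizeof(float) * N, alignment);
 * float* b = (float*)volk_malloc(sizeof(float) * N, alignment);
 * float* c = (float*)volk_malloc(sizeof(float) * N, alignment);
 * for (unsigned int i = 0; i < N; i++) {
 *     a[i] = 2.0f;         // base
 *     b[i] = (float)i / N; // exponent
 * }
 * volk_32f_x2_pow_32f(c, b, a, N); // c[i] = pow(a[i], b[i])
 * volk_free(a);
 * volk_free(b);
 * volk_free(c);
 * \endcode
 */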

#define POW_POLY_DEGREE 3

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
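
/* POLYn(x, c0, ..., cn) evaluates the degree-n polynomial
 * c0 + c1*x + ... + cn*x^n by Horner's rule; with FMA every degree costs a
 * single fused multiply-add. */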

static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;
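
    /* Cephes-style expf() constants: exp_hi/exp_lo clamp the argument to
     * the range where a finite float result exists, log2EF is log2(e), and
     * exp_C1 + exp_C2 is ln(2) split into high and low parts for an exact
     * range reduction. exp_p0..exp_p5 approximate the Taylor terms
     * 1/7!, 1/6!, ..., 1/2!. */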
    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        // log2(a): unbiased exponent from the float bits, mantissa in [1, 2)
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f,
                                  -3.3241990f,
                                  2.5988452f,
                                  -1.2315303f,
                                  3.1821337e-1f,
                                  -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f,
                                  -2.52074962577807006663f,
                                  1.48116647521213171641f,
                                  -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f,
                                  -1.75647175389045657003f,
                                  0.688243882994381274313f,
                                  -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f,
                                  -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error
#endif

        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2); // log2 -> natural log
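
        // b * ln(a): pow(a, b) == exp(b * ln(a))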
        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);
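
        /* exp(x) by range reduction: split x = n*ln(2) + r, evaluate a
         * polynomial for exp(r), then scale the result by 2^n. */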
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);
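
        // build 2^n by writing n + 127 straight into the exponent bits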
        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector,
                                              const float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif

        logarithm = _mm256_add_ps(
            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
        y = _mm256_add_ps(y, one);
        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}
#endif /* LV_HAVE_AVX2 for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector,
                                                const float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m128 tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i bias, exp, emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);
    exp_lo = _mm_set1_ps(-88.3762626647949);
    ln2 = _mm_set1_ps(0.6931471805);
    log2EF = _mm_set1_ps(1.44269504088896341);
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);
    exp_C2 = _mm_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm_set1_epi32(0x7f);

    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        logarithm = _mm_cvtepi32_ps(exp);

        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        logarithm =
            _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        logarithm = _mm_mul_ps(logarithm, ln2);

        bVal = _mm_load_ps(bPtr);
        bVal = _mm_mul_ps(bVal, logarithm);

        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
        z = _mm_mul_ps(bVal, bVal);
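
        /* The same degree-5 polynomial as the AVX2 kernels, evaluated in a
         * reassociated order; expanded, both give
         * exp(r) ~= 1 + r + r^2*(p5 + p4*r + ... + p0*r^5). */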
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
        y = _mm_add_ps(y, one);

        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        cVal = _mm_mul_ps(y, pow2n);

        _mm_store_ps(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}
#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */

#ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
#define INCLUDED_volk_32f_x2_pow_32f_u_H

#include <math.h>

#define POW_POLY_DEGREE 3

#ifdef LV_HAVE_GENERIC

static inline void volk_32f_x2_pow_32f_generic(float* cVector,
                                               const float* bVector,
                                               const float* aVector,
                                               unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector,
                                                const float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m128 tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i bias, exp, emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);
    exp_lo = _mm_set1_ps(-88.3762626647949);
    ln2 = _mm_set1_ps(0.6931471805);
    log2EF = _mm_set1_ps(1.44269504088896341);
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);
    exp_C2 = _mm_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm_set1_epi32(0x7f);

    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        logarithm = _mm_cvtepi32_ps(exp);

        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        logarithm =
            _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        logarithm = _mm_mul_ps(logarithm, ln2);

        bVal = _mm_loadu_ps(bPtr);
        bVal = _mm_mul_ps(bVal, logarithm);

        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
        z = _mm_mul_ps(bVal, bVal);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
        y = _mm_add_ps(y, one);

        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        cVal = _mm_mul_ps(y, pow2n);

        _mm_storeu_ps(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}
#endif /* LV_HAVE_SSE4_1 for unaligned */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f,
                                  -3.3241990f,
                                  2.5988452f,
                                  -1.2315303f,
                                  3.1821337e-1f,
                                  -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f,
                                  -2.52074962577807006663f,
                                  1.48116647521213171641f,
                                  -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f,
                                  -1.75647175389045657003f,
                                  0.688243882994381274313f,
                                  -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f,
                                  -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error
#endif

        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        bVal = _mm256_loadu_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);
        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector,
                                              const float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif

        logarithm = _mm256_add_ps(
            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        bVal = _mm256_loadu_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
        y = _mm256_add_ps(y, one);
        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}
#endif /* LV_HAVE_AVX2 for unaligned */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

static inline void volk_32f_x2_pow_32f_rvv(float* cVector,
                                           const float* bVector,
                                           const float* aVector,
                                           unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m1();

#if POW_POLY_DEGREE == 6
    const vfloat32m1_t cl5 = __riscv_vfmv_v_f_f32m1(3.1157899f, vlmax);
    const vfloat32m1_t cl4 = __riscv_vfmv_v_f_f32m1(-3.3241990f, vlmax);
    const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(2.5988452f, vlmax);
    const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(-1.2315303f, vlmax);
    const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(3.1821337e-1f, vlmax);
    const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(-3.4436006e-2f, vlmax);
#elif POW_POLY_DEGREE == 5
    const vfloat32m1_t cl4 = __riscv_vfmv_v_f_f32m1(2.8882704548164776201f, vlmax);
    const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(-2.52074962577807006663f, vlmax);
    const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(1.48116647521213171641f, vlmax);
    const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(-0.465725644288844778798f, vlmax);
    const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(0.0596515482674574969533f, vlmax);
#elif POW_POLY_DEGREE == 4
    const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(2.61761038894603480148f, vlmax);
    const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(-1.75647175389045657003f, vlmax);
    const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(0.688243882994381274313f, vlmax);
    const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(-0.107254423828329604454f, vlmax);
#elif POW_POLY_DEGREE == 3
    const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(2.28330284476918490682f, vlmax);
    const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(-1.04913055217340124191f, vlmax);
    const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(0.204446009836232697516f, vlmax);
#else
#error
#endif

    const vfloat32m1_t exp_hi = __riscv_vfmv_v_f_f32m1(88.376259f, vlmax);
    const vfloat32m1_t exp_lo = __riscv_vfmv_v_f_f32m1(-88.376259f, vlmax);
    const vfloat32m1_t log2EF = __riscv_vfmv_v_f_f32m1(1.442695f, vlmax);
    const vfloat32m1_t exp_C1 = __riscv_vfmv_v_f_f32m1(-0.6933594f, vlmax);
    const vfloat32m1_t exp_C2 = __riscv_vfmv_v_f_f32m1(0.000212194f, vlmax);
    const vfloat32m1_t cf1 = __riscv_vfmv_v_f_f32m1(1.0f, vlmax);
    const vfloat32m1_t cf1o2 = __riscv_vfmv_v_f_f32m1(0.5f, vlmax);
    const vfloat32m1_t ln2 = __riscv_vfmv_v_f_f32m1(0.6931471805f, vlmax);
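
    /* exp_C1/exp_C2 carry the opposite signs of the x86 kernels because the
     * range reduction below accumulates with vfmacc (vb += C * fx) rather
     * than a negated multiply-add. */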
    const vfloat32m1_t ce0 = __riscv_vfmv_v_f_f32m1(1.9875691500e-4, vlmax);
    const vfloat32m1_t ce1 = __riscv_vfmv_v_f_f32m1(1.3981999507e-3, vlmax);
    const vfloat32m1_t ce2 = __riscv_vfmv_v_f_f32m1(8.3334519073e-3, vlmax);
    const vfloat32m1_t ce3 = __riscv_vfmv_v_f_f32m1(4.1665795894e-2, vlmax);
    const vfloat32m1_t ce4 = __riscv_vfmv_v_f_f32m1(1.6666665459e-1, vlmax);
    const vfloat32m1_t ce5 = __riscv_vfmv_v_f_f32m1(5.0000001201e-1, vlmax);
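
    /* m2 masks the mantissa bits; OR-ing the result with m1, the bit
     * pattern of 1.0f, rebuilds the implicit leading one so frac lands
     * in [1, 2). */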
    const vint32m1_t m1 = __riscv_vreinterpret_i32m1(cf1);
    const vint32m1_t m2 = __riscv_vmv_v_x_i32m1(0x7FFFFF, vlmax);
    const vint32m1_t c127 = __riscv_vmv_v_x_i32m1(127, vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m1(n);
        vfloat32m1_t va = __riscv_vle32_v_f32m1(aVector, vl);

        // log2(|a|): unbiased exponent from the float bits
        vfloat32m1_t a = __riscv_vfabs(va, vl);
        vfloat32m1_t exp = __riscv_vfcvt_f(
            __riscv_vsub(
                __riscv_vsra(__riscv_vreinterpret_i32m1(a), 23, vl), c127, vl),
            vl);
        vfloat32m1_t frac = __riscv_vreinterpret_f32m1(__riscv_vor(
            __riscv_vand(__riscv_vreinterpret_i32m1(va), m2, vl), m1, vl));

        vfloat32m1_t mant = cl0;
        mant = __riscv_vfmadd(mant, frac, cl1, vl);
        mant = __riscv_vfmadd(mant, frac, cl2, vl);
#if POW_POLY_DEGREE >= 4
        mant = __riscv_vfmadd(mant, frac, cl3, vl);
#if POW_POLY_DEGREE >= 5
        mant = __riscv_vfmadd(mant, frac, cl4, vl);
#if POW_POLY_DEGREE >= 6
        mant = __riscv_vfmadd(mant, frac, cl5, vl);
#endif
#endif
#endif

        vfloat32m1_t log = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl);
        log = __riscv_vfmul(log, ln2, vl);

        vfloat32m1_t vb = __riscv_vle32_v_f32m1(bVector, vl);
        vb = __riscv_vfmul(vb, log, vl);

        vb = __riscv_vfmin(vb, exp_hi, vl);
        vb = __riscv_vfmax(vb, exp_lo, vl);
        vfloat32m1_t fx = __riscv_vfmadd(vb, log2EF, cf1o2, vl);
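
        /* floor(fx): truncate toward zero, then subtract 1 in the lanes
         * where truncation rounded up (negative non-integers); the mask
         * leaves the other lanes unchanged. */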
        vfloat32m1_t rtz = __riscv_vfcvt_f(__riscv_vfcvt_rtz_x(fx, vl), vl);
        fx = __riscv_vfsub_mu(__riscv_vmfgt(rtz, fx, vl), rtz, rtz, cf1, vl);
        vb = __riscv_vfmacc(vb, exp_C1, fx, vl);
        vb = __riscv_vfmacc(vb, exp_C2, fx, vl);
        vfloat32m1_t vv = __riscv_vfmul(vb, vb, vl);

        vfloat32m1_t y = ce0;
        y = __riscv_vfmadd(y, vb, ce1, vl);
        y = __riscv_vfmadd(y, vb, ce2, vl);
        y = __riscv_vfmadd(y, vb, ce3, vl);
        y = __riscv_vfmadd(y, vb, ce4, vl);
        y = __riscv_vfmadd(y, vb, ce5, vl);
        y = __riscv_vfmadd(y, vv, vb, vl);
        y = __riscv_vfadd(y, cf1, vl);

        vfloat32m1_t pow2n = __riscv_vreinterpret_f32m1(__riscv_vsll(
            __riscv_vadd(__riscv_vfcvt_rtz_x(fx, vl), c127, vl), 23, vl));

        exp = __riscv_vfmul(y, pow2n, vl);

        __riscv_vse32(cVector, exp, vl);
    }
}
#endif /* LV_HAVE_RVV */

#endif /* INCLUDED_volk_32f_x2_pow_32f_u_H */