79#ifndef INCLUDED_volk_32f_log2_32f_a_H
80#define INCLUDED_volk_32f_log2_32f_a_H
87#define LOG_POLY_DEGREE 6
94 float* bPtr = bVector;
95 const float* aPtr = aVector;
96 unsigned int number = 0;
98 for (number = 0; number < num_points; number++) {
104#if LV_HAVE_AVX2 && LV_HAVE_FMA
105#include <immintrin.h>
107#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
108#define POLY1_FMAAVX2(x, c0, c1) \
109 _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
110#define POLY2_FMAAVX2(x, c0, c1, c2) \
111 _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
112#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
113 _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
114#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
115 _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
116#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
117 _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
119static inline void volk_32f_log2_32f_a_avx2_fma(
float* bVector,
120 const float* aVector,
121 unsigned int num_points)
123 float* bPtr = bVector;
124 const float* aPtr = aVector;
126 unsigned int number = 0;
127 const unsigned int eighthPoints = num_points / 8;
129 __m256 aVal, bVal, mantissa, frac, leadingOne;
132 for (; number < eighthPoints; number++) {
134 aVal = _mm256_load_ps(aPtr);
135 bias = _mm256_set1_epi32(127);
136 leadingOne = _mm256_set1_ps(1.0f);
137 exp = _mm256_sub_epi32(
138 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
139 _mm256_set1_epi32(0x7f800000)),
142 bVal = _mm256_cvtepi32_ps(exp);
147 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
149#if LOG_POLY_DEGREE == 6
150 mantissa = POLY5_FMAAVX2(frac,
157#elif LOG_POLY_DEGREE == 5
158 mantissa = POLY4_FMAAVX2(frac,
159 2.8882704548164776201f,
160 -2.52074962577807006663f,
161 1.48116647521213171641f,
162 -0.465725644288844778798f,
163 0.0596515482674574969533f);
164#elif LOG_POLY_DEGREE == 4
165 mantissa = POLY3_FMAAVX2(frac,
166 2.61761038894603480148f,
167 -1.75647175389045657003f,
168 0.688243882994381274313f,
169 -0.107254423828329604454f);
170#elif LOG_POLY_DEGREE == 3
171 mantissa = POLY2_FMAAVX2(frac,
172 2.28330284476918490682f,
173 -1.04913055217340124191f,
174 0.204446009836232697516f);
179 bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
180 _mm256_store_ps(bPtr, bVal);
186 number = eighthPoints * 8;
193#include <immintrin.h>
195#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
196#define POLY1_AVX2(x, c0, c1) \
197 _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
198#define POLY2_AVX2(x, c0, c1, c2) \
199 _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
200#define POLY3_AVX2(x, c0, c1, c2, c3) \
201 _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
202#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
203 _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
204#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
205 _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
208volk_32f_log2_32f_a_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
210 float* bPtr = bVector;
211 const float* aPtr = aVector;
213 unsigned int number = 0;
214 const unsigned int eighthPoints = num_points / 8;
216 __m256 aVal, bVal, mantissa, frac, leadingOne;
219 for (; number < eighthPoints; number++) {
221 aVal = _mm256_load_ps(aPtr);
222 bias = _mm256_set1_epi32(127);
223 leadingOne = _mm256_set1_ps(1.0f);
224 exp = _mm256_sub_epi32(
225 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
226 _mm256_set1_epi32(0x7f800000)),
229 bVal = _mm256_cvtepi32_ps(exp);
234 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
236#if LOG_POLY_DEGREE == 6
237 mantissa = POLY5_AVX2(frac,
244#elif LOG_POLY_DEGREE == 5
245 mantissa = POLY4_AVX2(frac,
246 2.8882704548164776201f,
247 -2.52074962577807006663f,
248 1.48116647521213171641f,
249 -0.465725644288844778798f,
250 0.0596515482674574969533f);
251#elif LOG_POLY_DEGREE == 4
252 mantissa = POLY3_AVX2(frac,
253 2.61761038894603480148f,
254 -1.75647175389045657003f,
255 0.688243882994381274313f,
256 -0.107254423828329604454f);
257#elif LOG_POLY_DEGREE == 3
258 mantissa = POLY2_AVX2(frac,
259 2.28330284476918490682f,
260 -1.04913055217340124191f,
261 0.204446009836232697516f);
267 _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
268 _mm256_store_ps(bPtr, bVal);
274 number = eighthPoints * 8;
281#include <smmintrin.h>
283#define POLY0(x, c0) _mm_set1_ps(c0)
284#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
285#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
286#define POLY3(x, c0, c1, c2, c3) \
287 _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
288#define POLY4(x, c0, c1, c2, c3, c4) \
289 _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
290#define POLY5(x, c0, c1, c2, c3, c4, c5) \
291 _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
294volk_32f_log2_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
296 float* bPtr = bVector;
297 const float* aPtr = aVector;
299 unsigned int number = 0;
300 const unsigned int quarterPoints = num_points / 4;
302 __m128 aVal, bVal, mantissa, frac, leadingOne;
305 for (; number < quarterPoints; number++) {
307 aVal = _mm_load_ps(aPtr);
308 bias = _mm_set1_epi32(127);
309 leadingOne = _mm_set1_ps(1.0f);
312 _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
314 bVal = _mm_cvtepi32_ps(exp);
317 frac = _mm_or_ps(leadingOne,
318 _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
320#if LOG_POLY_DEGREE == 6
321 mantissa = POLY5(frac,
328#elif LOG_POLY_DEGREE == 5
329 mantissa = POLY4(frac,
330 2.8882704548164776201f,
331 -2.52074962577807006663f,
332 1.48116647521213171641f,
333 -0.465725644288844778798f,
334 0.0596515482674574969533f);
335#elif LOG_POLY_DEGREE == 4
336 mantissa = POLY3(frac,
337 2.61761038894603480148f,
338 -1.75647175389045657003f,
339 0.688243882994381274313f,
340 -0.107254423828329604454f);
341#elif LOG_POLY_DEGREE == 3
342 mantissa = POLY2(frac,
343 2.28330284476918490682f,
344 -1.04913055217340124191f,
345 0.204446009836232697516f);
350 bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
351 _mm_store_ps(bPtr, bVal);
357 number = quarterPoints * 4;
367#define VLOG2Q_NEON_PREAMBLE() \
368 int32x4_t one = vdupq_n_s32(0x000800000); \
370 float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
371 float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
372 float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
373 float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
374 float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
375 float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
376 float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
377 int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
378 int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
379 int32x4_t exp_bias = vdupq_n_s32(127);
382#define VLOG2Q_NEON_F32(log2_approx, aval) \
383 int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
384 int32x4_t significand_i = vandq_s32(aval, sig_mask); \
385 exponent_i = vshrq_n_s32(exponent_i, 23); \
390 significand_i = vorrq_s32(one, significand_i); \
391 float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23); \
393 exponent_i = vsubq_s32(exponent_i, exp_bias); \
394 float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
398 log2_approx = vaddq_f32(exponent_f, p0); \
399 float32x4_t tmp1 = vmulq_f32(significand_f, p1); \
400 log2_approx = vaddq_f32(log2_approx, tmp1); \
401 float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); \
402 tmp1 = vmulq_f32(sig_2, p2); \
403 log2_approx = vaddq_f32(log2_approx, tmp1); \
405 float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); \
406 tmp1 = vmulq_f32(sig_3, p3); \
407 log2_approx = vaddq_f32(log2_approx, tmp1); \
408 float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); \
409 tmp1 = vmulq_f32(sig_4, p4); \
410 log2_approx = vaddq_f32(log2_approx, tmp1); \
411 float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); \
412 tmp1 = vmulq_f32(sig_5, p5); \
413 log2_approx = vaddq_f32(log2_approx, tmp1); \
414 float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); \
415 tmp1 = vmulq_f32(sig_6, p6); \
416 log2_approx = vaddq_f32(log2_approx, tmp1);
421 float* bPtr = bVector;
422 const float* aPtr = aVector;
424 const unsigned int quarterPoints = num_points / 4;
427 float32x4_t log2_approx;
438 for (number = 0; number < quarterPoints; ++number) {
440 aval = vld1q_s32((
int*)aPtr);
444 vst1q_f32(bPtr, log2_approx);
450 number = quarterPoints * 4;
459#ifndef INCLUDED_volk_32f_log2_32f_u_H
460#define INCLUDED_volk_32f_log2_32f_u_H
464#include <smmintrin.h>
466#define POLY0(x, c0) _mm_set1_ps(c0)
467#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
468#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
469#define POLY3(x, c0, c1, c2, c3) \
470 _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
471#define POLY4(x, c0, c1, c2, c3, c4) \
472 _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
473#define POLY5(x, c0, c1, c2, c3, c4, c5) \
474 _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
477volk_32f_log2_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
479 float* bPtr = bVector;
480 const float* aPtr = aVector;
482 unsigned int number = 0;
483 const unsigned int quarterPoints = num_points / 4;
485 __m128 aVal, bVal, mantissa, frac, leadingOne;
488 for (; number < quarterPoints; number++) {
490 aVal = _mm_loadu_ps(aPtr);
491 bias = _mm_set1_epi32(127);
492 leadingOne = _mm_set1_ps(1.0f);
495 _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
497 bVal = _mm_cvtepi32_ps(exp);
500 frac = _mm_or_ps(leadingOne,
501 _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
503#if LOG_POLY_DEGREE == 6
504 mantissa = POLY5(frac,
511#elif LOG_POLY_DEGREE == 5
512 mantissa = POLY4(frac,
513 2.8882704548164776201f,
514 -2.52074962577807006663f,
515 1.48116647521213171641f,
516 -0.465725644288844778798f,
517 0.0596515482674574969533f);
518#elif LOG_POLY_DEGREE == 4
519 mantissa = POLY3(frac,
520 2.61761038894603480148f,
521 -1.75647175389045657003f,
522 0.688243882994381274313f,
523 -0.107254423828329604454f);
524#elif LOG_POLY_DEGREE == 3
525 mantissa = POLY2(frac,
526 2.28330284476918490682f,
527 -1.04913055217340124191f,
528 0.204446009836232697516f);
533 bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
534 _mm_storeu_ps(bPtr, bVal);
540 number = quarterPoints * 4;
546#if LV_HAVE_AVX2 && LV_HAVE_FMA
547#include <immintrin.h>
549#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
550#define POLY1_FMAAVX2(x, c0, c1) \
551 _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
552#define POLY2_FMAAVX2(x, c0, c1, c2) \
553 _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
554#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
555 _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
556#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
557 _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
558#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
559 _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
561static inline void volk_32f_log2_32f_u_avx2_fma(
float* bVector,
562 const float* aVector,
563 unsigned int num_points)
565 float* bPtr = bVector;
566 const float* aPtr = aVector;
568 unsigned int number = 0;
569 const unsigned int eighthPoints = num_points / 8;
571 __m256 aVal, bVal, mantissa, frac, leadingOne;
574 for (; number < eighthPoints; number++) {
576 aVal = _mm256_loadu_ps(aPtr);
577 bias = _mm256_set1_epi32(127);
578 leadingOne = _mm256_set1_ps(1.0f);
579 exp = _mm256_sub_epi32(
580 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
581 _mm256_set1_epi32(0x7f800000)),
584 bVal = _mm256_cvtepi32_ps(exp);
589 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
591#if LOG_POLY_DEGREE == 6
592 mantissa = POLY5_FMAAVX2(frac,
599#elif LOG_POLY_DEGREE == 5
600 mantissa = POLY4_FMAAVX2(frac,
601 2.8882704548164776201f,
602 -2.52074962577807006663f,
603 1.48116647521213171641f,
604 -0.465725644288844778798f,
605 0.0596515482674574969533f);
606#elif LOG_POLY_DEGREE == 4
607 mantissa = POLY3_FMAAVX2(frac,
608 2.61761038894603480148f,
609 -1.75647175389045657003f,
610 0.688243882994381274313f,
611 -0.107254423828329604454f);
612#elif LOG_POLY_DEGREE == 3
613 mantissa = POLY2_FMAAVX2(frac,
614 2.28330284476918490682f,
615 -1.04913055217340124191f,
616 0.204446009836232697516f);
621 bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
622 _mm256_storeu_ps(bPtr, bVal);
628 number = eighthPoints * 8;
635#include <immintrin.h>
637#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
638#define POLY1_AVX2(x, c0, c1) \
639 _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
640#define POLY2_AVX2(x, c0, c1, c2) \
641 _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
642#define POLY3_AVX2(x, c0, c1, c2, c3) \
643 _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
644#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
645 _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
646#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
647 _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
650volk_32f_log2_32f_u_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
652 float* bPtr = bVector;
653 const float* aPtr = aVector;
655 unsigned int number = 0;
656 const unsigned int eighthPoints = num_points / 8;
658 __m256 aVal, bVal, mantissa, frac, leadingOne;
661 for (; number < eighthPoints; number++) {
663 aVal = _mm256_loadu_ps(aPtr);
664 bias = _mm256_set1_epi32(127);
665 leadingOne = _mm256_set1_ps(1.0f);
666 exp = _mm256_sub_epi32(
667 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
668 _mm256_set1_epi32(0x7f800000)),
671 bVal = _mm256_cvtepi32_ps(exp);
676 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
678#if LOG_POLY_DEGREE == 6
679 mantissa = POLY5_AVX2(frac,
686#elif LOG_POLY_DEGREE == 5
687 mantissa = POLY4_AVX2(frac,
688 2.8882704548164776201f,
689 -2.52074962577807006663f,
690 1.48116647521213171641f,
691 -0.465725644288844778798f,
692 0.0596515482674574969533f);
693#elif LOG_POLY_DEGREE == 4
694 mantissa = POLY3_AVX2(frac,
695 2.61761038894603480148f,
696 -1.75647175389045657003f,
697 0.688243882994381274313f,
698 -0.107254423828329604454f);
699#elif LOG_POLY_DEGREE == 3
700 mantissa = POLY2_AVX2(frac,
701 2.28330284476918490682f,
702 -1.04913055217340124191f,
703 0.204446009836232697516f);
709 _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
710 _mm256_storeu_ps(bPtr, bVal);
716 number = eighthPoints * 8;
723#include <riscv_vector.h>
726volk_32f_log2_32f_rvv(
float* bVector,
const float* aVector,
unsigned int num_points)
728 size_t vlmax = __riscv_vsetvlmax_e32m2();
730#if LOG_POLY_DEGREE == 6
731 const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(3.1157899f, vlmax);
732 const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(-3.3241990f, vlmax);
733 const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.5988452f, vlmax);
734 const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.2315303f, vlmax);
735 const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(3.1821337e-1f, vlmax);
736 const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-3.4436006e-2f, vlmax);
737#elif LOG_POLY_DEGREE == 5
738 const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(2.8882704548164776201f, vlmax);
739 const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-2.52074962577807006663f, vlmax);
740 const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(1.48116647521213171641f, vlmax);
741 const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-0.465725644288844778798f, vlmax);
742 const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.0596515482674574969533f, vlmax);
743#elif LOG_POLY_DEGREE == 4
744 const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.61761038894603480148f, vlmax);
745 const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.75647175389045657003f, vlmax);
746 const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(0.688243882994381274313f, vlmax);
747 const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-0.107254423828329604454f, vlmax);
748#elif LOG_POLY_DEGREE == 3
749 const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(2.28330284476918490682f, vlmax);
750 const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-1.04913055217340124191f, vlmax);
751 const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.204446009836232697516f, vlmax);
756 const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
757 const vint32m2_t m1 = __riscv_vreinterpret_i32m2(cf1);
758 const vint32m2_t m2 = __riscv_vmv_v_x_i32m2(0x7FFFFF, vlmax);
759 const vint32m2_t c127 = __riscv_vmv_v_x_i32m2(127, vlmax);
761 size_t n = num_points;
762 for (
size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
763 vl = __riscv_vsetvl_e32m2(n);
764 vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
765 vfloat32m2_t a = __riscv_vfabs(v, vl);
766 vfloat32m2_t exp = __riscv_vfcvt_f(
767 __riscv_vsub(__riscv_vsra(__riscv_vreinterpret_i32m2(a), 23, vl), c127, vl),
769 vfloat32m2_t frac = __riscv_vreinterpret_f32m2(
770 __riscv_vor(__riscv_vand(__riscv_vreinterpret_i32m2(v), m2, vl), m1, vl));
772 vfloat32m2_t mant = c0;
773 mant = __riscv_vfmadd(mant, frac, c1, vl);
774 mant = __riscv_vfmadd(mant, frac, c2, vl);
775#if LOG_POLY_DEGREE >= 4
776 mant = __riscv_vfmadd(mant, frac, c3, vl);
777#if LOG_POLY_DEGREE >= 5
778 mant = __riscv_vfmadd(mant, frac, c4, vl);
779#if LOG_POLY_DEGREE >= 6
780 mant = __riscv_vfmadd(mant, frac, c5, vl);
784 exp = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl);
786 __riscv_vse32(bVector, exp, vl);
static void volk_32f_log2_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_log2_32f.h:416
static void volk_32f_log2_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_log2_32f.h:92
#define VLOG2Q_NEON_F32(log2_approx, aval)
Definition volk_32f_log2_32f.h:382
#define VLOG2Q_NEON_PREAMBLE()
Definition volk_32f_log2_32f.h:367
static float log2f_non_ieee(float f)
Definition volk_common.h:155