/*
 * Constants of the fast exponential approximation.
 *
 * The kernels below compute round(x * A / Mln2 + (B - C)) as a 32-bit
 * integer and reinterpret that integer's bits as an IEEE-754 float.
 *
 * NOTE(review): A and C are referenced by every kernel in this file but
 * their definitions were missing from this listing; the values below are
 * the ones believed to be used by upstream VOLK -- confirm before merging.
 */
#define Mln2 0.6931471805f /* natural logarithm of 2 */
#define A 8388608.0f       /* 2^23: one unit of the float exponent field */
#define B 1065353216.0f    /* 127 * 2^23 (0x3F800000): bit pattern of 1.0f */
#define C 60801.0f         /* bias correction subtracted from B -- TODO confirm */
62#ifndef INCLUDED_volk_32f_expfast_32f_a_H
63#define INCLUDED_volk_32f_expfast_32f_a_H
65#if LV_HAVE_AVX && LV_HAVE_FMA
69static inline void volk_32f_expfast_32f_a_avx_fma(
float* bVector,
71 unsigned int num_points)
73 float* bPtr = bVector;
74 const float* aPtr = aVector;
76 unsigned int number = 0;
77 const unsigned int eighthPoints = num_points / 8;
79 __m256 aVal, bVal, a, b;
81 a = _mm256_set1_ps(
A /
Mln2);
82 b = _mm256_set1_ps(
B -
C);
84 for (; number < eighthPoints; number++) {
85 aVal = _mm256_load_ps(aPtr);
86 exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
87 bVal = _mm256_castsi256_ps(exp);
89 _mm256_store_ps(bPtr, bVal);
94 number = eighthPoints * 8;
95 for (; number < num_points; number++) {
96 *bPtr++ = expf(*aPtr++);
104#include <immintrin.h>
109 float* bPtr = bVector;
110 const float* aPtr = aVector;
112 unsigned int number = 0;
113 const unsigned int eighthPoints = num_points / 8;
115 __m256 aVal, bVal, a, b;
117 a = _mm256_set1_ps(
A /
Mln2);
118 b = _mm256_set1_ps(
B -
C);
120 for (; number < eighthPoints; number++) {
121 aVal = _mm256_load_ps(aPtr);
122 exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
123 bVal = _mm256_castsi256_ps(exp);
125 _mm256_store_ps(bPtr, bVal);
130 number = eighthPoints * 8;
131 for (; number < num_points; number++) {
132 *bPtr++ = expf(*aPtr++);
139#include <smmintrin.h>
141static inline void volk_32f_expfast_32f_a_sse4_1(
float* bVector,
142 const float* aVector,
143 unsigned int num_points)
145 float* bPtr = bVector;
146 const float* aPtr = aVector;
148 unsigned int number = 0;
149 const unsigned int quarterPoints = num_points / 4;
151 __m128 aVal, bVal, a, b;
153 a = _mm_set1_ps(
A /
Mln2);
154 b = _mm_set1_ps(
B -
C);
156 for (; number < quarterPoints; number++) {
157 aVal = _mm_load_ps(aPtr);
158 exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
159 bVal = _mm_castsi128_ps(exp);
161 _mm_store_ps(bPtr, bVal);
166 number = quarterPoints * 4;
167 for (; number < num_points; number++) {
168 *bPtr++ = expf(*aPtr++);
176#ifndef INCLUDED_volk_32f_expfast_32f_u_H
177#define INCLUDED_volk_32f_expfast_32f_u_H
179#if LV_HAVE_AVX && LV_HAVE_FMA
180#include <immintrin.h>
182static inline void volk_32f_expfast_32f_u_avx_fma(
float* bVector,
183 const float* aVector,
184 unsigned int num_points)
186 float* bPtr = bVector;
187 const float* aPtr = aVector;
189 unsigned int number = 0;
190 const unsigned int eighthPoints = num_points / 8;
192 __m256 aVal, bVal, a, b;
194 a = _mm256_set1_ps(
A /
Mln2);
195 b = _mm256_set1_ps(
B -
C);
197 for (; number < eighthPoints; number++) {
198 aVal = _mm256_loadu_ps(aPtr);
199 exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
200 bVal = _mm256_castsi256_ps(exp);
202 _mm256_storeu_ps(bPtr, bVal);
207 number = eighthPoints * 8;
208 for (; number < num_points; number++) {
209 *bPtr++ = expf(*aPtr++);
216#include <immintrin.h>
221 float* bPtr = bVector;
222 const float* aPtr = aVector;
224 unsigned int number = 0;
225 const unsigned int eighthPoints = num_points / 8;
227 __m256 aVal, bVal, a, b;
229 a = _mm256_set1_ps(
A /
Mln2);
230 b = _mm256_set1_ps(
B -
C);
232 for (; number < eighthPoints; number++) {
233 aVal = _mm256_loadu_ps(aPtr);
234 exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
235 bVal = _mm256_castsi256_ps(exp);
237 _mm256_storeu_ps(bPtr, bVal);
242 number = eighthPoints * 8;
243 for (; number < num_points; number++) {
244 *bPtr++ = expf(*aPtr++);
252#include <smmintrin.h>
254static inline void volk_32f_expfast_32f_u_sse4_1(
float* bVector,
255 const float* aVector,
256 unsigned int num_points)
258 float* bPtr = bVector;
259 const float* aPtr = aVector;
261 unsigned int number = 0;
262 const unsigned int quarterPoints = num_points / 4;
264 __m128 aVal, bVal, a, b;
266 a = _mm_set1_ps(
A /
Mln2);
267 b = _mm_set1_ps(
B -
C);
269 for (; number < quarterPoints; number++) {
270 aVal = _mm_loadu_ps(aPtr);
271 exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
272 bVal = _mm_castsi128_ps(exp);
274 _mm_storeu_ps(bPtr, bVal);
279 number = quarterPoints * 4;
280 for (; number < num_points; number++) {
281 *bPtr++ = expf(*aPtr++);
288#ifdef LV_HAVE_GENERIC
291 const float* aVector,
292 unsigned int num_points)
294 float* bPtr = bVector;
295 const float* aPtr = aVector;
296 unsigned int number = 0;
298 for (number = 0; number < num_points; number++) {
299 *bPtr++ = expf(*aPtr++);
305#include <riscv_vector.h>
308volk_32f_expfast_32f_rvv(
float* bVector,
const float* aVector,
unsigned int num_points)
310 size_t vlmax = __riscv_vsetvlmax_e32m8();
311 const vfloat32m8_t ca = __riscv_vfmv_v_f_f32m8(
A /
Mln2, vlmax);
312 const vfloat32m8_t cb = __riscv_vfmv_v_f_f32m8(
B -
C, vlmax);
314 size_t n = num_points;
315 for (
size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
316 vl = __riscv_vsetvl_e32m8(n);
317 vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl);
318 v = __riscv_vfmadd(v, ca, cb, vl);
319 v = __riscv_vreinterpret_f32m8(__riscv_vfcvt_x(v, vl));
320 __riscv_vse32(bVector, v, vl);
#define Mln2
Definition volk_32f_expfast_32f.h:56
#define B
Definition volk_32f_expfast_32f.h:58
#define A
Definition volk_32f_expfast_32f.h:57
static void volk_32f_expfast_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_expfast_32f.h:219
static void volk_32f_expfast_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_expfast_32f.h:290
#define C
Definition volk_32f_expfast_32f.h:59
static void volk_32f_expfast_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_expfast_32f.h:107