86#ifndef INCLUDED_volk_32f_exp_32f_a_H
87#define INCLUDED_volk_32f_exp_32f_a_H
95 float* bPtr = bVector;
96 const float* aPtr = aVector;
98 unsigned int number = 0;
99 unsigned int quarterPoints = num_points / 4;
102 __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
103 __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
104 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
105 __m128i emm0, pi32_0x7f;
107 one = _mm_set1_ps(1.0);
108 exp_hi = _mm_set1_ps(88.3762626647949);
109 exp_lo = _mm_set1_ps(-88.3762626647949);
110 log2EF = _mm_set1_ps(1.44269504088896341);
111 half = _mm_set1_ps(0.5);
112 exp_C1 = _mm_set1_ps(0.693359375);
113 exp_C2 = _mm_set1_ps(-2.12194440e-4);
114 pi32_0x7f = _mm_set1_epi32(0x7f);
116 exp_p0 = _mm_set1_ps(1.9875691500e-4);
117 exp_p1 = _mm_set1_ps(1.3981999507e-3);
118 exp_p2 = _mm_set1_ps(8.3334519073e-3);
119 exp_p3 = _mm_set1_ps(4.1665795894e-2);
120 exp_p4 = _mm_set1_ps(1.6666665459e-1);
121 exp_p5 = _mm_set1_ps(5.0000001201e-1);
123 for (; number < quarterPoints; number++) {
124 aVal = _mm_load_ps(aPtr);
125 tmp = _mm_setzero_ps();
127 aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
130 fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
132 emm0 = _mm_cvttps_epi32(fx);
133 tmp = _mm_cvtepi32_ps(emm0);
135 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
136 fx = _mm_sub_ps(tmp, mask);
138 tmp = _mm_mul_ps(fx, exp_C1);
139 z = _mm_mul_ps(fx, exp_C2);
140 aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
141 z = _mm_mul_ps(aVal, aVal);
143 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
144 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
145 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
146 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
147 y = _mm_add_ps(y, one);
149 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
151 pow2n = _mm_castsi128_ps(emm0);
152 bVal = _mm_mul_ps(y, pow2n);
154 _mm_store_ps(bPtr, bVal);
159 number = quarterPoints * 4;
160 for (; number < num_points; number++) {
161 *bPtr++ = expf(*aPtr++);
170#ifndef INCLUDED_volk_32f_exp_32f_u_H
171#define INCLUDED_volk_32f_exp_32f_u_H
174#include <emmintrin.h>
179 float* bPtr = bVector;
180 const float* aPtr = aVector;
182 unsigned int number = 0;
183 unsigned int quarterPoints = num_points / 4;
186 __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
187 __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
188 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
189 __m128i emm0, pi32_0x7f;
191 one = _mm_set1_ps(1.0);
192 exp_hi = _mm_set1_ps(88.3762626647949);
193 exp_lo = _mm_set1_ps(-88.3762626647949);
194 log2EF = _mm_set1_ps(1.44269504088896341);
195 half = _mm_set1_ps(0.5);
196 exp_C1 = _mm_set1_ps(0.693359375);
197 exp_C2 = _mm_set1_ps(-2.12194440e-4);
198 pi32_0x7f = _mm_set1_epi32(0x7f);
200 exp_p0 = _mm_set1_ps(1.9875691500e-4);
201 exp_p1 = _mm_set1_ps(1.3981999507e-3);
202 exp_p2 = _mm_set1_ps(8.3334519073e-3);
203 exp_p3 = _mm_set1_ps(4.1665795894e-2);
204 exp_p4 = _mm_set1_ps(1.6666665459e-1);
205 exp_p5 = _mm_set1_ps(5.0000001201e-1);
208 for (; number < quarterPoints; number++) {
209 aVal = _mm_loadu_ps(aPtr);
210 tmp = _mm_setzero_ps();
212 aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
215 fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
217 emm0 = _mm_cvttps_epi32(fx);
218 tmp = _mm_cvtepi32_ps(emm0);
220 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
221 fx = _mm_sub_ps(tmp, mask);
223 tmp = _mm_mul_ps(fx, exp_C1);
224 z = _mm_mul_ps(fx, exp_C2);
225 aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
226 z = _mm_mul_ps(aVal, aVal);
228 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
229 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
230 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
231 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
232 y = _mm_add_ps(y, one);
234 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
236 pow2n = _mm_castsi128_ps(emm0);
237 bVal = _mm_mul_ps(y, pow2n);
239 _mm_storeu_ps(bPtr, bVal);
244 number = quarterPoints * 4;
245 for (; number < num_points; number++) {
246 *bPtr++ = expf(*aPtr++);
253#ifdef LV_HAVE_GENERIC
258 float* bPtr = bVector;
259 const float* aPtr = aVector;
260 unsigned int number = 0;
262 for (number = 0; number < num_points; number++) {
263 *bPtr++ = expf(*aPtr++);
270#include <riscv_vector.h>
273volk_32f_exp_32f_rvv(
float* bVector,
const float* aVector,
unsigned int num_points)
275 size_t vlmax = __riscv_vsetvlmax_e32m2();
277 const vfloat32m2_t exp_hi = __riscv_vfmv_v_f_f32m2(88.376259f, vlmax);
278 const vfloat32m2_t exp_lo = __riscv_vfmv_v_f_f32m2(-88.376259f, vlmax);
279 const vfloat32m2_t log2EF = __riscv_vfmv_v_f_f32m2(1.442695f, vlmax);
280 const vfloat32m2_t exp_C1 = __riscv_vfmv_v_f_f32m2(-0.6933594f, vlmax);
281 const vfloat32m2_t exp_C2 = __riscv_vfmv_v_f_f32m2(0.000212194f, vlmax);
282 const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
283 const vfloat32m2_t cf1o2 = __riscv_vfmv_v_f_f32m2(0.5f, vlmax);
285 const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(1.9875691500e-4, vlmax);
286 const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(1.3981999507e-3, vlmax);
287 const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(8.3334519073e-3, vlmax);
288 const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(4.1665795894e-2, vlmax);
289 const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(1.6666665459e-1, vlmax);
290 const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.0000001201e-1, vlmax);
292 size_t n = num_points;
293 for (
size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
294 vl = __riscv_vsetvl_e32m2(n);
295 vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
296 v = __riscv_vfmin(v, exp_hi, vl);
297 v = __riscv_vfmax(v, exp_lo, vl);
298 vfloat32m2_t fx = __riscv_vfmadd(v, log2EF, cf1o2, vl);
300 vfloat32m2_t rtz = __riscv_vfcvt_f(__riscv_vfcvt_rtz_x(fx, vl), vl);
301 fx = __riscv_vfsub_mu(__riscv_vmfgt(rtz, fx, vl), rtz, rtz, cf1, vl);
302 v = __riscv_vfmacc(v, fx, exp_C1, vl);
303 v = __riscv_vfmacc(v, fx, exp_C2, vl);
304 vfloat32m2_t vv = __riscv_vfmul(v, v, vl);
307 y = __riscv_vfmadd(y, v, c1, vl);
308 y = __riscv_vfmadd(y, v, c2, vl);
309 y = __riscv_vfmadd(y, v, c3, vl);
310 y = __riscv_vfmadd(y, v, c4, vl);
311 y = __riscv_vfmadd(y, v, c5, vl);
312 y = __riscv_vfmadd(y, vv, v, vl);
313 y = __riscv_vfadd(y, cf1, vl);
315 vfloat32m2_t pow2n = __riscv_vreinterpret_f32m2(
316 __riscv_vsll(__riscv_vadd(__riscv_vfcvt_rtz_x(fx, vl), 0x7f, vl), 23, vl));
318 __riscv_vse32(bVector, __riscv_vfmul(y, pow2n, vl), vl);
/* Doxygen cross-reference residue from the documentation generator
 * (index entries, not source code) — preserved here as a comment:
 *
 *   static void volk_32f_exp_32f_a_sse2(float *bVector, const float *aVector, unsigned int num_points)
 *   Definition volk_32f_exp_32f.h:93
 *   static void volk_32f_exp_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
 *   Definition volk_32f_exp_32f.h:256
 *   static void volk_32f_exp_32f_u_sse2(float *bVector, const float *aVector, unsigned int num_points)
 *   Definition volk_32f_exp_32f.h:177
 */