Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_log2_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
78
79#ifndef INCLUDED_volk_32f_log2_32f_a_H
80#define INCLUDED_volk_32f_log2_32f_a_H
81
82#include <inttypes.h>
83#include <math.h>
84#include <stdio.h>
85#include <stdlib.h>
86
87#define LOG_POLY_DEGREE 6
88
89#ifdef LV_HAVE_GENERIC
90
/* Scalar reference kernel: writes log2 of every element of aVector into
 * bVector using the portable log2f_non_ieee() helper. Used directly on
 * platforms without SIMD support and as the tail handler for the
 * vectorized kernels below. */
static inline void
volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    for (unsigned int i = 0; i < num_points; i++) {
        bVector[i] = log2f_non_ieee(aVector[i]);
    }
}
102#endif /* LV_HAVE_GENERIC */
103
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/* Horner-scheme polynomial evaluators built on fused multiply-add:
 * POLYn_FMAAVX2(x, c0..cn) computes c0 + c1*x + ... + cn*x^n per lane. */
#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

/* log2 of each input float, eight at a time (AVX2 + FMA, aligned I/O).
 *
 * Method: for x = 2^e * m with m in [1,2), log2(x) = e + log2(m). The
 * biased IEEE-754 exponent field is extracted and debiased to give e;
 * log2(m) is approximated as (m - 1) * P(m), where P is a polynomial of
 * degree LOG_POLY_DEGREE - 1.
 *
 * bVector/aVector must be 32-byte aligned (_mm256_load_ps/_mm256_store_ps).
 * The num_points % 8 tail is delegated to the generic kernel.
 * NOTE(review): zero, negative, denormal and NaN/Inf inputs are not given
 * log2f semantics -- results follow the raw bit manipulation.
 */
static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        // Isolate the exponent bits, shift them to integer position, debias.
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_FMAAVX2(frac,
                                 3.1157899f,
                                 -3.3241990f,
                                 2.5988452f,
                                 -1.2315303f,
                                 3.1821337e-1f,
                                 -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_FMAAVX2(frac,
                                 2.8882704548164776201f,
                                 -2.52074962577807006663f,
                                 1.48116647521213171641f,
                                 -0.465725644288844778798f,
                                 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_FMAAVX2(frac,
                                 2.61761038894603480148f,
                                 -1.75647175389045657003f,
                                 0.688243882994381274313f,
                                 -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_FMAAVX2(frac,
                                 2.28330284476918490682f,
                                 -1.04913055217340124191f,
                                 0.204446009836232697516f);
#else
#error
#endif

        // log2(x) ~= exponent + (frac - 1) * P(frac)
        bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
        _mm256_store_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    // Scalar fallback for the remaining num_points % 8 elements.
    number = eighthPoints * 8;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
191
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/* Horner-scheme polynomial evaluators (mul + add, no FMA):
 * POLYn_AVX2(x, c0..cn) computes c0 + c1*x + ... + cn*x^n per lane. */
#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

/* log2 of each input float, eight at a time (AVX2, aligned I/O).
 *
 * Same decomposition as the FMA variant: log2(x) = exponent + log2(mantissa),
 * with log2(m) approximated by (m - 1) * P(m), P of degree LOG_POLY_DEGREE - 1,
 * but evaluated with separate multiply and add instructions.
 *
 * bVector/aVector must be 32-byte aligned; the num_points % 8 tail is handed
 * to the generic kernel. NOTE(review): non-positive, denormal and NaN/Inf
 * inputs are not handled specially.
 */
static inline void
volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        // Isolate the exponent bits, shift them to integer position, debias.
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif

        // log2(x) ~= exponent + (frac - 1) * P(frac)
        bVal =
            _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
        _mm256_store_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    // Scalar fallback for the remaining num_points % 8 elements.
    number = eighthPoints * 8;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 for aligned */
279
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/* Horner-scheme polynomial evaluators for 128-bit lanes:
 * POLYn(x, c0..cn) computes c0 + c1*x + ... + cn*x^n per lane. */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/* log2 of each input float, four at a time (SSE4.1, aligned I/O).
 *
 * log2(x) = exponent + log2(mantissa); the debiased exponent gives the
 * integer part and (m - 1) * P(m) with a degree LOG_POLY_DEGREE - 1
 * polynomial P gives the fractional part.
 *
 * bVector/aVector must be 16-byte aligned; the num_points % 4 tail is
 * handed to the generic kernel. NOTE(review): non-positive, denormal and
 * NaN/Inf inputs are not handled specially.
 */
static inline void
volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, mantissa, frac, leadingOne;
    __m128i bias, exp;

    for (; number < quarterPoints; number++) {

        aVal = _mm_load_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        // Isolate the exponent bits, shift them to integer position, debias.
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        bVal = _mm_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        // log2(x) ~= exponent + (frac - 1) * P(frac)
        bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        _mm_store_ps(bPtr, bVal);

        aPtr += 4;
        bPtr += 4;
    }

    // Scalar fallback for the remaining num_points % 4 elements.
    number = quarterPoints * 4;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_SSE4_1 for aligned */
362
363#ifdef LV_HAVE_NEON
364#include <arm_neon.h>
365
/* these macros allow us to embed logs in other kernels */

/* Declares, in the enclosing scope, everything VLOG2Q_NEON_F32 needs:
 * the implicit leading-one bit of the significand (0x000800000 ==
 * 0x00800000, bit 23), the degree-6 minimax polynomial coefficients
 * p0..p6 for log2 on [1,2], and the IEEE-754 exponent/significand masks
 * plus the exponent bias (127). Must be invoked once per scope before
 * any use of VLOG2Q_NEON_F32. */
#define VLOG2Q_NEON_PREAMBLE()                   \
    int32x4_t one = vdupq_n_s32(0x000800000);    \
    /* minimax polynomial */                     \
    float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
    float32x4_t p1 = vdupq_n_f32(6.1129631282966113);  \
    float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
    float32x4_t p3 = vdupq_n_f32(3.2865287703753912);  \
    float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
    float32x4_t p5 = vdupq_n_f32(0.2751487703421256);  \
    float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
    int32x4_t exp_mask = vdupq_n_s32(0x7f800000);      \
    int32x4_t sig_mask = vdupq_n_s32(0x007fffff);      \
    int32x4_t exp_bias = vdupq_n_s32(127);


/* Computes log2 of the four floats whose raw bit patterns are in the
 * int32x4_t `aval`, storing the result into the float32x4_t lvalue
 * `log2_approx`. The significand is converted as 23-bit fixed point
 * (vcvtq_n_f32_s32) to yield m in [1,2); the polynomial in m is then
 * accumulated term by term (not Horner) and added to the debiased
 * exponent. Requires VLOG2Q_NEON_PREAMBLE() earlier in the same scope. */
#define VLOG2Q_NEON_F32(log2_approx, aval)                                \
    int32x4_t exponent_i = vandq_s32(aval, exp_mask);                     \
    int32x4_t significand_i = vandq_s32(aval, sig_mask);                  \
    exponent_i = vshrq_n_s32(exponent_i, 23);                             \
                                                                          \
    /* extract the exponent and significand                               \
       we can treat this as fixed point to save ~9% on the                \
       conversion + float add */                                          \
    significand_i = vorrq_s32(one, significand_i);                        \
    float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23);       \
    /* debias the exponent and convert to float */                        \
    exponent_i = vsubq_s32(exponent_i, exp_bias);                         \
    float32x4_t exponent_f = vcvtq_f32_s32(exponent_i);                   \
                                                                          \
    /* put the significand through a polynomial fit of log2(x) [1,2]      \
       add the result to the exponent */                                  \
    log2_approx = vaddq_f32(exponent_f, p0); /* p0 */                     \
    float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */         \
    log2_approx = vaddq_f32(log2_approx, tmp1);                           \
    float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
    tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */                           \
    log2_approx = vaddq_f32(log2_approx, tmp1);                           \
                                                                          \
    float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */        \
    tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */                           \
    log2_approx = vaddq_f32(log2_approx, tmp1);                           \
    float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */                \
    tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */                           \
    log2_approx = vaddq_f32(log2_approx, tmp1);                           \
    float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */                \
    tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */                           \
    log2_approx = vaddq_f32(log2_approx, tmp1);                           \
    float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */                \
    tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */                           \
    log2_approx = vaddq_f32(log2_approx, tmp1);
417
418static inline void
419volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
420{
421 float* bPtr = bVector;
422 const float* aPtr = aVector;
423 unsigned int number;
424 const unsigned int quarterPoints = num_points / 4;
425
426 int32x4_t aval;
427 float32x4_t log2_approx;
428
430 // lms
431 // p0 = vdupq_n_f32(-1.649132280361871);
432 // p1 = vdupq_n_f32(1.995047138579499);
433 // p2 = vdupq_n_f32(-0.336914839219728);
434
435 // keep in mind a single precision float is represented as
436 // (-1)^sign * 2^exp * 1.significand, so the log2 is
437 // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23)
438 for (number = 0; number < quarterPoints; ++number) {
439 // load float in to an int register without conversion
440 aval = vld1q_s32((int*)aPtr);
441
442 VLOG2Q_NEON_F32(log2_approx, aval)
443
444 vst1q_f32(bPtr, log2_approx);
445
446 aPtr += 4;
447 bPtr += 4;
448 }
449
450 number = quarterPoints * 4;
451 volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
452}
453
454#endif /* LV_HAVE_NEON */
455
456
457#endif /* INCLUDED_volk_32f_log2_32f_a_H */
458
459#ifndef INCLUDED_volk_32f_log2_32f_u_H
460#define INCLUDED_volk_32f_log2_32f_u_H
461
462
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/* Horner-scheme polynomial evaluators for 128-bit lanes (benign
 * redefinition -- identical to the macros in the aligned section). */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/* Unaligned-pointer variant of volk_32f_log2_32f_a_sse4_1: identical
 * algorithm (debiased exponent + (m - 1) * P(m) on the mantissa), but
 * uses _mm_loadu_ps/_mm_storeu_ps so no alignment is required. The
 * num_points % 4 tail goes to the generic kernel. */
static inline void
volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, mantissa, frac, leadingOne;
    __m128i bias, exp;

    for (; number < quarterPoints; number++) {

        aVal = _mm_loadu_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        // Isolate the exponent bits, shift them to integer position, debias.
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        bVal = _mm_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        // log2(x) ~= exponent + (frac - 1) * P(frac)
        bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        _mm_storeu_ps(bPtr, bVal);

        aPtr += 4;
        bPtr += 4;
    }

    // Scalar fallback for the remaining num_points % 4 elements.
    number = quarterPoints * 4;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_SSE4_1 for unaligned */
545
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/* FMA Horner evaluators (benign redefinition -- identical to the macros
 * in the aligned section). */
#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

/* Unaligned-pointer variant of volk_32f_log2_32f_a_avx2_fma: identical
 * algorithm (debiased exponent + (m - 1) * P(m) via FMA Horner), but uses
 * _mm256_loadu_ps/_mm256_storeu_ps so no alignment is required. The
 * num_points % 8 tail goes to the generic kernel. */
static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        // Isolate the exponent bits, shift them to integer position, debias.
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_FMAAVX2(frac,
                                 3.1157899f,
                                 -3.3241990f,
                                 2.5988452f,
                                 -1.2315303f,
                                 3.1821337e-1f,
                                 -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_FMAAVX2(frac,
                                 2.8882704548164776201f,
                                 -2.52074962577807006663f,
                                 1.48116647521213171641f,
                                 -0.465725644288844778798f,
                                 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_FMAAVX2(frac,
                                 2.61761038894603480148f,
                                 -1.75647175389045657003f,
                                 0.688243882994381274313f,
                                 -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_FMAAVX2(frac,
                                 2.28330284476918490682f,
                                 -1.04913055217340124191f,
                                 0.204446009836232697516f);
#else
#error
#endif

        // log2(x) ~= exponent + (frac - 1) * P(frac)
        bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
        _mm256_storeu_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    // Scalar fallback for the remaining num_points % 8 elements.
    number = eighthPoints * 8;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
633
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/* Mul/add Horner evaluators (benign redefinition -- identical to the
 * macros in the aligned section). */
#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

/* Unaligned-pointer variant of volk_32f_log2_32f_a_avx2: identical
 * algorithm (debiased exponent + (m - 1) * P(m), no FMA), but uses
 * _mm256_loadu_ps/_mm256_storeu_ps so no alignment is required. The
 * num_points % 8 tail goes to the generic kernel. */
static inline void
volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        // Isolate the exponent bits, shift them to integer position, debias.
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif

        // log2(x) ~= exponent + (frac - 1) * P(frac)
        bVal =
            _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
        _mm256_storeu_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    // Scalar fallback for the remaining num_points % 8 elements.
    number = eighthPoints * 8;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 for unaligned */
721
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/* log2 of each input float using RISC-V Vector intrinsics (LMUL = 2).
 *
 * Same exponent + polynomial-on-mantissa decomposition as the x86
 * kernels. Polynomial coefficients and masks are broadcast once before
 * the stripmined loop; the polynomial is evaluated in Horner form with
 * vfmadd. vsetvl stripmining handles any num_points, so no scalar tail
 * loop is needed. */
static inline void
volk_32f_log2_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m2();

#if LOG_POLY_DEGREE == 6
    const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(3.1157899f, vlmax);
    const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(-3.3241990f, vlmax);
    const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.5988452f, vlmax);
    const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.2315303f, vlmax);
    const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(3.1821337e-1f, vlmax);
    const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-3.4436006e-2f, vlmax);
#elif LOG_POLY_DEGREE == 5
    const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(2.8882704548164776201f, vlmax);
    const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-2.52074962577807006663f, vlmax);
    const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(1.48116647521213171641f, vlmax);
    const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-0.465725644288844778798f, vlmax);
    const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.0596515482674574969533f, vlmax);
#elif LOG_POLY_DEGREE == 4
    const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(2.61761038894603480148f, vlmax);
    const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(-1.75647175389045657003f, vlmax);
    const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(0.688243882994381274313f, vlmax);
    const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(-0.107254423828329604454f, vlmax);
#elif LOG_POLY_DEGREE == 3
    const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(2.28330284476918490682f, vlmax);
    const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(-1.04913055217340124191f, vlmax);
    const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(0.204446009836232697516f, vlmax);
#else
#error
#endif

    // m1 is the bit pattern of 1.0f, OR'd into the masked mantissa bits to
    // form frac in [1,2); m2 masks the mantissa, c127 is the exponent bias.
    const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
    const vint32m2_t m1 = __riscv_vreinterpret_i32m2(cf1);
    const vint32m2_t m2 = __riscv_vmv_v_x_i32m2(0x7FFFFF, vlmax);
    const vint32m2_t c127 = __riscv_vmv_v_x_i32m2(127, vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
        // vfabs clears the sign bit so the arithmetic right shift below
        // yields the biased exponent even for negative inputs.
        vfloat32m2_t a = __riscv_vfabs(v, vl);
        vfloat32m2_t exp = __riscv_vfcvt_f(
            __riscv_vsub(__riscv_vsra(__riscv_vreinterpret_i32m2(a), 23, vl), c127, vl),
            vl);
        vfloat32m2_t frac = __riscv_vreinterpret_f32m2(
            __riscv_vor(__riscv_vand(__riscv_vreinterpret_i32m2(v), m2, vl), m1, vl));

        // Horner evaluation: mant = ((...)*frac + c1)*frac + ... per lane.
        vfloat32m2_t mant = c0;
        mant = __riscv_vfmadd(mant, frac, c1, vl);
        mant = __riscv_vfmadd(mant, frac, c2, vl);
#if LOG_POLY_DEGREE >= 4
        mant = __riscv_vfmadd(mant, frac, c3, vl);
#if LOG_POLY_DEGREE >= 5
        mant = __riscv_vfmadd(mant, frac, c4, vl);
#if LOG_POLY_DEGREE >= 6
        mant = __riscv_vfmadd(mant, frac, c5, vl);
#endif
#endif
#endif
        // log2(x) ~= exponent + (frac - 1) * P(frac)
        exp = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl);

        __riscv_vse32(bVector, exp, vl);
    }
}
#endif /*LV_HAVE_RVV*/
790
791#endif /* INCLUDED_volk_32f_log2_32f_u_H */
static void volk_32f_log2_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_log2_32f.h:416
static void volk_32f_log2_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition volk_32f_log2_32f.h:92
#define VLOG2Q_NEON_F32(log2_approx, aval)
Definition volk_32f_log2_32f.h:382
#define VLOG2Q_NEON_PREAMBLE()
Definition volk_32f_log2_32f.h:367
static float log2f_non_ieee(float f)
Definition volk_common.h:155