Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_avx2_fma_intrinsics.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10/*
11 * This file is intended to hold helper functions composed of AVX2 FMA intrinsics.
12 * They should be used in VOLK kernels to avoid copy-paste.
13 */
14
15#ifndef INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
16#define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
17#include <immintrin.h>
18
/*
 * Polynomial approximation of arctan(x), intended for x in [-1, 1].
 *
 * Evaluates the odd polynomial
 *     a1*x + a3*x^3 + a5*x^5 + ... + a13*x^13
 * as x * P(x^2), where P is computed with Horner's method using one fused
 * multiply-add per coefficient.
 *
 * Maximum relative error ~6.5e-7 on the stated interval.
 *
 * x:       eight packed single-precision inputs
 * returns: eight packed single-precision approximations of arctan(x)
 */
static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
{
    /* Coefficients broadcast to all lanes, listed in evaluation order
     * (highest power first). */
    const __m256 c13 = _mm256_set1_ps(+0x1.01a37cp-7f);
    const __m256 c11 = _mm256_set1_ps(-0x1.2f3004p-5f);
    const __m256 c9 = _mm256_set1_ps(+0x1.5785aap-4f);
    const __m256 c7 = _mm256_set1_ps(-0x1.1436ap-3f);
    const __m256 c5 = _mm256_set1_ps(+0x1.972be6p-3f);
    const __m256 c3 = _mm256_set1_ps(-0x1.55437p-2f);
    const __m256 c1 = _mm256_set1_ps(+0x1.ffffeap-1f);

    /* Horner's recurrence runs in x^2 because only odd powers are present. */
    const __m256 x_sq = _mm256_mul_ps(x, x);

    __m256 acc = _mm256_fmadd_ps(x_sq, c13, c11);
    acc = _mm256_fmadd_ps(x_sq, acc, c9);
    acc = _mm256_fmadd_ps(x_sq, acc, c7);
    acc = _mm256_fmadd_ps(x_sq, acc, c5);
    acc = _mm256_fmadd_ps(x_sq, acc, c3);
    acc = _mm256_fmadd_ps(x_sq, acc, c1);

    /* The final multiply by x restores the odd powers (and the sign). */
    return _mm256_mul_ps(x, acc);
}
49
50#endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */
static __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
Definition volk_avx2_fma_intrinsics.h:26