Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels

volk_32f_atan_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#include <math.h>
#include <volk/volk_common.h> /* volk_arctan(), used by the scalar tail loops */

#ifndef INCLUDED_volk_32f_atan_32f_a_H
#define INCLUDED_volk_32f_atan_32f_a_H

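/*
 * Note on the approach shared by the SIMD kernels below: for |x| > 1 the
 * identity atan(x) = sign(x) * pi/2 - atan(1/x) folds the argument into
 * [-1, 1], where the *_arctan_poly_* helpers evaluate a polynomial
 * approximation of atan(). swap_mask marks the lanes that were reduced so
 * the pi/2 correction can be blended back into the result.
 */
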
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h> /* _m256_arctan_poly_avx2_fma() */

static inline void
volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_points)
{
    const __m256 one = _mm256_set1_ps(1.f);
    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 x = _mm256_load_ps(in);
        /* Lanes with |x| > 1 are reduced to 1/x before the polynomial. */
        __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
        __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                      _mm256_blendv_ps(one, x, swap_mask));
        __m256 result = _m256_arctan_poly_avx2_fma(x_star);
        /* Correction for reduced lanes: sign(x) * pi/2 - atan(1/x). */
        __m256 term = _mm256_and_ps(x_star, sign_mask);
        term = _mm256_or_ps(pi_over_2, term);
        term = _mm256_sub_ps(term, result);
        result = _mm256_blendv_ps(result, term, swap_mask);
        _mm256_store_ps(out, result);
        in += 8;
        out += 8;
    }

    number = eighth_points * 8;
    for (; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */

#if LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h> /* _m256_arctan_poly_avx() */

static inline void
volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points)
{
    const __m256 one = _mm256_set1_ps(1.f);
    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 x = _mm256_load_ps(in);
        __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
        __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                      _mm256_blendv_ps(one, x, swap_mask));
        __m256 result = _m256_arctan_poly_avx(x_star);
        __m256 term = _mm256_and_ps(x_star, sign_mask);
        term = _mm256_or_ps(pi_over_2, term);
        term = _mm256_sub_ps(term, result);
        result = _mm256_blendv_ps(result, term, swap_mask);
        _mm256_store_ps(out, result);
        in += 8;
        out += 8;
    }

    number = eighth_points * 8;
    for (; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
#endif /* LV_HAVE_AVX for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
#include <volk/volk_sse_intrinsics.h> /* _mm_arctan_poly_sse() */

static inline void
volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
{
    const __m128 one = _mm_set1_ps(1.f);
    const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f);
    const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));

    unsigned int number = 0;
    unsigned int quarter_points = num_points / 4;
    for (; number < quarter_points; number++) {
        __m128 x = _mm_load_ps(in);
        __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
        __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
                                   _mm_blendv_ps(one, x, swap_mask));
        __m128 result = _mm_arctan_poly_sse(x_star);
        __m128 term = _mm_and_ps(x_star, sign_mask);
        term = _mm_or_ps(pi_over_2, term);
        term = _mm_sub_ps(term, result);
        result = _mm_blendv_ps(result, term, swap_mask);
        _mm_store_ps(out, result);
        in += 4;
        out += 4;
    }

    number = quarter_points * 4;
    for (; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
#endif /* LV_HAVE_SSE4_1 for aligned */
#endif /* INCLUDED_volk_32f_atan_32f_a_H */

#ifndef INCLUDED_volk_32f_atan_32f_u_H
#define INCLUDED_volk_32f_atan_32f_u_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
static inline void
volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_points)
{
    const __m256 one = _mm256_set1_ps(1.f);
    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 x = _mm256_loadu_ps(in);
        __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
        __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                      _mm256_blendv_ps(one, x, swap_mask));
        __m256 result = _m256_arctan_poly_avx2_fma(x_star);
        __m256 term = _mm256_and_ps(x_star, sign_mask);
        term = _mm256_or_ps(pi_over_2, term);
        term = _mm256_sub_ps(term, result);
        result = _mm256_blendv_ps(result, term, swap_mask);
        _mm256_storeu_ps(out, result);
        in += 8;
        out += 8;
    }

    number = eighth_points * 8;
    for (; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#if LV_HAVE_AVX
#include <immintrin.h>
static inline void
volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
{
    const __m256 one = _mm256_set1_ps(1.f);
    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 x = _mm256_loadu_ps(in);
        __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
        __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
                                      _mm256_blendv_ps(one, x, swap_mask));
        __m256 result = _m256_arctan_poly_avx(x_star);
        __m256 term = _mm256_and_ps(x_star, sign_mask);
        term = _mm256_or_ps(pi_over_2, term);
        term = _mm256_sub_ps(term, result);
        result = _mm256_blendv_ps(result, term, swap_mask);
        _mm256_storeu_ps(out, result);
        in += 8;
        out += 8;
    }

    number = eighth_points * 8;
    for (; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
#endif /* LV_HAVE_AVX for unaligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
#include <volk/volk_sse_intrinsics.h> /* _mm_arctan_poly_sse() */

static inline void
volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
{
    const __m128 one = _mm_set1_ps(1.f);
    const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f);
    const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));

    unsigned int number = 0;
    unsigned int quarter_points = num_points / 4;
    for (; number < quarter_points; number++) {
        __m128 x = _mm_loadu_ps(in);
        __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
        __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
                                   _mm_blendv_ps(one, x, swap_mask));
        __m128 result = _mm_arctan_poly_sse(x_star);
        __m128 term = _mm_and_ps(x_star, sign_mask);
        term = _mm_or_ps(pi_over_2, term);
        term = _mm_sub_ps(term, result);
        result = _mm_blendv_ps(result, term, swap_mask);
        _mm_storeu_ps(out, result);
        in += 4;
        out += 4;
    }

    number = quarter_points * 4;
    for (; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
#endif /* LV_HAVE_SSE4_1 for unaligned */

#ifdef LV_HAVE_GENERIC
static inline void
volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
{
    unsigned int number = 0;
    for (; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_GENERIC
static inline void
volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
{
    unsigned int number = 0;
    for (; number < num_points; number++) {
        *out++ = atanf(*in++);
    }
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

static inline void
volk_32f_atan_32f_rvv(float* out, const float* in, unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m2();

    const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
    const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
    /* Coefficients of the odd polynomial approximation of atan() on [-1, 1]. */
    const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax);
    const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax);
    const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax);
    const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax);
    const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax);
    const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax);
    const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, in += vl, out += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        vfloat32m2_t v = __riscv_vle32_v_f32m2(in, vl);
        /* Same reduction as the x86 kernels: where |v| > 1, work on 1/v. */
        vbool16_t mswap = __riscv_vmfgt(__riscv_vfabs(v, vl), cf1, vl);
        vfloat32m2_t x = __riscv_vfdiv_mu(mswap, v, cf1, v, vl);
        vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
        /* Horner evaluation of the degree-13 polynomial in x. */
        vfloat32m2_t p = c13;
        p = __riscv_vfmadd(p, xx, c11, vl);
        p = __riscv_vfmadd(p, xx, c9, vl);
        p = __riscv_vfmadd(p, xx, c7, vl);
        p = __riscv_vfmadd(p, xx, c5, vl);
        p = __riscv_vfmadd(p, xx, c3, vl);
        p = __riscv_vfmadd(p, xx, c1, vl);
        p = __riscv_vfmul(p, x, vl);

        /* Reduced lanes: sign(v) * pi/2 - atan(1/v). */
        vfloat32m2_t t = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl);
        p = __riscv_vmerge(p, t, mswap, vl);

        __riscv_vse32(out, p, vl);
    }
}
#endif /* LV_HAVE_RVV */

#endif /* INCLUDED_volk_32f_atan_32f_u_H */
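
/*
 * Usage sketch (not part of the upstream header): a minimal caller that fills
 * a buffer and invokes the kernels through the generated volk_32f_atan_32f()
 * dispatcher, assuming the usual VOLK aligned-allocation helpers.
 *
 *   #include <stdio.h>
 *   #include <volk/volk.h>
 *
 *   int main(void)
 *   {
 *       unsigned int N = 32;
 *       size_t alignment = volk_get_alignment();
 *       float* in = (float*)volk_malloc(sizeof(float) * N, alignment);
 *       float* out = (float*)volk_malloc(sizeof(float) * N, alignment);
 *
 *       for (unsigned int i = 0; i < N; i++) {
 *           in[i] = -4.f + 8.f * (float)i / (float)(N - 1); // span [-4, 4]
 *       }
 *
 *       volk_32f_atan_32f(out, in, N); // picks the best kernel for this CPU
 *
 *       for (unsigned int i = 0; i < N; i++) {
 *           printf("atan(%+.3f) = %+.6f\n", in[i], out[i]);
 *       }
 *
 *       volk_free(in);
 *       volk_free(out);
 *       return 0;
 *   }
 */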