Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_sse_intrinsics.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2015 Free Software Foundation, Inc.
4 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
5 *
6 * This file is part of VOLK
7 *
8 * SPDX-License-Identifier: LGPL-3.0-or-later
9 */
10
/*
 * This file is intended to hold SSE helper routines that are themselves
 * built out of several intrinsics ("intrinsics of intrinsics").
 * They should be used in VOLK kernels to avoid copy-pasted code.
 */
15
16#ifndef INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
17#define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
18#include <xmmintrin.h>
19
/*
 * Approximate arctan(x) via polynomial expansion
 * on the interval [-1, 1]
 *
 * Maximum relative error ~6.5e-7
 * Polynomial evaluated via Horner's method
 *
 * The polynomial has only odd powers of x:
 *   arctan(x) ~= x * (a1 + a3*x^2 + a5*x^4 + ... + a13*x^12)
 * so it is evaluated as a degree-6 polynomial in x^2 and multiplied by x
 * at the very end.
 *
 * x:       four packed single-precision inputs
 * returns: four packed single-precision results, one per input lane
 */
static inline __m128 _mm_arctan_poly_sse(const __m128 x)
{
    /* Coefficients of the odd powers x^1 .. x^13, broadcast to all lanes. */
    const __m128 a1 = _mm_set1_ps(+0x1.ffffeap-1f);
    const __m128 a3 = _mm_set1_ps(-0x1.55437p-2f);
    const __m128 a5 = _mm_set1_ps(+0x1.972be6p-3f);
    const __m128 a7 = _mm_set1_ps(-0x1.1436ap-3f);
    const __m128 a9 = _mm_set1_ps(+0x1.5785aap-4f);
    const __m128 a11 = _mm_set1_ps(-0x1.2f3004p-5f);
    const __m128 a13 = _mm_set1_ps(+0x1.01a37cp-7f);

    const __m128 x_times_x = _mm_mul_ps(x, x);

    /* Horner scheme in x^2, starting from the highest-order coefficient. */
    __m128 arctan = a13;
    arctan = _mm_add_ps(_mm_mul_ps(x_times_x, arctan), a11);
    arctan = _mm_add_ps(_mm_mul_ps(x_times_x, arctan), a9);
    arctan = _mm_add_ps(_mm_mul_ps(x_times_x, arctan), a7);
    arctan = _mm_add_ps(_mm_mul_ps(x_times_x, arctan), a5);
    arctan = _mm_add_ps(_mm_mul_ps(x_times_x, arctan), a3);
    arctan = _mm_add_ps(_mm_mul_ps(x_times_x, arctan), a1);

    /* Final multiply by x restores the odd symmetry of arctan. */
    return _mm_mul_ps(x, arctan);
}
56
/*
 * Squared magnitude of four complex values held as interleaved I/Q floats.
 *
 * cplxValue1: lanes {i1, q1, i2, q2}
 * cplxValue2: lanes {i3, q3, i4, q4}
 * returns:    lanes {i1^2+q1^2, i2^2+q2^2, i3^2+q3^2, i4^2+q4^2}
 */
static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
{
    __m128 iValue, qValue;
    // Even lanes of both inputs: arrange in i1i2i3i4 format
    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
    // Odd lanes of both inputs: arrange in q1q2q3q4 format
    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
    iValue = _mm_mul_ps(iValue, iValue); // Square the I values
    qValue = _mm_mul_ps(qValue, qValue); // Square the Q values
    return _mm_add_ps(iValue, qValue);   // Add the I^2 and Q^2 values
}
68
69static inline __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
70{
71 return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
72}
73
74static inline __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0,
75 const __m128 symbols1,
76 const __m128 points0,
77 const __m128 points1,
78 const __m128 scalar)
79{
80 // calculate scalar * |x - y|^2
81 const __m128 diff0 = _mm_sub_ps(symbols0, points0);
82 const __m128 diff1 = _mm_sub_ps(symbols1, points1);
83 const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1);
84 return _mm_mul_ps(norms, scalar);
85}
86
/*
 * Lane-wise update of a running sum of squares:
 *
 *   result = sq_acc + rec * (aux * val - acc)^2
 *
 * sq_acc:  value the new term is added to
 * acc:     subtracted from aux * val before squaring
 * val:     multiplied by aux
 * rec:     factor applied to the squared difference
 * aux:     multiplier for val; passed by value and reused as scratch here,
 *          so the caller's copy is not modified
 * returns: the updated sum described above
 */
static inline __m128 _mm_accumulate_square_sum_ps(
    __m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
{
    aux = _mm_mul_ps(aux, val); /* aux * val */
    aux = _mm_sub_ps(aux, acc); /* aux * val - acc */
    aux = _mm_mul_ps(aux, aux); /* (aux * val - acc)^2 */
    aux = _mm_mul_ps(aux, rec); /* rec * (aux * val - acc)^2 */
    return _mm_add_ps(sq_acc, aux);
}
96
97#endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */
static __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
Definition volk_sse_intrinsics.h:57
static __m128 _mm_arctan_poly_sse(const __m128 x)
Definition volk_sse_intrinsics.h:27
static __m128 _mm_accumulate_square_sum_ps(__m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
Definition volk_sse_intrinsics.h:87
static __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)
Definition volk_sse_intrinsics.h:74
static __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
Definition volk_sse_intrinsics.h:69