52#ifndef INCLUDED_volk_32f_invsqrt_32f_a_H
53#define INCLUDED_volk_32f_invsqrt_32f_a_H
/* Excerpt of the scalar helper (listing lines 63-72). The declarations of `u`
 * and `x2` fall on lines omitted from this listing. */
/* Constant 1.5 consumed by the refinement expression below. */
63 const float threehalfs = 1.5F;
/* Work on the integer member of `u`: halve the bit pattern and subtract it from
 * the constant 0x5f3759df (presumably the initial estimate; confirm against
 * the omitted lines). */
71 u.i = 0x5f3759df - (u.i >> 1);
/* Single refinement of the float member: u.f * (1.5 - x2 * u.f * u.f).
 * NOTE(review): x2 is presumably half the input value; it is not visible here. */
72 u.f = u.f * (threehalfs - (x2 * u.f * u.f));
/* Excerpt of the aligned AVX kernel (listing lines 85-100). The signature, the
 * aVal/cVal declarations and any lines between those shown are omitted. */
85 unsigned int number = 0;
/* Count of complete 8-float groups in the input. */
86 const unsigned int eighthPoints = num_points / 8;
/* Working cursors over the output and input buffers. */
88 float* cPtr = cVector;
89 const float* aPtr = aVector;
/* Vector loop: one 8-float group per iteration. */
91 for (; number < eighthPoints; number++) {
/* Aligned 256-bit load; the intrinsic requires a 32-byte aligned address. */
92 aVal = _mm256_load_ps(aPtr);
/* Per-lane approximate reciprocal square root. */
93 cVal = _mm256_rsqrt_ps(aVal);
/* Aligned 256-bit store of the eight results. */
94 _mm256_store_ps(cPtr, cVal);
/* Tail loop: restart the index at the first element not covered by a full
 * group and walk the remaining points one at a time. */
99 number = eighthPoints * 8;
100 for (; number < num_points; number++) {
108#include <xmmintrin.h>
/* Excerpt of the aligned SSE kernel (listing lines 113-133). The signature, the
 * aVal/cVal declarations and any lines between those shown are omitted. */
113 unsigned int number = 0;
/* Count of complete 4-float groups in the input. */
114 const unsigned int quarterPoints = num_points / 4;
/* Working cursors over the output and input buffers. */
116 float* cPtr = cVector;
117 const float* aPtr = aVector;
/* Vector loop: one 4-float group per iteration. */
120 for (; number < quarterPoints; number++) {
/* Aligned 128-bit load; the intrinsic requires a 16-byte aligned address. */
122 aVal = _mm_load_ps(aPtr);
/* Per-lane approximate reciprocal square root. */
124 cVal = _mm_rsqrt_ps(aVal);
/* Aligned 128-bit store of the four results. */
126 _mm_store_ps(cPtr, cVal);
/* Tail loop: restart the index at the first element not covered by a full
 * group and walk the remaining points one at a time. */
132 number = quarterPoints * 4;
133 for (; number < num_points; number++) {
/* Excerpt of the NEON kernel (listing lines 147-160). The signature, the
 * declaration of `number` and any lines between those shown are omitted. */
/* Count of complete 4-float groups in the input. */
147 const unsigned int quarter_points = num_points / 4;
/* Working cursors over the output and input buffers. */
149 float* cPtr = cVector;
150 const float* aPtr = aVector;
/* Four-lane float registers for the input and the result. */
151 float32x4_t a_val, c_val;
/* Vector loop: one 4-float group per iteration. */
152 for (number = 0; number < quarter_points; ++number) {
/* Load four floats from the input cursor. */
153 a_val = vld1q_f32(aPtr);
/* Per-lane reciprocal square-root estimate. */
154 c_val = vrsqrteq_f32(a_val);
/* Store the four results at the output cursor. */
155 vst1q_f32(cPtr, c_val);
/* Tail loop: handle the points left over after the last full group. */
160 for (number = quarter_points * 4; number < num_points; number++) {
167#ifdef LV_HAVE_GENERIC
/* Excerpt of the generic (non-SIMD) kernel (listing lines 170-176). The start
 * of the signature and the loop body are omitted from this listing. */
/* Remaining parameters: read-only input buffer and the number of points. */
170 const float* aVector,
171 unsigned int num_points)
/* Working cursors over the output and input buffers. */
173 float* cPtr = cVector;
174 const float* aPtr = aVector;
/* Index is zeroed here and again in the for-initializer below. */
175 unsigned int number = 0;
/* Visit every point; what happens per point is not visible in this excerpt. */
176 for (number = 0; number < num_points; number++) {
183#include <immintrin.h>
/* Excerpt of the unaligned AVX kernel (listing lines 188-203). The signature,
 * the aVal/cVal declarations and any lines between those shown are omitted. */
188 unsigned int number = 0;
/* Count of complete 8-float groups in the input. */
189 const unsigned int eighthPoints = num_points / 8;
/* Working cursors over the output and input buffers. */
191 float* cPtr = cVector;
192 const float* aPtr = aVector;
/* Vector loop: one 8-float group per iteration. */
194 for (; number < eighthPoints; number++) {
/* Unaligned 256-bit load; no alignment requirement on the address. */
195 aVal = _mm256_loadu_ps(aPtr);
/* Per-lane approximate reciprocal square root. */
196 cVal = _mm256_rsqrt_ps(aVal);
/* Unaligned 256-bit store of the eight results. */
197 _mm256_storeu_ps(cPtr, cVal);
/* Tail loop: restart the index at the first element not covered by a full
 * group and walk the remaining points one at a time. */
202 number = eighthPoints * 8;
203 for (; number < num_points; number++) {
210#include <riscv_vector.h>
/* RISC-V Vector kernel (listing lines 213-219). The return-type line and the
 * braces around the body are omitted from this listing.
 * Parameters: cVector is the output buffer, aVector the read-only input buffer,
 * num_points the number of floats to process. */
213volk_32f_invsqrt_32f_rvv(
float* cVector,
const float* aVector,
unsigned int num_points)
/* Remaining element count, widened to size_t for the vector-length intrinsics. */
215 size_t n = num_points;
/* Each pass handles `vl` elements, then shrinks the remaining count and
 * advances both buffer pointers by the same amount. */
216 for (
size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) {
/* Ask for a vector length for 32-bit elements at LMUL=8, given n remaining. */
217 vl = __riscv_vsetvl_e32m8(n);
/* Load vl floats from the input. */
218 vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl);
/* Reciprocal square-root estimate of each element, stored to the output. */
219 __riscv_vse32(cVector, __riscv_vfrsqrt7(v, vl), vl);
static void volk_32f_invsqrt_32f_neon(float *cVector, const float *aVector, unsigned int num_points)
Definition volk_32f_invsqrt_32f.h:144
static void volk_32f_invsqrt_32f_a_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition volk_32f_invsqrt_32f.h:83
static void volk_32f_invsqrt_32f_generic(float *cVector, const float *aVector, unsigned int num_points)
Definition volk_32f_invsqrt_32f.h:169
static void volk_32f_invsqrt_32f_a_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition volk_32f_invsqrt_32f.h:111
static void volk_32f_invsqrt_32f_u_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition volk_32f_invsqrt_32f.h:186
static float Q_rsqrt(float number)
Definition volk_32f_invsqrt_32f.h:60
for i
Definition volk_config_fixed.tmpl.h:13