Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_16ic_s32f_magnitude_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
41
42#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
43#define INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
44
45#include <inttypes.h>
46#include <math.h>
47#include <stdio.h>
48#include <volk/volk_common.h>
49
50#ifdef LV_HAVE_AVX2
51#include <immintrin.h>
52
53static inline void volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector,
54 const lv_16sc_t* complexVector,
55 const float scalar,
56 unsigned int num_points)
57{
58 unsigned int number = 0;
59 const unsigned int eighthPoints = num_points / 8;
60
61 const int16_t* complexVectorPtr = (const int16_t*)complexVector;
62 float* magnitudeVectorPtr = magnitudeVector;
63
64 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
65
66 __m256 cplxValue1, cplxValue2, result;
67 __m256i int1, int2;
68 __m128i short1, short2;
69 __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
70
71 for (; number < eighthPoints; number++) {
72
73 int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
74 complexVectorPtr += 16;
75 short1 = _mm256_extracti128_si256(int1, 0);
76 short2 = _mm256_extracti128_si256(int1, 1);
77
78 int1 = _mm256_cvtepi16_epi32(short1);
79 int2 = _mm256_cvtepi16_epi32(short2);
80 cplxValue1 = _mm256_cvtepi32_ps(int1);
81 cplxValue2 = _mm256_cvtepi32_ps(int2);
82
83 cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
84 cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
85
86 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
87 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
88
89 result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
90 result = _mm256_permutevar8x32_ps(result, idx);
91
92 result = _mm256_sqrt_ps(result); // Square root the values
93
94 _mm256_store_ps(magnitudeVectorPtr, result);
95
96 magnitudeVectorPtr += 8;
97 }
98
99 number = eighthPoints * 8;
100 magnitudeVectorPtr = &magnitudeVector[number];
101 complexVectorPtr = (const int16_t*)&complexVector[number];
102 for (; number < num_points; number++) {
103 float val1Real = (float)(*complexVectorPtr++) / scalar;
104 float val1Imag = (float)(*complexVectorPtr++) / scalar;
105 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
106 }
107}
108#endif /* LV_HAVE_AVX2 */
109
110
111#ifdef LV_HAVE_SSE3
112#include <pmmintrin.h>
113
114static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector,
115 const lv_16sc_t* complexVector,
116 const float scalar,
117 unsigned int num_points)
118{
119 unsigned int number = 0;
120 const unsigned int quarterPoints = num_points / 4;
121
122 const int16_t* complexVectorPtr = (const int16_t*)complexVector;
123 float* magnitudeVectorPtr = magnitudeVector;
124
125 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
126
127 __m128 cplxValue1, cplxValue2, result;
128
129 __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
130
131 for (; number < quarterPoints; number++) {
132
133 inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
134 inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
135 inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
136 inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
137
138 inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
139 inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
140 inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
141 inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
142
143 cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
144 cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
145
146 complexVectorPtr += 8;
147
148 cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
149 cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
150
151 cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
152 cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
153
154 result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
155
156 result = _mm_sqrt_ps(result); // Square root the values
157
158 _mm_store_ps(magnitudeVectorPtr, result);
159
160 magnitudeVectorPtr += 4;
161 }
162
163 number = quarterPoints * 4;
164 magnitudeVectorPtr = &magnitudeVector[number];
165 complexVectorPtr = (const int16_t*)&complexVector[number];
166 for (; number < num_points; number++) {
167 float val1Real = (float)(*complexVectorPtr++) / scalar;
168 float val1Imag = (float)(*complexVectorPtr++) / scalar;
169 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
170 }
171}
172#endif /* LV_HAVE_SSE3 */
173
174#ifdef LV_HAVE_SSE
175#include <xmmintrin.h>
176
177static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector,
178 const lv_16sc_t* complexVector,
179 const float scalar,
180 unsigned int num_points)
181{
182 unsigned int number = 0;
183 const unsigned int quarterPoints = num_points / 4;
184
185 const int16_t* complexVectorPtr = (const int16_t*)complexVector;
186 float* magnitudeVectorPtr = magnitudeVector;
187
188 const float iScalar = 1.0 / scalar;
189 __m128 invScalar = _mm_set_ps1(iScalar);
190
191 __m128 cplxValue1, cplxValue2, result, re, im;
192
193 __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
194
195 for (; number < quarterPoints; number++) {
196 inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
197 inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
198 inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
199 inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
200
201 inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
202 inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
203 inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
204 inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
205
206 cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
207 cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
208
209 re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
210 im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
211
212 complexVectorPtr += 8;
213
214 cplxValue1 = _mm_mul_ps(re, invScalar);
215 cplxValue2 = _mm_mul_ps(im, invScalar);
216
217 cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
218 cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
219
220 result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
221
222 result = _mm_sqrt_ps(result); // Square root the values
223
224 _mm_store_ps(magnitudeVectorPtr, result);
225
226 magnitudeVectorPtr += 4;
227 }
228
229 number = quarterPoints * 4;
230 magnitudeVectorPtr = &magnitudeVector[number];
231 complexVectorPtr = (const int16_t*)&complexVector[number];
232 for (; number < num_points; number++) {
233 float val1Real = (float)(*complexVectorPtr++) * iScalar;
234 float val1Imag = (float)(*complexVectorPtr++) * iScalar;
235 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
236 }
237}
238
239
240#endif /* LV_HAVE_SSE */
241
242#ifdef LV_HAVE_GENERIC
243
244static inline void volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector,
245 const lv_16sc_t* complexVector,
246 const float scalar,
247 unsigned int num_points)
248{
249 const int16_t* complexVectorPtr = (const int16_t*)complexVector;
250 float* magnitudeVectorPtr = magnitudeVector;
251 unsigned int number = 0;
252 const float invScalar = 1.0 / scalar;
253 for (number = 0; number < num_points; number++) {
254 float real = ((float)(*complexVectorPtr++)) * invScalar;
255 float imag = ((float)(*complexVectorPtr++)) * invScalar;
256 *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
257 }
258}
259#endif /* LV_HAVE_GENERIC */
260
261
262#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a_H */
263
264#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_u_H
265#define INCLUDED_volk_16ic_s32f_magnitude_32f_u_H
266
267#include <inttypes.h>
268#include <math.h>
269#include <stdio.h>
270#include <volk/volk_common.h>
271
272#ifdef LV_HAVE_AVX2
273#include <immintrin.h>
274
275static inline void volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector,
276 const lv_16sc_t* complexVector,
277 const float scalar,
278 unsigned int num_points)
279{
280 unsigned int number = 0;
281 const unsigned int eighthPoints = num_points / 8;
282
283 const int16_t* complexVectorPtr = (const int16_t*)complexVector;
284 float* magnitudeVectorPtr = magnitudeVector;
285
286 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
287
288 __m256 cplxValue1, cplxValue2, result;
289 __m256i int1, int2;
290 __m128i short1, short2;
291 __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
292
293 for (; number < eighthPoints; number++) {
294
295 int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
296 complexVectorPtr += 16;
297 short1 = _mm256_extracti128_si256(int1, 0);
298 short2 = _mm256_extracti128_si256(int1, 1);
299
300 int1 = _mm256_cvtepi16_epi32(short1);
301 int2 = _mm256_cvtepi16_epi32(short2);
302 cplxValue1 = _mm256_cvtepi32_ps(int1);
303 cplxValue2 = _mm256_cvtepi32_ps(int2);
304
305 cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
306 cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
307
308 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
309 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
310
311 result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
312 result = _mm256_permutevar8x32_ps(result, idx);
313
314 result = _mm256_sqrt_ps(result); // Square root the values
315
316 _mm256_storeu_ps(magnitudeVectorPtr, result);
317
318 magnitudeVectorPtr += 8;
319 }
320
321 number = eighthPoints * 8;
322 magnitudeVectorPtr = &magnitudeVector[number];
323 complexVectorPtr = (const int16_t*)&complexVector[number];
324 for (; number < num_points; number++) {
325 float val1Real = (float)(*complexVectorPtr++) / scalar;
326 float val1Imag = (float)(*complexVectorPtr++) / scalar;
327 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
328 }
329}
330#endif /* LV_HAVE_AVX2 */
331
332#ifdef LV_HAVE_RVV
333#include <riscv_vector.h>
334
335static inline void volk_16ic_s32f_magnitude_32f_rvv(float* magnitudeVector,
336 const lv_16sc_t* complexVector,
337 const float scalar,
338 unsigned int num_points)
339{
340 size_t n = num_points;
341 for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
342 vl = __riscv_vsetvl_e16m4(n);
343 vint32m8_t vc = __riscv_vle32_v_i32m8((const int32_t*)complexVector, vl);
344 vint16m4_t vr = __riscv_vnsra(vc, 0, vl);
345 vint16m4_t vi = __riscv_vnsra(vc, 16, vl);
346 vfloat32m8_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0f / scalar, vl);
347 vfloat32m8_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0f / scalar, vl);
348 vfloat32m8_t vf = __riscv_vfmacc(__riscv_vfmul(vif, vif, vl), vrf, vrf, vl);
349 __riscv_vse32(magnitudeVector, __riscv_vfsqrt(vf, vl), vl);
350 }
351}
352#endif /*LV_HAVE_RVV*/
353
354#ifdef LV_HAVE_RVVSEG
355#include <riscv_vector.h>
356
357static inline void volk_16ic_s32f_magnitude_32f_rvvseg(float* magnitudeVector,
358 const lv_16sc_t* complexVector,
359 const float scalar,
360 unsigned int num_points)
361{
362 size_t n = num_points;
363 for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
364 vl = __riscv_vsetvl_e16m4(n);
365 vint16m4x2_t vc = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)complexVector, vl);
366 vint16m4_t vr = __riscv_vget_i16m4(vc, 0);
367 vint16m4_t vi = __riscv_vget_i16m4(vc, 1);
368 vfloat32m8_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0f / scalar, vl);
369 vfloat32m8_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0f / scalar, vl);
370 vfloat32m8_t vf = __riscv_vfmacc(__riscv_vfmul(vif, vif, vl), vrf, vrf, vl);
371 __riscv_vse32(magnitudeVector, __riscv_vfsqrt(vf, vl), vl);
372 }
373}
374#endif /*LV_HAVE_RVVSEG*/
375
376#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_u_H */
static void volk_16ic_s32f_magnitude_32f_generic(float *magnitudeVector, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition volk_16ic_s32f_magnitude_32f.h:244
static void volk_16ic_s32f_magnitude_32f_a_sse(float *magnitudeVector, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition volk_16ic_s32f_magnitude_32f.h:177
static void volk_16ic_s32f_magnitude_32f_a_sse3(float *magnitudeVector, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition volk_16ic_s32f_magnitude_32f.h:114
#define __VOLK_ATTR_ALIGNED(x)
Definition volk_common.h:62
short complex lv_16sc_t
Definition volk_complex.h:71