Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32fc_magnitude_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
57
58#ifndef INCLUDED_volk_32fc_magnitude_32f_u_H
59#define INCLUDED_volk_32fc_magnitude_32f_u_H
60
61#include <inttypes.h>
62#include <math.h>
63#include <stdio.h>
64
65#ifdef LV_HAVE_AVX
66#include <immintrin.h>
68
69static inline void volk_32fc_magnitude_32f_u_avx(float* magnitudeVector,
70 const lv_32fc_t* complexVector,
71 unsigned int num_points)
72{
73 unsigned int number = 0;
74 const unsigned int eighthPoints = num_points / 8;
75
76 const float* complexVectorPtr = (float*)complexVector;
77 float* magnitudeVectorPtr = magnitudeVector;
78
79 __m256 cplxValue1, cplxValue2, result;
80
81 for (; number < eighthPoints; number++) {
82 cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
83 cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
84 result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
85 _mm256_storeu_ps(magnitudeVectorPtr, result);
86
87 complexVectorPtr += 16;
88 magnitudeVectorPtr += 8;
89 }
90
91 number = eighthPoints * 8;
92 for (; number < num_points; number++) {
93 float val1Real = *complexVectorPtr++;
94 float val1Imag = *complexVectorPtr++;
95 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
96 }
97}
98#endif /* LV_HAVE_AVX */
99
100#ifdef LV_HAVE_SSE3
101#include <pmmintrin.h>
103
104static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector,
105 const lv_32fc_t* complexVector,
106 unsigned int num_points)
107{
108 unsigned int number = 0;
109 const unsigned int quarterPoints = num_points / 4;
110
111 const float* complexVectorPtr = (float*)complexVector;
112 float* magnitudeVectorPtr = magnitudeVector;
113
114 __m128 cplxValue1, cplxValue2, result;
115 for (; number < quarterPoints; number++) {
116 cplxValue1 = _mm_loadu_ps(complexVectorPtr);
117 complexVectorPtr += 4;
118
119 cplxValue2 = _mm_loadu_ps(complexVectorPtr);
120 complexVectorPtr += 4;
121
122 result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
123
124 _mm_storeu_ps(magnitudeVectorPtr, result);
125 magnitudeVectorPtr += 4;
126 }
127
128 number = quarterPoints * 4;
129 for (; number < num_points; number++) {
130 float val1Real = *complexVectorPtr++;
131 float val1Imag = *complexVectorPtr++;
132 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
133 }
134}
135#endif /* LV_HAVE_SSE3 */
136
137
138#ifdef LV_HAVE_SSE
140#include <xmmintrin.h>
141
142static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector,
143 const lv_32fc_t* complexVector,
144 unsigned int num_points)
145{
146 unsigned int number = 0;
147 const unsigned int quarterPoints = num_points / 4;
148
149 const float* complexVectorPtr = (float*)complexVector;
150 float* magnitudeVectorPtr = magnitudeVector;
151
152 __m128 cplxValue1, cplxValue2, result;
153
154 for (; number < quarterPoints; number++) {
155 cplxValue1 = _mm_loadu_ps(complexVectorPtr);
156 complexVectorPtr += 4;
157
158 cplxValue2 = _mm_loadu_ps(complexVectorPtr);
159 complexVectorPtr += 4;
160
161 result = _mm_magnitude_ps(cplxValue1, cplxValue2);
162 _mm_storeu_ps(magnitudeVectorPtr, result);
163 magnitudeVectorPtr += 4;
164 }
165
166 number = quarterPoints * 4;
167 for (; number < num_points; number++) {
168 float val1Real = *complexVectorPtr++;
169 float val1Imag = *complexVectorPtr++;
170 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
171 }
172}
173#endif /* LV_HAVE_SSE */
174
175
176#ifdef LV_HAVE_GENERIC
177
178static inline void volk_32fc_magnitude_32f_generic(float* magnitudeVector,
179 const lv_32fc_t* complexVector,
180 unsigned int num_points)
181{
182 const float* complexVectorPtr = (float*)complexVector;
183 float* magnitudeVectorPtr = magnitudeVector;
184 unsigned int number = 0;
185 for (number = 0; number < num_points; number++) {
186 const float real = *complexVectorPtr++;
187 const float imag = *complexVectorPtr++;
188 *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
189 }
190}
191#endif /* LV_HAVE_GENERIC */
192
193
194#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
195#ifndef INCLUDED_volk_32fc_magnitude_32f_a_H
196#define INCLUDED_volk_32fc_magnitude_32f_a_H
197
198#include <inttypes.h>
199#include <math.h>
200#include <stdio.h>
201
202#ifdef LV_HAVE_AVX
203#include <immintrin.h>
205
206static inline void volk_32fc_magnitude_32f_a_avx(float* magnitudeVector,
207 const lv_32fc_t* complexVector,
208 unsigned int num_points)
209{
210 unsigned int number = 0;
211 const unsigned int eighthPoints = num_points / 8;
212
213 const float* complexVectorPtr = (float*)complexVector;
214 float* magnitudeVectorPtr = magnitudeVector;
215
216 __m256 cplxValue1, cplxValue2, result;
217 for (; number < eighthPoints; number++) {
218 cplxValue1 = _mm256_load_ps(complexVectorPtr);
219 complexVectorPtr += 8;
220
221 cplxValue2 = _mm256_load_ps(complexVectorPtr);
222 complexVectorPtr += 8;
223
224 result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
225 _mm256_store_ps(magnitudeVectorPtr, result);
226 magnitudeVectorPtr += 8;
227 }
228
229 number = eighthPoints * 8;
230 for (; number < num_points; number++) {
231 float val1Real = *complexVectorPtr++;
232 float val1Imag = *complexVectorPtr++;
233 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
234 }
235}
236#endif /* LV_HAVE_AVX */
237
238#ifdef LV_HAVE_SSE3
239#include <pmmintrin.h>
241
242static inline void volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector,
243 const lv_32fc_t* complexVector,
244 unsigned int num_points)
245{
246 unsigned int number = 0;
247 const unsigned int quarterPoints = num_points / 4;
248
249 const float* complexVectorPtr = (float*)complexVector;
250 float* magnitudeVectorPtr = magnitudeVector;
251
252 __m128 cplxValue1, cplxValue2, result;
253 for (; number < quarterPoints; number++) {
254 cplxValue1 = _mm_load_ps(complexVectorPtr);
255 complexVectorPtr += 4;
256
257 cplxValue2 = _mm_load_ps(complexVectorPtr);
258 complexVectorPtr += 4;
259
260 result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
261 _mm_store_ps(magnitudeVectorPtr, result);
262 magnitudeVectorPtr += 4;
263 }
264
265 number = quarterPoints * 4;
266 for (; number < num_points; number++) {
267 float val1Real = *complexVectorPtr++;
268 float val1Imag = *complexVectorPtr++;
269 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
270 }
271}
272#endif /* LV_HAVE_SSE3 */
273
274#ifdef LV_HAVE_SSE
276#include <xmmintrin.h>
277
278static inline void volk_32fc_magnitude_32f_a_sse(float* magnitudeVector,
279 const lv_32fc_t* complexVector,
280 unsigned int num_points)
281{
282 unsigned int number = 0;
283 const unsigned int quarterPoints = num_points / 4;
284
285 const float* complexVectorPtr = (float*)complexVector;
286 float* magnitudeVectorPtr = magnitudeVector;
287
288 __m128 cplxValue1, cplxValue2, result;
289 for (; number < quarterPoints; number++) {
290 cplxValue1 = _mm_load_ps(complexVectorPtr);
291 complexVectorPtr += 4;
292
293 cplxValue2 = _mm_load_ps(complexVectorPtr);
294 complexVectorPtr += 4;
295
296 result = _mm_magnitude_ps(cplxValue1, cplxValue2);
297 _mm_store_ps(magnitudeVectorPtr, result);
298 magnitudeVectorPtr += 4;
299 }
300
301 number = quarterPoints * 4;
302 for (; number < num_points; number++) {
303 float val1Real = *complexVectorPtr++;
304 float val1Imag = *complexVectorPtr++;
305 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
306 }
307}
308#endif /* LV_HAVE_SSE */
309
310
311#ifdef LV_HAVE_NEON
312#include <arm_neon.h>
313
314static inline void volk_32fc_magnitude_32f_neon(float* magnitudeVector,
315 const lv_32fc_t* complexVector,
316 unsigned int num_points)
317{
318 unsigned int number;
319 unsigned int quarter_points = num_points / 4;
320 const float* complexVectorPtr = (float*)complexVector;
321 float* magnitudeVectorPtr = magnitudeVector;
322
323 float32x4x2_t complex_vec;
324 float32x4_t magnitude_vec;
325 for (number = 0; number < quarter_points; number++) {
326 complex_vec = vld2q_f32(complexVectorPtr);
327 complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]);
328 magnitude_vec =
329 vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]);
330 magnitude_vec = vrsqrteq_f32(magnitude_vec);
331 magnitude_vec = vrecpeq_f32(magnitude_vec); // no plain ol' sqrt
332 vst1q_f32(magnitudeVectorPtr, magnitude_vec);
333
334 complexVectorPtr += 8;
335 magnitudeVectorPtr += 4;
336 }
337
338 for (number = quarter_points * 4; number < num_points; number++) {
339 const float real = *complexVectorPtr++;
340 const float imag = *complexVectorPtr++;
341 *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
342 }
343}
344#endif /* LV_HAVE_NEON */
345
346
347#ifdef LV_HAVE_NEON
365 float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
366{
367 unsigned int number;
368 unsigned int quarter_points = num_points / 4;
369 const float* complexVectorPtr = (float*)complexVector;
370 float* magnitudeVectorPtr = magnitudeVector;
371
372 const float threshold = 0.4142135;
373
374 float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low;
375 a_high = vdupq_n_f32(0.84);
376 b_high = vdupq_n_f32(0.561);
377 a_low = vdupq_n_f32(0.99);
378 b_low = vdupq_n_f32(0.197);
379
380 uint32x4_t comp0, comp1;
381
382 float32x4x2_t complex_vec;
383 float32x4_t min_vec, max_vec, magnitude_vec;
384 float32x4_t real_abs, imag_abs;
385 for (number = 0; number < quarter_points; number++) {
386 complex_vec = vld2q_f32(complexVectorPtr);
387
388 real_abs = vabsq_f32(complex_vec.val[0]);
389 imag_abs = vabsq_f32(complex_vec.val[1]);
390
391 min_vec = vminq_f32(real_abs, imag_abs);
392 max_vec = vmaxq_f32(real_abs, imag_abs);
393
394 // effective branch to choose coefficient pair.
395 comp0 = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
396 comp1 = vcleq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
397
398 // and 0s or 1s with coefficients from previous effective branch
399 a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)a_high),
400 vandq_s32((int32x4_t)comp1, (int32x4_t)a_low));
401 b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)b_high),
402 vandq_s32((int32x4_t)comp1, (int32x4_t)b_low));
403
404 // coefficients chosen, do the weighted sum
405 min_vec = vmulq_f32(min_vec, b_vec);
406 max_vec = vmulq_f32(max_vec, a_vec);
407
408 magnitude_vec = vaddq_f32(min_vec, max_vec);
409 vst1q_f32(magnitudeVectorPtr, magnitude_vec);
410
411 complexVectorPtr += 8;
412 magnitudeVectorPtr += 4;
413 }
414
415 for (number = quarter_points * 4; number < num_points; number++) {
416 const float real = *complexVectorPtr++;
417 const float imag = *complexVectorPtr++;
418 *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
419 }
420}
421#endif /* LV_HAVE_NEON */
422
423#ifdef LV_HAVE_RVV
424#include <riscv_vector.h>
425
426static inline void volk_32fc_magnitude_32f_rvv(float* magnitudeVector,
427 const lv_32fc_t* complexVector,
428 unsigned int num_points)
429{
430 size_t n = num_points;
431 for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
432 vl = __riscv_vsetvl_e32m4(n);
433 vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl);
434 vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
435 vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
436 vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
437 __riscv_vse32(magnitudeVector, __riscv_vfsqrt(v, vl), vl);
438 }
439}
440#endif /*LV_HAVE_RVV*/
441
442#ifdef LV_HAVE_RVVSEG
443#include <riscv_vector.h>
444
445static inline void volk_32fc_magnitude_32f_rvvseg(float* magnitudeVector,
446 const lv_32fc_t* complexVector,
447 unsigned int num_points)
448{
449 size_t n = num_points;
450 for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
451 vl = __riscv_vsetvl_e32m4(n);
452 vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)complexVector, vl);
453 vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0);
454 vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1);
455 vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
456 __riscv_vse32(magnitudeVector, __riscv_vfsqrt(v, vl), vl);
457 }
458}
459#endif /*LV_HAVE_RVVSEG*/
460
461#endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */
static void volk_32fc_magnitude_32f_u_sse(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition volk_32fc_magnitude_32f.h:142
static void volk_32fc_magnitude_32f_u_sse3(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition volk_32fc_magnitude_32f.h:104
static void volk_32fc_magnitude_32f_u_avx(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition volk_32fc_magnitude_32f.h:69
static void volk_32fc_magnitude_32f_neon_fancy_sweet(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Calculates the magnitude of the complexVector and stores the results in the magnitudeVector.
Definition volk_32fc_magnitude_32f.h:364
static void volk_32fc_magnitude_32f_generic(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition volk_32fc_magnitude_32f.h:178
static void volk_32fc_magnitude_32f_a_avx(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition volk_32fc_magnitude_32f.h:206
static void volk_32fc_magnitude_32f_neon(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition volk_32fc_magnitude_32f.h:314
static void volk_32fc_magnitude_32f_a_sse3(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition volk_32fc_magnitude_32f.h:242
static void volk_32fc_magnitude_32f_a_sse(float *magnitudeVector, const lv_32fc_t *complexVector, unsigned int num_points)
Definition volk_32fc_magnitude_32f.h:278
static __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
Definition volk_avx_intrinsics.h:108
float complex lv_32fc_t
Definition volk_complex.h:74
static __m128 _mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
Definition volk_sse3_intrinsics.h:45
static __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
Definition volk_sse_intrinsics.h:69