Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
volk_32fc_s32fc_x2_rotator2_32fc.h
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
67
68#ifndef INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H
69#define INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H
70
71
72#include <math.h>
73#include <stdio.h>
74#include <stdlib.h>
75#include <volk/volk_complex.h>
76#define ROTATOR_RELOAD 512
77#define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2)
78#define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4)
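// ROTATOR_RELOAD is the number of samples processed between renormalizations of the
// phase accumulator; each kernel below rescales the phase back to unit magnitude at
// that cadence so floating-point rounding error cannot make its magnitude drift.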
79
80
81#ifdef LV_HAVE_GENERIC
82
83static inline void volk_32fc_s32fc_x2_rotator2_32fc_generic(lv_32fc_t* outVector,
84 const lv_32fc_t* inVector,
85 const lv_32fc_t* phase_inc,
86 lv_32fc_t* phase,
87 unsigned int num_points)
88{
89 unsigned int i = 0;
90 int j = 0;
91 for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
92 for (j = 0; j < ROTATOR_RELOAD; ++j) {
93 *outVector++ = *inVector++ * (*phase);
94 (*phase) *= *phase_inc;
95 }
96
97 (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
98 }
99 for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
100 *outVector++ = *inVector++ * (*phase);
101 (*phase) *= *phase_inc;
102 }
103 if (i) {
104 // Make sure we normalize the phase on every call!
105 (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
106 }
107}
108
109#endif /* LV_HAVE_GENERIC */
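/*
 * Usage sketch (illustrative, not taken from this header's documentation): the rotator
 * multiplies every input sample by a unit phasor that advances by `phase_inc` per
 * sample, so a constant frequency shift of `f_shift` Hz at sample rate `fs` uses
 * phase_inc = exp(j*2*pi*f_shift/fs). The names shift_block, f_shift and fs are
 * placeholders. The phase accumulator should start at 1+0j and be carried between
 * calls so the phase stays continuous.
 *
 *   void shift_block(lv_32fc_t* out, const lv_32fc_t* in, unsigned int n,
 *                    float f_shift, float fs, lv_32fc_t* phase)
 *   {
 *       const float w = 2.0f * 3.14159265358979f * f_shift / fs;
 *       const lv_32fc_t phase_inc = lv_cmake(cosf(w), sinf(w));
 *       volk_32fc_s32fc_x2_rotator2_32fc_generic(out, in, &phase_inc, phase, n);
 *   }
 *
 *   // before the first call: lv_32fc_t phase = lv_cmake(1.f, 0.f);
 */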
110
111
112#ifdef LV_HAVE_NEON
113#include <arm_neon.h>
114#include <volk/volk_neon_intrinsics.h>
115
116static inline void volk_32fc_s32fc_x2_rotator2_32fc_neon(lv_32fc_t* outVector,
117 const lv_32fc_t* inVector,
118 const lv_32fc_t* phase_inc,
119 lv_32fc_t* phase,
120 unsigned int num_points)
121
122{
123 lv_32fc_t* outputVectorPtr = outVector;
124 const lv_32fc_t* inputVectorPtr = inVector;
125 lv_32fc_t incr = 1;
126 lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) };
127 float32x4x2_t input_vec;
128 float32x4x2_t output_vec;
129
130 unsigned int i = 0, j = 0;
131 // const unsigned int quarter_points = num_points / 4;
132
133 for (i = 0; i < 4; ++i) {
134 phasePtr[i] *= incr;
135 incr *= (*phase_inc);
136 }
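    // phasePtr now holds { phase, phase*inc, phase*inc^2, phase*inc^3 }, one starting
    // phase per complex lane, and incr == inc^4, the phase step for a whole 4-sample vector.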
137
138 // Notice that incr has been incremented in the previous loop
139 const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
140 const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
141 float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr);
142
143 for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
144 for (j = 0; j < ROTATOR_RELOAD_4; j++) {
145 input_vec = vld2q_f32((float*)inputVectorPtr);
146 // Prefetch next one, speeds things up
147 __VOLK_PREFETCH(inputVectorPtr + 4);
148 // Rotate
149 output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
150 // Increase phase
151 phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
152 // Store output
153 vst2q_f32((float*)outputVectorPtr, output_vec);
154
155 outputVectorPtr += 4;
156 inputVectorPtr += 4;
157 }
158 // normalize phase so magnitude doesn't grow because of
159 // floating point rounding error
160 const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
161 const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
162 // Multiply complex with real
163 phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
164 phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
165 }
166
167 for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; i++) {
168 input_vec = vld2q_f32((float*)inputVectorPtr);
169 // Prefetch next one, speeds things up
170 __VOLK_PREFETCH(inputVectorPtr + 4);
171 // Rotate
172 output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
173 // Increase phase
174 phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
175 // Store output
176 vst2q_f32((float*)outputVectorPtr, output_vec);
177
178 outputVectorPtr += 4;
179 inputVectorPtr += 4;
180 }
181 // i != 0 means the loop above ran at least once
182 if (i) {
183 // normalize phase so magnitude doesn't grow because of
184 // floating point rounding error
185 const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
186 const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
187 // Multiply complex with real
188 phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
189 phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
190 }
191 // Store current phase
192 vst2q_f32((float*)phasePtr, phase_vec);
193
194 // Deal with the rest
195 for (i = 0; i < num_points % 4; i++) {
196 *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
197 phasePtr[0] *= (*phase_inc);
198 }
199
200 // Save the phase so it is continuous the next time this function is called
201 (*phase) = phasePtr[0];
202}
203
204#endif /* LV_HAVE_NEON */
205
206
207#ifdef LV_HAVE_SSE4_1
208#include <smmintrin.h>
209
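/* The SSE4.1 and AVX+FMA kernels below use the standard duplicate/shuffle/addsub complex
 * multiply: moveldup/movehdup broadcast the real and imaginary parts of one operand,
 * a shuffle with 0xB1 swaps the real/imaginary halves of the other, and addsub (or
 * fmaddsub) combines the partial products into (ac - bd, ad + bc) for each complex lane.
 * The same pattern applied to phase_Val and inc_Val advances the phase vector. */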
210static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_sse4_1(lv_32fc_t* outVector,
211 const lv_32fc_t* inVector,
212 const lv_32fc_t* phase_inc,
213 lv_32fc_t* phase,
214 unsigned int num_points)
215{
216 lv_32fc_t* cPtr = outVector;
217 const lv_32fc_t* aPtr = inVector;
218 lv_32fc_t incr = 1;
219 lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
220
221 unsigned int i, j = 0;
222
223 for (i = 0; i < 2; ++i) {
224 phase_Ptr[i] *= incr;
225 incr *= (*phase_inc);
226 }
227
228 __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
229
230 phase_Val = _mm_loadu_ps((float*)phase_Ptr);
231 inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
232
233 for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
234 for (j = 0; j < ROTATOR_RELOAD_2; ++j) {
235
236 aVal = _mm_load_ps((float*)aPtr);
237
238 yl = _mm_moveldup_ps(phase_Val);
239 yh = _mm_movehdup_ps(phase_Val);
240 ylp = _mm_moveldup_ps(inc_Val);
241 yhp = _mm_movehdup_ps(inc_Val);
242
243 tmp1 = _mm_mul_ps(aVal, yl);
244 tmp1p = _mm_mul_ps(phase_Val, ylp);
245
246 aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
247 phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
248 tmp2 = _mm_mul_ps(aVal, yh);
249 tmp2p = _mm_mul_ps(phase_Val, yhp);
250
251 z = _mm_addsub_ps(tmp1, tmp2);
252 phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
253
254 _mm_store_ps((float*)cPtr, z);
255
256 aPtr += 2;
257 cPtr += 2;
258 }
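        // The hadd/shuffle/sqrt/div sequence below rescales each complex lane of
        // phase_Val by 1/|phase|, pulling the phasor back onto the unit circle.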
259 tmp1 = _mm_mul_ps(phase_Val, phase_Val);
260 tmp2 = _mm_hadd_ps(tmp1, tmp1);
261 tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
262 tmp2 = _mm_sqrt_ps(tmp1);
263 phase_Val = _mm_div_ps(phase_Val, tmp2);
264 }
265 for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
266 aVal = _mm_load_ps((float*)aPtr);
267
268 yl = _mm_moveldup_ps(phase_Val);
269 yh = _mm_movehdup_ps(phase_Val);
270 ylp = _mm_moveldup_ps(inc_Val);
271 yhp = _mm_movehdup_ps(inc_Val);
272
273 tmp1 = _mm_mul_ps(aVal, yl);
274
275 tmp1p = _mm_mul_ps(phase_Val, ylp);
276
277 aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
278 phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
279 tmp2 = _mm_mul_ps(aVal, yh);
280 tmp2p = _mm_mul_ps(phase_Val, yhp);
281
282 z = _mm_addsub_ps(tmp1, tmp2);
283 phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
284
285 _mm_store_ps((float*)cPtr, z);
286
287 aPtr += 2;
288 cPtr += 2;
289 }
290 if (i) {
291 tmp1 = _mm_mul_ps(phase_Val, phase_Val);
292 tmp2 = _mm_hadd_ps(tmp1, tmp1);
293 tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
294 tmp2 = _mm_sqrt_ps(tmp1);
295 phase_Val = _mm_div_ps(phase_Val, tmp2);
296 }
297
298 _mm_storeu_ps((float*)phase_Ptr, phase_Val);
299 if (num_points & 1) {
300 *cPtr++ = *aPtr++ * phase_Ptr[0];
301 phase_Ptr[0] *= (*phase_inc);
302 }
303
304 (*phase) = phase_Ptr[0];
305}
306
307#endif /* LV_HAVE_SSE4_1 for aligned */
308
309
310#ifdef LV_HAVE_SSE4_1
311#include <smmintrin.h>
312
313static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_sse4_1(lv_32fc_t* outVector,
314 const lv_32fc_t* inVector,
315 const lv_32fc_t* phase_inc,
316 lv_32fc_t* phase,
317 unsigned int num_points)
318{
319 lv_32fc_t* cPtr = outVector;
320 const lv_32fc_t* aPtr = inVector;
321 lv_32fc_t incr = 1;
322 lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
323
324 unsigned int i, j = 0;
325
326 for (i = 0; i < 2; ++i) {
327 phase_Ptr[i] *= incr;
328 incr *= (*phase_inc);
329 }
330
331 /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
332 printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
333 printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
334 __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
335
336 phase_Val = _mm_loadu_ps((float*)phase_Ptr);
337 inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
338
339 for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
340 for (j = 0; j < ROTATOR_RELOAD_2; ++j) {
341
342 aVal = _mm_loadu_ps((float*)aPtr);
343
344 yl = _mm_moveldup_ps(phase_Val);
345 yh = _mm_movehdup_ps(phase_Val);
346 ylp = _mm_moveldup_ps(inc_Val);
347 yhp = _mm_movehdup_ps(inc_Val);
348
349 tmp1 = _mm_mul_ps(aVal, yl);
350 tmp1p = _mm_mul_ps(phase_Val, ylp);
351
352 aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
353 phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
354 tmp2 = _mm_mul_ps(aVal, yh);
355 tmp2p = _mm_mul_ps(phase_Val, yhp);
356
357 z = _mm_addsub_ps(tmp1, tmp2);
358 phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
359
360 _mm_storeu_ps((float*)cPtr, z);
361
362 aPtr += 2;
363 cPtr += 2;
364 }
365 tmp1 = _mm_mul_ps(phase_Val, phase_Val);
366 tmp2 = _mm_hadd_ps(tmp1, tmp1);
367 tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
368 tmp2 = _mm_sqrt_ps(tmp1);
369 phase_Val = _mm_div_ps(phase_Val, tmp2);
370 }
371 for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
372 aVal = _mm_loadu_ps((float*)aPtr);
373
374 yl = _mm_moveldup_ps(phase_Val);
375 yh = _mm_movehdup_ps(phase_Val);
376 ylp = _mm_moveldup_ps(inc_Val);
377 yhp = _mm_movehdup_ps(inc_Val);
378
379 tmp1 = _mm_mul_ps(aVal, yl);
380
381 tmp1p = _mm_mul_ps(phase_Val, ylp);
382
383 aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
384 phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
385 tmp2 = _mm_mul_ps(aVal, yh);
386 tmp2p = _mm_mul_ps(phase_Val, yhp);
387
388 z = _mm_addsub_ps(tmp1, tmp2);
389 phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
390
391 _mm_storeu_ps((float*)cPtr, z);
392
393 aPtr += 2;
394 cPtr += 2;
395 }
396 if (i) {
397 tmp1 = _mm_mul_ps(phase_Val, phase_Val);
398 tmp2 = _mm_hadd_ps(tmp1, tmp1);
399 tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
400 tmp2 = _mm_sqrt_ps(tmp1);
401 phase_Val = _mm_div_ps(phase_Val, tmp2);
402 }
403
404 _mm_storeu_ps((float*)phase_Ptr, phase_Val);
405 if (num_points & 1) {
406 *cPtr++ = *aPtr++ * phase_Ptr[0];
407 phase_Ptr[0] *= (*phase_inc);
408 }
409
410 (*phase) = phase_Ptr[0];
411}
412
413#endif /* LV_HAVE_SSE4_1 */
414
415
416#ifdef LV_HAVE_AVX
417#include <immintrin.h>
418#include <volk/volk_avx_intrinsics.h>
419
420static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_avx(lv_32fc_t* outVector,
421 const lv_32fc_t* inVector,
422 const lv_32fc_t* phase_inc,
423 lv_32fc_t* phase,
424 unsigned int num_points)
425{
426 lv_32fc_t* cPtr = outVector;
427 const lv_32fc_t* aPtr = inVector;
428 lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
429 lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
430
431 unsigned int i, j = 0;
432
433 for (i = 0; i < 4; ++i) {
434 phase_Ptr[i] *= incr;
435 incr *= (*phase_inc);
436 }
437
438 __m256 aVal, phase_Val, z;
439
440 phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
441
442 const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
443 lv_creal(incr),
444 lv_cimag(incr),
445 lv_creal(incr),
446 lv_cimag(incr),
447 lv_creal(incr),
448 lv_cimag(incr),
449 lv_creal(incr));
450
451 for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
452 for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
453
454 aVal = _mm256_load_ps((float*)aPtr);
455
456 z = _mm256_complexmul_ps(aVal, phase_Val);
457 phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
458
459 _mm256_store_ps((float*)cPtr, z);
460
461 aPtr += 4;
462 cPtr += 4;
463 }
464 phase_Val = _mm256_normalize_ps(phase_Val);
465 }
466
467 for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
468 aVal = _mm256_load_ps((float*)aPtr);
469
470 z = _mm256_complexmul_ps(aVal, phase_Val);
471 phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
472
473 _mm256_store_ps((float*)cPtr, z);
474
475 aPtr += 4;
476 cPtr += 4;
477 }
478 if (i) {
479 phase_Val = _mm256_normalize_ps(phase_Val);
480 }
481
482 _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
483 (*phase) = phase_Ptr[0];
484 volk_32fc_s32fc_x2_rotator2_32fc_generic(
485 cPtr, aPtr, phase_inc, phase, num_points % 4);
486}
487
488#endif /* LV_HAVE_AVX for aligned */
489
490
491#ifdef LV_HAVE_AVX
492#include <immintrin.h>
493#include <volk/volk_avx_intrinsics.h>
494
495static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_avx(lv_32fc_t* outVector,
496 const lv_32fc_t* inVector,
497 const lv_32fc_t* phase_inc,
498 lv_32fc_t* phase,
499 unsigned int num_points)
500{
501 lv_32fc_t* cPtr = outVector;
502 const lv_32fc_t* aPtr = inVector;
503 lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
504 lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
505
506 unsigned int i, j = 0;
507
508 for (i = 0; i < 4; ++i) {
509 phase_Ptr[i] *= incr;
510 incr *= (*phase_inc);
511 }
512
513 __m256 aVal, phase_Val, z;
514
515 phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
516
517 const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
518 lv_creal(incr),
519 lv_cimag(incr),
520 lv_creal(incr),
521 lv_cimag(incr),
522 lv_creal(incr),
523 lv_cimag(incr),
524 lv_creal(incr));
525
526 for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
527 for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
528
529 aVal = _mm256_loadu_ps((float*)aPtr);
530
531 z = _mm256_complexmul_ps(aVal, phase_Val);
532 phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
533
534 _mm256_storeu_ps((float*)cPtr, z);
535
536 aPtr += 4;
537 cPtr += 4;
538 }
539 phase_Val = _mm256_normalize_ps(phase_Val);
540 }
541
542 for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
543 aVal = _mm256_loadu_ps((float*)aPtr);
544
545 z = _mm256_complexmul_ps(aVal, phase_Val);
546 phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
547
548 _mm256_storeu_ps((float*)cPtr, z);
549
550 aPtr += 4;
551 cPtr += 4;
552 }
553 if (i) {
554 phase_Val = _mm256_normalize_ps(phase_Val);
555 }
556
557 _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
558 (*phase) = phase_Ptr[0];
559 volk_32fc_s32fc_x2_rotator2_32fc_generic(
560 cPtr, aPtr, phase_inc, phase, num_points % 4);
561}
562
563#endif /* LV_HAVE_AVX */
564
565#if LV_HAVE_AVX && LV_HAVE_FMA
566#include <immintrin.h>
567
568static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_avx_fma(lv_32fc_t* outVector,
569 const lv_32fc_t* inVector,
570 const lv_32fc_t* phase_inc,
571 lv_32fc_t* phase,
572 unsigned int num_points)
573{
574 lv_32fc_t* cPtr = outVector;
575 const lv_32fc_t* aPtr = inVector;
576 lv_32fc_t incr = 1;
577 __VOLK_ATTR_ALIGNED(32)
578 lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
579
580 unsigned int i, j = 0;
581
582 for (i = 0; i < 4; ++i) {
583 phase_Ptr[i] *= incr;
584 incr *= (*phase_inc);
585 }
586
587 __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
588
589 phase_Val = _mm256_load_ps((float*)phase_Ptr);
590 inc_Val = _mm256_set_ps(lv_cimag(incr),
591 lv_creal(incr),
592 lv_cimag(incr),
593 lv_creal(incr),
594 lv_cimag(incr),
595 lv_creal(incr),
596 lv_cimag(incr),
597 lv_creal(incr));
598
599 for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
600 for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
601
602 aVal = _mm256_load_ps((float*)aPtr);
603
604 yl = _mm256_moveldup_ps(phase_Val);
605 yh = _mm256_movehdup_ps(phase_Val);
606 ylp = _mm256_moveldup_ps(inc_Val);
607 yhp = _mm256_movehdup_ps(inc_Val);
608
609 tmp1 = aVal;
610 tmp1p = phase_Val;
611
612 aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
613 phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
614 tmp2 = _mm256_mul_ps(aVal, yh);
615 tmp2p = _mm256_mul_ps(phase_Val, yhp);
616
617 z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
618 phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
619
620 _mm256_store_ps((float*)cPtr, z);
621
622 aPtr += 4;
623 cPtr += 4;
624 }
625 tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
626 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
627 tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
628 tmp2 = _mm256_sqrt_ps(tmp1);
629 phase_Val = _mm256_div_ps(phase_Val, tmp2);
630 }
631 for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
632 aVal = _mm256_load_ps((float*)aPtr);
633
634 yl = _mm256_moveldup_ps(phase_Val);
635 yh = _mm256_movehdup_ps(phase_Val);
636 ylp = _mm256_moveldup_ps(inc_Val);
637 yhp = _mm256_movehdup_ps(inc_Val);
638
639 tmp1 = aVal;
640 tmp1p = phase_Val;
641
642 aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
643 phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
644 tmp2 = _mm256_mul_ps(aVal, yh);
645 tmp2p = _mm256_mul_ps(phase_Val, yhp);
646
647 z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
648 phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
649
650 _mm256_store_ps((float*)cPtr, z);
651
652 aPtr += 4;
653 cPtr += 4;
654 }
655 if (i) {
656 tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
657 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
658 tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
659 tmp2 = _mm256_sqrt_ps(tmp1);
660 phase_Val = _mm256_div_ps(phase_Val, tmp2);
661 }
662
663 _mm256_store_ps((float*)phase_Ptr, phase_Val);
664 for (i = 0; i < num_points % 4; ++i) {
665 *cPtr++ = *aPtr++ * phase_Ptr[0];
666 phase_Ptr[0] *= (*phase_inc);
667 }
668
669 (*phase) = phase_Ptr[0];
670}
671
672#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned*/
673
674#if LV_HAVE_AVX && LV_HAVE_FMA
675#include <immintrin.h>
676
677static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_avx_fma(lv_32fc_t* outVector,
678 const lv_32fc_t* inVector,
679 const lv_32fc_t* phase_inc,
680 lv_32fc_t* phase,
681 unsigned int num_points)
682{
683 lv_32fc_t* cPtr = outVector;
684 const lv_32fc_t* aPtr = inVector;
685 lv_32fc_t incr = 1;
686 lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
687
688 unsigned int i, j = 0;
689
690 for (i = 0; i < 4; ++i) {
691 phase_Ptr[i] *= incr;
692 incr *= (*phase_inc);
693 }
694
695 __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
696
697 phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
698 inc_Val = _mm256_set_ps(lv_cimag(incr),
699 lv_creal(incr),
700 lv_cimag(incr),
701 lv_creal(incr),
702 lv_cimag(incr),
703 lv_creal(incr),
704 lv_cimag(incr),
705 lv_creal(incr));
706
707 for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
708 for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
709
710 aVal = _mm256_loadu_ps((float*)aPtr);
711
712 yl = _mm256_moveldup_ps(phase_Val);
713 yh = _mm256_movehdup_ps(phase_Val);
714 ylp = _mm256_moveldup_ps(inc_Val);
715 yhp = _mm256_movehdup_ps(inc_Val);
716
717 tmp1 = aVal;
718 tmp1p = phase_Val;
719
720 aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
721 phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
722 tmp2 = _mm256_mul_ps(aVal, yh);
723 tmp2p = _mm256_mul_ps(phase_Val, yhp);
724
725 z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
726 phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
727
728 _mm256_storeu_ps((float*)cPtr, z);
729
730 aPtr += 4;
731 cPtr += 4;
732 }
733 tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
734 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
735 tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
736 tmp2 = _mm256_sqrt_ps(tmp1);
737 phase_Val = _mm256_div_ps(phase_Val, tmp2);
738 }
739 for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
740 aVal = _mm256_loadu_ps((float*)aPtr);
741
742 yl = _mm256_moveldup_ps(phase_Val);
743 yh = _mm256_movehdup_ps(phase_Val);
744 ylp = _mm256_moveldup_ps(inc_Val);
745 yhp = _mm256_movehdup_ps(inc_Val);
746
747 tmp1 = aVal;
748 tmp1p = phase_Val;
749
750 aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
751 phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
752 tmp2 = _mm256_mul_ps(aVal, yh);
753 tmp2p = _mm256_mul_ps(phase_Val, yhp);
754
755 z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
756 phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
757
758 _mm256_storeu_ps((float*)cPtr, z);
759
760 aPtr += 4;
761 cPtr += 4;
762 }
763 if (i) {
764 tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
765 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
766 tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
767 tmp2 = _mm256_sqrt_ps(tmp1);
768 phase_Val = _mm256_div_ps(phase_Val, tmp2);
769 }
770
771 _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
772 for (i = 0; i < num_points % 4; ++i) {
773 *cPtr++ = *aPtr++ * phase_Ptr[0];
774 phase_Ptr[0] *= (*phase_inc);
775 }
776
777 (*phase) = phase_Ptr[0];
778}
779
780#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
781
782/* Note on the RVV implementation:
783 * The complex multiply was expanded because we don't care about the corner cases.
784 * Otherwise, without -ffast-math, the compiler would insert function calls,
785 * which invalidate all vector registers and spill them on each loop iteration. */
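/* Concretely, the expanded multiply below implements (a + jb)(c + jd) = (ac - bd) + j(ad + bc)
 * on separate real/imaginary vectors: vfmul/vfnmsac produce the real part and
 * vfmul/vfmacc produce the imaginary part. */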
786
787#ifdef LV_HAVE_RVV
788#include <riscv_vector.h>
789
790static inline void volk_32fc_s32fc_x2_rotator2_32fc_rvv(lv_32fc_t* outVector,
791 const lv_32fc_t* inVector,
792 const lv_32fc_t* phase_inc,
793 lv_32fc_t* phase,
794 unsigned int num_points)
795{
796 size_t vlmax = __riscv_vsetvlmax_e32m2();
797 vlmax = vlmax < ROTATOR_RELOAD ? vlmax : ROTATOR_RELOAD;
798
799 lv_32fc_t inc = 1.0f;
800 vfloat32m2_t phr = __riscv_vfmv_v_f_f32m2(0, vlmax), phi = phr;
801 for (size_t i = 0; i < vlmax; ++i) {
802 lv_32fc_t ph =
803 lv_cmake(lv_creal(*phase) * lv_creal(inc) - lv_cimag(*phase) * lv_cimag(inc),
804 lv_creal(*phase) * lv_cimag(inc) + lv_cimag(*phase) * lv_creal(inc));
805 phr = __riscv_vfslide1down(phr, lv_creal(ph), vlmax);
806 phi = __riscv_vfslide1down(phi, lv_cimag(ph), vlmax);
807 inc = lv_cmake(
808 lv_creal(*phase_inc) * lv_creal(inc) - lv_cimag(*phase_inc) * lv_cimag(inc),
809 lv_creal(*phase_inc) * lv_cimag(inc) + lv_cimag(*phase_inc) * lv_creal(inc));
810 }
811 vfloat32m2_t incr = __riscv_vfmv_v_f_f32m2(lv_creal(inc), vlmax);
812 vfloat32m2_t inci = __riscv_vfmv_v_f_f32m2(lv_cimag(inc), vlmax);
813
814 size_t vl = 0;
815 if (num_points > 0)
816 while (1) {
817 size_t n = num_points < ROTATOR_RELOAD ? num_points : ROTATOR_RELOAD;
818 num_points -= n;
819
820 for (; n > 0; n -= vl, inVector += vl, outVector += vl) {
821 // vl<vlmax can only happen on the last iteration of the loops
822 vl = __riscv_vsetvl_e32m2(n < vlmax ? n : vlmax);
823
824 vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)inVector, vl);
825 vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl));
826 vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl));
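            // The 64-bit load plus vnsrl by 0 and 32 deinterleaves each complex sample
            // into separate real/imaginary vectors; vwaddu/vwmaccu below re-pack the two
            // results into 64-bit words (imaginary part in the upper half) for the store.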
827
828 vfloat32m2_t vr =
829 __riscv_vfnmsac(__riscv_vfmul(var, phr, vl), vai, phi, vl);
830 vfloat32m2_t vi =
831 __riscv_vfmacc(__riscv_vfmul(var, phi, vl), vai, phr, vl);
832
833 vuint32m2_t vru = __riscv_vreinterpret_u32m2(vr);
834 vuint32m2_t viu = __riscv_vreinterpret_u32m2(vi);
835 vuint64m4_t res =
836 __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
837 __riscv_vse64((uint64_t*)outVector, res, vl);
838
839 vfloat32m2_t tmp = phr;
840 phr = __riscv_vfnmsac(__riscv_vfmul(tmp, incr, vl), phi, inci, vl);
841 phi = __riscv_vfmacc(__riscv_vfmul(tmp, inci, vl), phi, incr, vl);
842 }
843
844 if (num_points <= 0)
845 break;
846
847 // normalize
848 vfloat32m2_t scale =
849 __riscv_vfmacc(__riscv_vfmul(phr, phr, vl), phi, phi, vl);
850 scale = __riscv_vfsqrt(scale, vl);
851 phr = __riscv_vfdiv(phr, scale, vl);
852 phi = __riscv_vfdiv(phi, scale, vl);
853 }
854
855 lv_32fc_t ph = lv_cmake(__riscv_vfmv_f(phr), __riscv_vfmv_f(phi));
856 for (size_t i = 0; i < vlmax - vl; ++i) {
857 ph /= *phase_inc; // we're going backwards
858 }
859 *phase = ph * 1.0f / hypotf(lv_creal(ph), lv_cimag(ph));
860}
861#endif /*LV_HAVE_RVV*/
862
863#ifdef LV_HAVE_RVVSEG
864#include <riscv_vector.h>
865
866static inline void volk_32fc_s32fc_x2_rotator2_32fc_rvvseg(lv_32fc_t* outVector,
867 const lv_32fc_t* inVector,
868 const lv_32fc_t* phase_inc,
869 lv_32fc_t* phase,
870 unsigned int num_points)
871{
872 size_t vlmax = __riscv_vsetvlmax_e32m2();
873 vlmax = vlmax < ROTATOR_RELOAD ? vlmax : ROTATOR_RELOAD;
874
875 lv_32fc_t inc = 1.0f;
876 vfloat32m2_t phr = __riscv_vfmv_v_f_f32m2(0, vlmax), phi = phr;
877 for (size_t i = 0; i < vlmax; ++i) {
878 lv_32fc_t ph =
879 lv_cmake(lv_creal(*phase) * lv_creal(inc) - lv_cimag(*phase) * lv_cimag(inc),
880 lv_creal(*phase) * lv_cimag(inc) + lv_cimag(*phase) * lv_creal(inc));
881 phr = __riscv_vfslide1down(phr, lv_creal(ph), vlmax);
882 phi = __riscv_vfslide1down(phi, lv_cimag(ph), vlmax);
883 inc = lv_cmake(
884 lv_creal(*phase_inc) * lv_creal(inc) - lv_cimag(*phase_inc) * lv_cimag(inc),
885 lv_creal(*phase_inc) * lv_cimag(inc) + lv_cimag(*phase_inc) * lv_creal(inc));
886 }
887 vfloat32m2_t incr = __riscv_vfmv_v_f_f32m2(lv_creal(inc), vlmax);
888 vfloat32m2_t inci = __riscv_vfmv_v_f_f32m2(lv_cimag(inc), vlmax);
889
890 size_t vl = 0;
891 if (num_points > 0)
892 while (1) {
893 size_t n = num_points < ROTATOR_RELOAD ? num_points : ROTATOR_RELOAD;
894 num_points -= n;
895
896 for (; n > 0; n -= vl, inVector += vl, outVector += vl) {
897 // vl<vlmax can only happen on the last iteration of the loops
898 vl = __riscv_vsetvl_e32m2(n < vlmax ? n : vlmax);
899
900 vfloat32m2x2_t va =
901 __riscv_vlseg2e32_v_f32m2x2((const float*)inVector, vl);
902 vfloat32m2_t var = __riscv_vget_f32m2(va, 0);
903 vfloat32m2_t vai = __riscv_vget_f32m2(va, 1);
904
905 vfloat32m2_t vr =
906 __riscv_vfnmsac(__riscv_vfmul(var, phr, vl), vai, phi, vl);
907 vfloat32m2_t vi =
908 __riscv_vfmacc(__riscv_vfmul(var, phi, vl), vai, phr, vl);
909 vfloat32m2x2_t vc = __riscv_vcreate_v_f32m2x2(vr, vi);
910 __riscv_vsseg2e32_v_f32m2x2((float*)outVector, vc, vl);
911
912 vfloat32m2_t tmp = phr;
913 phr = __riscv_vfnmsac(__riscv_vfmul(tmp, incr, vl), phi, inci, vl);
914 phi = __riscv_vfmacc(__riscv_vfmul(tmp, inci, vl), phi, incr, vl);
915 }
916
917 if (num_points <= 0)
918 break;
919
920 // normalize
921 vfloat32m2_t scale =
922 __riscv_vfmacc(__riscv_vfmul(phr, phr, vl), phi, phi, vl);
923 scale = __riscv_vfsqrt(scale, vl);
924 phr = __riscv_vfdiv(phr, scale, vl);
925 phi = __riscv_vfdiv(phi, scale, vl);
926 }
927
928 lv_32fc_t ph = lv_cmake(__riscv_vfmv_f(phr), __riscv_vfmv_f(phi));
929 for (size_t i = 0; i < vlmax - vl; ++i) {
930 ph /= *phase_inc; // we're going backwards
931 }
932 *phase = ph * 1.0f / hypotf(lv_creal(ph), lv_cimag(ph));
933}
934#endif /*LV_HAVE_RVVSEG*/
935
936#endif /* INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H */