Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_dot_prod_32fc.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

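/*!
 * \page volk_32fc_x2_dot_prod_32fc
 *
 * \b Overview
 *
 * This kernel computes the dot product (inner product) of two complex float
 * vectors, accumulating the sum of input[i] * taps[i] over num_points
 * samples. The product is not conjugated; for the conjugated variant see
 * volk_32fc_x2_conjugate_dot_prod_32fc.
 *
 * <b>Dispatcher Prototype</b>
 * \code
 * void volk_32fc_x2_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
 * \endcode
 *
 * \b Inputs
 * \li input: vector of complex floats.
 * \li taps: complex float taps to multiply against the input.
 * \li num_points: number of samples in both input and taps.
 *
 * \b Outputs
 * \li result: pointer to a single complex float receiving the dot product.
 *
 * \b Example
 * A minimal usage sketch, assuming the volk.h dispatcher and the
 * volk_malloc/volk_free allocators:
 * \code
 * unsigned int N = 1024;
 * size_t alignment = volk_get_alignment();
 * lv_32fc_t* input = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
 * lv_32fc_t* taps = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
 * lv_32fc_t result;
 * unsigned int ii;
 *
 * for (ii = 0; ii < N; ++ii) {
 *     input[ii] = lv_cmake((float)ii, (float)-ii);
 *     taps[ii] = lv_cmake(1.f, 0.f);
 * }
 *
 * volk_32fc_x2_dot_prod_32fc(&result, input, taps, N);
 *
 * volk_free(input);
 * volk_free(taps);
 * \endcode
 */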

#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H

#include <stdio.h>
#include <string.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>


#ifdef LV_HAVE_RISCV64
extern void volk_32fc_x2_dot_prod_32fc_sifive_u74(lv_32fc_t* result,
                                                  const lv_32fc_t* input,
                                                  const lv_32fc_t* taps,
                                                  unsigned int num_points);
#endif

#ifdef LV_HAVE_GENERIC


static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result,
                                                      const lv_32fc_t* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{

    float* res = (float*)result;
    float* in = (float*)input;
    float* tp = (float*)taps;
    unsigned int n_2_ccomplex_blocks = num_points / 2;

    float sum0[2] = { 0, 0 };
    float sum1[2] = { 0, 0 };
    unsigned int i = 0;

    for (i = 0; i < n_2_ccomplex_blocks; ++i) {
        sum0[0] += in[0] * tp[0] - in[1] * tp[1];
        sum0[1] += in[0] * tp[1] + in[1] * tp[0];
        sum1[0] += in[2] * tp[2] - in[3] * tp[3];
        sum1[1] += in[2] * tp[3] + in[3] * tp[2];

        in += 4;
        tp += 4;
    }

    res[0] = sum0[0] + sum1[0];
    res[1] = sum0[1] + sum1[1];

    // Cleanup if we had an odd number of points
    if (num_points & 1) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }
}

#endif /*LV_HAVE_GENERIC*/
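
/* Every kernel in this file accumulates the non-conjugated complex product
 * (ar + j*ai) * (br + j*bi) = (ar*br - ai*bi) + j*(ar*bi + ai*br),
 * which is what the interleaved float arithmetic above spells out. A minimal
 * scalar sketch of one accumulation step (hypothetical helper for
 * illustration only, not part of the VOLK API):
 */
static inline void volk_32fc_x2_dot_prod_32fc_example_acc(float acc[2],
                                                          const float a[2],
                                                          const float b[2])
{
    acc[0] += a[0] * b[0] - a[1] * b[1]; // real part: ar*br - ai*bi
    acc[1] += a[0] * b[1] + a[1] * b[0]; // imag part: ar*bi + ai*br
}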


#ifdef LV_HAVE_SSE3

#include <pmmintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{

    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;
    unsigned int isodd = num_points & 1;

    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm_setzero_ps();

    for (; number < halfPoints; number++) {

        x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
        y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

        z = _mm_addsub_ps(tmp1,
                          tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal =
            _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together

        a += 2;
        b += 2;
    }

    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];

    _mm_storeu_ps((float*)dotProductVector,
                  dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1]);

    if (isodd) {
        dotProduct += input[num_points - 1] * taps[num_points - 1];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_SSE3*/
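
/* The SSE3 kernel above performs two complex multiplies per iteration with a
 * duplicate/shuffle/addsub choreography: moveldup and movehdup broadcast the
 * real and imaginary parts of the taps, the 0xB1 shuffle swaps each re/im
 * pair of the input, and addsub applies the -,+ sign pattern of complex
 * multiplication. A scalar emulation of one 4-float vector step (hypothetical
 * helper for illustration only, assuming the x = ar,ai,br,bi and
 * y = cr,ci,dr,di lane layout used above):
 */
static inline void volk_32fc_x2_dot_prod_32fc_example_sse3_step(const float x[4],
                                                                const float y[4],
                                                                float z[4])
{
    const float yl[4] = { y[0], y[0], y[2], y[2] }; // _mm_moveldup_ps
    const float yh[4] = { y[1], y[1], y[3], y[3] }; // _mm_movehdup_ps
    const float xs[4] = { x[1], x[0], x[3], x[2] }; // _mm_shuffle_ps 0xB1
    // _mm_addsub_ps: subtract in even lanes, add in odd lanes
    z[0] = x[0] * yl[0] - xs[0] * yh[0]; // ar*cr - ai*ci
    z[1] = x[1] * yl[1] + xs[1] * yh[1]; // ai*cr + ar*ci
    z[2] = x[2] * yl[2] - xs[2] * yh[2]; // br*dr - bi*di
    z[3] = x[3] * yl[3] + xs[3] * yh[3]; // bi*dr + br*di
}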

#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result,
                                                    const lv_32fc_t* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{

    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_addsub_ps(tmp1,
                             tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_storeu_ps((float*)dotProductVector,
                     dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX*/

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result,
                                                        const lv_32fc_t* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points)
{

    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {

        x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = x;

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_fmaddsub_ps(
            tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_storeu_ps((float*)dotProductVector,
                     dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX && LV_HAVE_FMA*/
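
/* The _avx_fma kernels differ from the plain _avx kernels only in the
 * multiply step: _mm256_fmaddsub_ps(x, yl, tmp2) computes x*yl - tmp2 in the
 * even lanes and x*yl + tmp2 in the odd lanes as one fused instruction with a
 * single rounding, replacing the separate _mm256_mul_ps/_mm256_addsub_ps
 * pair. */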

#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/

#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H

#include <stdio.h>
#include <string.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>


#ifdef LV_HAVE_SSE3

#include <pmmintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{

    const unsigned int num_bytes = num_points * 8;
    unsigned int isodd = num_points & 1;

    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int halfPoints = num_bytes >> 4;

    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm_setzero_ps();

    for (; number < halfPoints; number++) {

        x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
        y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

        z = _mm_addsub_ps(tmp1,
                          tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal =
            _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together

        a += 2;
        b += 2;
    }

    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];

    _mm_store_ps((float*)dotProductVector,
                 dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1]);

    if (isodd) {
        dotProduct += input[num_points - 1] * taps[num_points - 1];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_SSE3*/


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result,
                                                   const lv_32fc_t* input,
                                                   const lv_32fc_t* taps,
                                                   unsigned int num_points)
{

    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    float32x4x2_t a_val, b_val, c_val, accumulator;
    float32x4x2_t tmp_real, tmp_imag;
    accumulator.val[0] = vdupq_n_f32(0);
    accumulator.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // multiply the real*real and imag*imag to get real result
        // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
        // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);

        // Multiply cross terms to get the imaginary result
        // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
        // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);

        c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);

        accumulator.val[0] = vaddq_f32(accumulator.val[0], c_val.val[0]);
        accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]);

        a_ptr += 4;
        b_ptr += 4;
    }
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/
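
/* The NEON variants in this file use vld2q_f32 (or vld4q_f32 below) to
 * de-interleave each load into separate real and imaginary registers, a
 * structure-of-arrays layout that needs no in-register shuffles; real and
 * imaginary sums then live in independent accumulators until the final
 * vst2q_f32 reduction. */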

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result,
                                                            const lv_32fc_t* input,
                                                            const lv_32fc_t* taps,
                                                            unsigned int num_points)
{

    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    float32x4x2_t a_val, b_val, accumulator;
    float32x4x2_t tmp_imag;
    accumulator.val[0] = vdupq_n_f32(0);
    accumulator.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // do the first multiply
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);

        // use multiply accumulate/subtract to get result
        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);

        accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
        accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);

        // increment pointers
        a_ptr += 4;
        b_ptr += 4;
    }
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/

#ifdef LV_HAVE_NEON
static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result,
                                                          const lv_32fc_t* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points)
{

    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    float32x4x2_t a_val, b_val, accumulator1, accumulator2;
    accumulator1.val[0] = vdupq_n_f32(0);
    accumulator1.val[1] = vdupq_n_f32(0);
    accumulator2.val[0] = vdupq_n_f32(0);
    accumulator2.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // use 2 accumulators to remove inter-instruction data dependencies
        accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
        accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);
        // increment pointers
        a_ptr += 4;
        b_ptr += 4;
    }
    accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
    accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator1);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/
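
/* volk_32fc_x2_dot_prod_32fc_neon_optfma above folds each multiply into its
 * accumulator with vmlaq_f32/vmlsq_f32 (multiply-accumulate), eliminating the
 * separate vaddq_f32 per term that the neon and neon_opttests variants issue;
 * the two accumulator sets are combined only once, after the loop. */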

#ifdef LV_HAVE_NEON
static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result,
                                                                const lv_32fc_t* input,
                                                                const lv_32fc_t* taps,
                                                                unsigned int num_points)
{
    // NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very
    // fast

    unsigned int eighth_points = num_points / 8;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // vld4q de-interleaves with stride 4: val[0]/val[1] hold the real/imag
    // parts of the even-indexed samples, val[2]/val[3] those of the
    // odd-indexed samples
    float32x4x4_t a_val, b_val, accumulator1, accumulator2;
    float32x4x2_t reduced_accumulator;
    accumulator1.val[0] = vdupq_n_f32(0);
    accumulator1.val[1] = vdupq_n_f32(0);
    accumulator1.val[2] = vdupq_n_f32(0);
    accumulator1.val[3] = vdupq_n_f32(0);
    accumulator2.val[0] = vdupq_n_f32(0);
    accumulator2.val[1] = vdupq_n_f32(0);
    accumulator2.val[2] = vdupq_n_f32(0);
    accumulator2.val[3] = vdupq_n_f32(0);

    // 8 input regs, 8 accumulators -> 16/16 neon regs are used
    for (number = 0; number < eighth_points; ++number) {
        a_val = vld4q_f32((float*)a_ptr); // a0r|a2r|a4r|a6r || a0i|a2i|a4i|a6i ||
                                          // a1r|a3r|a5r|a7r || a1i|a3i|a5i|a7i
        b_val = vld4q_f32((float*)b_ptr); // same de-interleaved layout for b
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // use 2 accumulators to remove inter-instruction data dependencies
        accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);

        accumulator1.val[2] = vmlaq_f32(accumulator1.val[2], a_val.val[2], b_val.val[2]);
        accumulator1.val[3] = vmlaq_f32(accumulator1.val[3], a_val.val[2], b_val.val[3]);

        accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);

        accumulator2.val[2] = vmlsq_f32(accumulator2.val[2], a_val.val[3], b_val.val[3]);
        accumulator2.val[3] = vmlaq_f32(accumulator2.val[3], a_val.val[3], b_val.val[2]);
        // increment pointers
        a_ptr += 8;
        b_ptr += 8;
    }
    // reduce 8 accumulator lanes down to 2 (1 real and 1 imag)
    accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator1.val[2]);
    accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator1.val[3]);
    accumulator2.val[0] = vaddq_f32(accumulator2.val[0], accumulator2.val[2]);
    accumulator2.val[1] = vaddq_f32(accumulator2.val[1], accumulator2.val[3]);
    reduced_accumulator.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
    reduced_accumulator.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
    // now reduce accumulators to scalars
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, reduced_accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = eighth_points * 8; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/


#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result,
                                                    const lv_32fc_t* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{

    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {

        x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_addsub_ps(tmp1,
                             tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_store_ps((float*)dotProductVector,
                    dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX*/

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result,
                                                        const lv_32fc_t* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points)
{

    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {

        x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = x;

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_fmaddsub_ps(
            tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_store_ps((float*)dotProductVector,
                    dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX && LV_HAVE_FMA*/

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void volk_32fc_x2_dot_prod_32fc_rvv(lv_32fc_t* result,
                                                  const lv_32fc_t* input,
                                                  const lv_32fc_t* taps,
                                                  unsigned int num_points)
{
    vfloat32m2_t vsumr = __riscv_vfmv_v_f_f32m2(0, __riscv_vsetvlmax_e32m2());
    vfloat32m2_t vsumi = vsumr;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)input, vl);
        vuint64m4_t vb = __riscv_vle64_v_u64m4((const uint64_t*)taps, vl);
        vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl));
        vfloat32m2_t vbr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 0, vl));
        vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl));
        vfloat32m2_t vbi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 32, vl));
        vfloat32m2_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
        vfloat32m2_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
        vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl);
        vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl);
    }
    size_t vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t vr = RISCV_SHRINK2(vfadd, f, 32, vsumr);
    vfloat32m1_t vi = RISCV_SHRINK2(vfadd, f, 32, vsumi);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
#endif /*LV_HAVE_RVV*/
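
/* The RVV kernel above loads each complex pair as one 64-bit element and
 * splits it with narrowing shifts: __riscv_vnsrl by 0 keeps the low 32 bits
 * of every pair (the real parts, on a little-endian target) and by 32 keeps
 * the high 32 bits (the imaginary parts). The RVVSEG variant below obtains
 * the same de-interleaved layout directly from a segmented load. */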

#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void volk_32fc_x2_dot_prod_32fc_rvvseg(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{
    vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vfloat32m4_t vsumi = vsumr;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)input, vl);
        vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)taps, vl);
        vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
        vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1);
        vfloat32m4_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
        vfloat32m4_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
        vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl);
        vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl);
    }
    size_t vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
    vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
#endif /*LV_HAVE_RVVSEG*/

#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/