Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32fc_32f_dot_prod_32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
49
50#ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
51#define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
52
53#include <stdio.h>
54#include <volk/volk_common.h>
55
#ifdef LV_HAVE_GENERIC

/*!
 * Scalar reference implementation.
 * Computes *result = sum_{k=0}^{num_points-1} input[k] * taps[k],
 * where input is complex and taps are real.
 *
 * \param result     Output: accumulated complex dot product.
 * \param input      Complex input vector (num_points elements).
 * \param taps       Real-valued taps (num_points elements).
 * \param num_points Number of (complex, tap) pairs to process.
 */
static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const float* taps,
                                                       unsigned int num_points)
{

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    // View the complex array as interleaved re/im float pairs.
    const float* aPtr = (float*)input;
    const float* bPtr = taps;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        // (re + j*im) * tap == (re*tap) + j*(im*tap)
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
        aPtr += 2;
        bPtr += 1;
    }

    *result = returnValue;
}

#endif /*LV_HAVE_GENERIC*/
79
#if LV_HAVE_AVX2 && LV_HAVE_FMA

#include <immintrin.h>

/*!
 * AVX2+FMA implementation, aligned loads.
 * Requires input and taps to be 32-byte aligned. Processes 16 complex
 * points per loop iteration, then finishes the remainder scalar-wise.
 */
static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
                                                          const lv_32fc_t* input,
                                                          const float* taps,
                                                          unsigned int num_points)
{

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const float* aPtr = (float*)input; // interleaved re/im floats
    const float* bPtr = taps;

    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;
    __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;

    // Four independent accumulators hide FMA latency.
    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        a0Val = _mm256_load_ps(aPtr);
        a1Val = _mm256_load_ps(aPtr + 8);
        a2Val = _mm256_load_ps(aPtr + 16);
        a3Val = _mm256_load_ps(aPtr + 24);

        x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
        x1Val = _mm256_load_ps(bPtr + 8);
        // Duplicate each tap so it multiplies both the re and im lanes.
        x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
        x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
        x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
        x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);

        // TODO: it may be possible to rearrange swizzling to better pipeline data
        b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
        b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
        b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
        b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);

        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);

        aPtr += 32;
        bPtr += 16;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    // Horizontal reduction: each float pair is one partial complex sum.
    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    // Scalar tail for the remaining num_points % 16 elements.
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
        aPtr += 2;
        bPtr += 1;
    }

    *result = returnValue;
}

#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
160
#ifdef LV_HAVE_AVX

#include <immintrin.h>

/*!
 * AVX implementation, aligned loads.
 * Requires input and taps to be 32-byte aligned. Processes 16 complex
 * points per loop iteration, then finishes the remainder scalar-wise.
 */
static inline void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const float* taps,
                                                     unsigned int num_points)
{

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const float* aPtr = (float*)input; // interleaved re/im floats
    const float* bPtr = taps;

    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;
    __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
    __m256 c0Val, c1Val, c2Val, c3Val;

    // Four independent accumulators hide add latency.
    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        a0Val = _mm256_load_ps(aPtr);
        a1Val = _mm256_load_ps(aPtr + 8);
        a2Val = _mm256_load_ps(aPtr + 16);
        a3Val = _mm256_load_ps(aPtr + 24);

        x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
        x1Val = _mm256_load_ps(bPtr + 8);
        // Duplicate each tap so it multiplies both the re and im lanes.
        x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
        x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
        x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
        x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);

        // TODO: it may be possible to rearrange swizzling to better pipeline data
        b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
        b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
        b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
        b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);

        c0Val = _mm256_mul_ps(a0Val, b0Val);
        c1Val = _mm256_mul_ps(a1Val, b1Val);
        c2Val = _mm256_mul_ps(a2Val, b2Val);
        c3Val = _mm256_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);

        aPtr += 32;
        bPtr += 16;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    // Horizontal reduction: each float pair is one partial complex sum.
    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    // Scalar tail for the remaining num_points % 16 elements.
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
        aPtr += 2;
        bPtr += 1;
    }

    *result = returnValue;
}

#endif /*LV_HAVE_AVX*/
247
248
#ifdef LV_HAVE_SSE

#include <xmmintrin.h>

/*!
 * SSE implementation, aligned loads.
 * Requires input and taps to be 16-byte aligned. Processes 8 complex
 * points per loop iteration, then finishes the remainder scalar-wise.
 */
static inline void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const float* taps,
                                                     unsigned int num_points)
{

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const float* aPtr = (float*)input; // interleaved re/im floats
    const float* bPtr = taps;

    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 x0Val, x1Val, x2Val, x3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    // Four independent accumulators hide add latency.
    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < eighthPoints; number++) {

        a0Val = _mm_load_ps(aPtr);
        a1Val = _mm_load_ps(aPtr + 4);
        a2Val = _mm_load_ps(aPtr + 8);
        a3Val = _mm_load_ps(aPtr + 12);

        // Unpacking a register with itself duplicates each tap so it
        // multiplies both the re and im lanes of a complex point.
        x0Val = _mm_load_ps(bPtr);
        x1Val = _mm_load_ps(bPtr);
        x2Val = _mm_load_ps(bPtr + 4);
        x3Val = _mm_load_ps(bPtr + 4);
        b0Val = _mm_unpacklo_ps(x0Val, x1Val); // t0|t0|t1|t1
        b1Val = _mm_unpackhi_ps(x0Val, x1Val); // t2|t2|t3|t3
        b2Val = _mm_unpacklo_ps(x2Val, x3Val);
        b3Val = _mm_unpackhi_ps(x2Val, x3Val);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 8;
    }

    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];

    _mm_store_ps(dotProductVector,
                 dotProdVal0); // Store the results back into the dot product vector

    // Horizontal reduction: each float pair is one partial complex sum.
    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);

    // Scalar tail for the remaining num_points % 8 elements.
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
        aPtr += 2;
        bPtr += 1;
    }

    *result = returnValue;
}

#endif /*LV_HAVE_SSE*/
328
#if LV_HAVE_AVX2 && LV_HAVE_FMA

#include <immintrin.h>

/*!
 * AVX2+FMA implementation, unaligned loads.
 * Identical algorithm to the aligned variant but tolerates arbitrary
 * pointer alignment. Processes 16 complex points per iteration.
 */
static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
                                                          const lv_32fc_t* input,
                                                          const float* taps,
                                                          unsigned int num_points)
{

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const float* aPtr = (float*)input; // interleaved re/im floats
    const float* bPtr = taps;

    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;
    __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;

    // Four independent accumulators hide FMA latency.
    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        a0Val = _mm256_loadu_ps(aPtr);
        a1Val = _mm256_loadu_ps(aPtr + 8);
        a2Val = _mm256_loadu_ps(aPtr + 16);
        a3Val = _mm256_loadu_ps(aPtr + 24);

        x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
        x1Val = _mm256_loadu_ps(bPtr + 8);
        // Duplicate each tap so it multiplies both the re and im lanes.
        x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
        x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
        x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
        x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);

        // TODO: it may be possible to rearrange swizzling to better pipeline data
        b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
        b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
        b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
        b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);

        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);

        aPtr += 32;
        bPtr += 16;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    // Horizontal reduction: each float pair is one partial complex sum.
    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    // Scalar tail for the remaining num_points % 16 elements.
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
        aPtr += 2;
        bPtr += 1;
    }

    *result = returnValue;
}

#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
409
#ifdef LV_HAVE_AVX

#include <immintrin.h>

/*!
 * AVX implementation, unaligned loads.
 * Identical algorithm to the aligned variant but tolerates arbitrary
 * pointer alignment. Processes 16 complex points per iteration.
 */
static inline void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const float* taps,
                                                     unsigned int num_points)
{

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const float* aPtr = (float*)input; // interleaved re/im floats
    const float* bPtr = taps;

    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;
    __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
    __m256 c0Val, c1Val, c2Val, c3Val;

    // Four independent accumulators hide add latency.
    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        a0Val = _mm256_loadu_ps(aPtr);
        a1Val = _mm256_loadu_ps(aPtr + 8);
        a2Val = _mm256_loadu_ps(aPtr + 16);
        a3Val = _mm256_loadu_ps(aPtr + 24);

        x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
        x1Val = _mm256_loadu_ps(bPtr + 8);
        // Duplicate each tap so it multiplies both the re and im lanes.
        x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
        x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
        x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
        x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);

        // TODO: it may be possible to rearrange swizzling to better pipeline data
        b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
        b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
        b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
        b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);

        c0Val = _mm256_mul_ps(a0Val, b0Val);
        c1Val = _mm256_mul_ps(a1Val, b1Val);
        c2Val = _mm256_mul_ps(a2Val, b2Val);
        c3Val = _mm256_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);

        aPtr += 32;
        bPtr += 16;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    // Horizontal reduction: each float pair is one partial complex sum.
    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    // Scalar tail for the remaining num_points % 16 elements.
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
        aPtr += 2;
        bPtr += 1;
    }

    *result = returnValue;
}
#endif /*LV_HAVE_AVX*/
495
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * NEON implementation, unrolled by two quadwords.
 * Processes 8 complex points per loop iteration using de-interleaving
 * loads (vld2q), then finishes the remainder scalar-wise.
 * NOTE(review): despite the name, quarterPoints here is num_points / 8.
 */
static inline void
volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t* __restrict result,
                                        const lv_32fc_t* __restrict input,
                                        const float* __restrict taps,
                                        unsigned int num_points)
{

    unsigned int number;
    const unsigned int quarterPoints = num_points / 8;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const float* inputPtr = (float*)input; // interleaved re/im floats
    const float* tapsPtr = taps;
    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
    float accVector_real[4];
    float accVector_imag[4];

    float32x4x2_t inputVector0, inputVector1;
    float32x4_t tapsVector0, tapsVector1;
    float32x4_t tmp_real0, tmp_imag0;
    float32x4_t tmp_real1, tmp_imag1;
    float32x4_t real_accumulator0, imag_accumulator0;
    float32x4_t real_accumulator1, imag_accumulator1;

    // zero out accumulators
    // take a *float, return float32x4_t
    real_accumulator0 = vld1q_f32(zero);
    imag_accumulator0 = vld1q_f32(zero);
    real_accumulator1 = vld1q_f32(zero);
    imag_accumulator1 = vld1q_f32(zero);

    for (number = 0; number < quarterPoints; number++) {
        // load doublewords and duplicate in to second lane
        tapsVector0 = vld1q_f32(tapsPtr);
        tapsVector1 = vld1q_f32(tapsPtr + 4);

        // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2nd imag
        inputVector0 = vld2q_f32(inputPtr);
        inputVector1 = vld2q_f32(inputPtr + 8);
        // inputVector is now a struct of two vectors, 0th is real, 1st is imag

        tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
        tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);

        tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
        tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);

        real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
        imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);

        real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
        imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);

        tapsPtr += 8;
        inputPtr += 16;
    }

    // Fold the two accumulator pairs together.
    real_accumulator0 = vaddq_f32(real_accumulator0, real_accumulator1);
    imag_accumulator0 = vaddq_f32(imag_accumulator0, imag_accumulator1);
    // void vst1q_f32( float32_t * ptr, float32x4_t val);
    // store results back to a complex (array of 2 floats)
    vst1q_f32(accVector_real, real_accumulator0);
    vst1q_f32(accVector_imag, imag_accumulator0);
    returnValue += lv_cmake(
        accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3],
        accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]);

    // clean up the remainder
    for (number = quarterPoints * 8; number < num_points; number++) {
        returnValue += lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
        inputPtr += 2;
        tapsPtr += 1;
    }

    *result = returnValue;
}

#endif /*LV_HAVE_NEON*/
577
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * NEON implementation, one quadword per iteration.
 * Processes 4 complex points per loop iteration using a de-interleaving
 * load (vld2q), then finishes the remainder scalar-wise.
 */
static inline void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t* __restrict result,
                                                      const lv_32fc_t* __restrict input,
                                                      const float* __restrict taps,
                                                      unsigned int num_points)
{

    unsigned int number;
    const unsigned int quarterPoints = num_points / 4;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const float* inputPtr = (float*)input; // interleaved re/im floats
    const float* tapsPtr = taps;
    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
    float accVector_real[4];
    float accVector_imag[4];

    float32x4x2_t inputVector;
    float32x4_t tapsVector;
    float32x4_t tmp_real, tmp_imag;
    float32x4_t real_accumulator, imag_accumulator;


    // zero out accumulators
    // take a *float, return float32x4_t
    real_accumulator = vld1q_f32(zero);
    imag_accumulator = vld1q_f32(zero);

    for (number = 0; number < quarterPoints; number++) {
        // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) )
        // load doublewords and duplicate in to second lane
        tapsVector = vld1q_f32(tapsPtr);

        // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2nd imag
        inputVector = vld2q_f32(inputPtr);

        tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
        tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);

        real_accumulator = vaddq_f32(real_accumulator, tmp_real);
        imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);


        tapsPtr += 4;
        inputPtr += 8;
    }

    // store results back to a complex (array of 2 floats)
    vst1q_f32(accVector_real, real_accumulator);
    vst1q_f32(accVector_imag, imag_accumulator);
    returnValue += lv_cmake(
        accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3],
        accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]);

    // clean up the remainder
    for (number = quarterPoints * 4; number < num_points; number++) {
        returnValue += lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
        inputPtr += 2;
        tapsPtr += 1;
    }

    *result = returnValue;
}

#endif /*LV_HAVE_NEON*/
645
/* Hand-written ARMv7 NEON assembly variants; definitions live in separate
 * .s files within the project, so only prototypes are declared here. */
#ifdef LV_HAVE_NEONV7
extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(lv_32fc_t* result,
                                                  const lv_32fc_t* input,
                                                  const float* taps,
                                                  unsigned int num_points);
#endif /*LV_HAVE_NEONV7*/

#ifdef LV_HAVE_NEONV7
extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(lv_32fc_t* result,
                                                      const lv_32fc_t* input,
                                                      const float* taps,
                                                      unsigned int num_points);
#endif /*LV_HAVE_NEONV7*/

#ifdef LV_HAVE_NEONV7
extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const float* taps,
                                                       unsigned int num_points);
#endif /*LV_HAVE_NEONV7*/
666
#ifdef LV_HAVE_SSE

#include <xmmintrin.h>

/*!
 * SSE implementation, unaligned loads.
 * Identical algorithm to the aligned variant but tolerates arbitrary
 * pointer alignment. Processes 8 complex points per iteration.
 */
static inline void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const float* taps,
                                                     unsigned int num_points)
{

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const float* aPtr = (float*)input; // interleaved re/im floats
    const float* bPtr = taps;

    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 x0Val, x1Val, x2Val, x3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    // Four independent accumulators hide add latency.
    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < eighthPoints; number++) {

        a0Val = _mm_loadu_ps(aPtr);
        a1Val = _mm_loadu_ps(aPtr + 4);
        a2Val = _mm_loadu_ps(aPtr + 8);
        a3Val = _mm_loadu_ps(aPtr + 12);

        // Unpacking a register with itself duplicates each tap so it
        // multiplies both the re and im lanes of a complex point.
        x0Val = _mm_loadu_ps(bPtr);
        x1Val = _mm_loadu_ps(bPtr);
        x2Val = _mm_loadu_ps(bPtr + 4);
        x3Val = _mm_loadu_ps(bPtr + 4);
        b0Val = _mm_unpacklo_ps(x0Val, x1Val); // t0|t0|t1|t1
        b1Val = _mm_unpackhi_ps(x0Val, x1Val); // t2|t2|t3|t3
        b2Val = _mm_unpacklo_ps(x2Val, x3Val);
        b3Val = _mm_unpackhi_ps(x2Val, x3Val);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 8;
    }

    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];

    _mm_store_ps(dotProductVector,
                 dotProdVal0); // Store the results back into the dot product vector

    // Horizontal reduction: each float pair is one partial complex sum.
    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);

    // Scalar tail for the remaining num_points % 8 elements.
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
        aPtr += 2;
        bPtr += 1;
    }

    *result = returnValue;
}

#endif /*LV_HAVE_SSE*/
745
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h> /* provides RISCV_SHRINK4 */

/*!
 * RISC-V Vector implementation (strided de-interleave via 64-bit loads).
 * Loads each complex as a 64-bit word, then narrows to extract the real
 * (low 32 bits) and imaginary (high 32 bits) parts. Real and imaginary
 * sums are accumulated separately and reduced at the end.
 */
static inline void volk_32fc_32f_dot_prod_32fc_rvv(lv_32fc_t* result,
                                                   const lv_32fc_t* input,
                                                   const float* taps,
                                                   unsigned int num_points)
{
    vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vfloat32m4_t vsumi = vsumr;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vuint64m8_t va = __riscv_vle64_v_u64m8((const uint64_t*)input, vl);
        vfloat32m4_t vbr = __riscv_vle32_v_f32m4(taps, vl), vbi = vbr;
        // Narrowing shifts split each 64-bit complex into re (low) / im (high).
        vfloat32m4_t var = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl));
        vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl));
        // Tail-undisturbed FMACC keeps inactive lanes' partial sums intact.
        vsumr = __riscv_vfmacc_tu(vsumr, var, vbr, vl);
        vsumi = __riscv_vfmacc_tu(vsumi, vai, vbi, vl);
    }
    size_t vl = __riscv_vsetvlmax_e32m1();
    // Shrink m4 accumulators to m1, then reduce to a scalar sum.
    vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
    vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
#endif /*LV_HAVE_RVV*/
775
#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h> /* provides RISCV_SHRINK4 */

/*!
 * RISC-V Vector implementation using segmented loads.
 * vlseg2e32 de-interleaves the complex input directly into separate
 * real and imaginary vector registers. Real and imaginary sums are
 * accumulated separately and reduced at the end.
 */
static inline void volk_32fc_32f_dot_prod_32fc_rvvseg(lv_32fc_t* result,
                                                      const lv_32fc_t* input,
                                                      const float* taps,
                                                      unsigned int num_points)
{
    vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vfloat32m4_t vsumi = vsumr;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // Segment-2 load splits interleaved re/im into two registers.
        vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)input, vl);
        vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
        vfloat32m4_t vbr = __riscv_vle32_v_f32m4(taps, vl), vbi = vbr;
        // Tail-undisturbed FMACC keeps inactive lanes' partial sums intact.
        vsumr = __riscv_vfmacc_tu(vsumr, var, vbr, vl);
        vsumi = __riscv_vfmacc_tu(vsumi, vai, vbi, vl);
    }
    size_t vl = __riscv_vsetvlmax_e32m1();
    // Shrink m4 accumulators to m1, then reduce to a scalar sum.
    vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
    vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
#endif /*LV_HAVE_RVVSEG*/
804
805#endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_H*/
static void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition volk_32fc_32f_dot_prod_32fc.h:414
static void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition volk_32fc_32f_dot_prod_32fc.h:252
static void volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t *__restrict result, const lv_32fc_t *__restrict input, const float *__restrict taps, unsigned int num_points)
Definition volk_32fc_32f_dot_prod_32fc.h:500
static void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t *__restrict result, const lv_32fc_t *__restrict input, const float *__restrict taps, unsigned int num_points)
Definition volk_32fc_32f_dot_prod_32fc.h:581
static void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition volk_32fc_32f_dot_prod_32fc.h:58
static void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition volk_32fc_32f_dot_prod_32fc.h:165
static void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition volk_32fc_32f_dot_prod_32fc.h:669
#define __VOLK_ATTR_ALIGNED(x)
Definition volk_common.h:62
#define lv_cmake(r, i)
Definition volk_complex.h:77
float complex lv_32fc_t
Definition volk_complex.h:74
#define RISCV_SHRINK4(op, T, S, v)
Definition volk_rvv_intrinsics.h:24