Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
volk_16i_32fc_dot_prod_32fc.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
#define INCLUDED_volk_16i_32fc_dot_prod_32fc_H

#include <stdio.h>
#include <volk/volk_common.h>

#ifdef LV_HAVE_GENERIC

static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result,
                                                       const short* input,
                                                       const lv_32fc_t* taps,
                                                       unsigned int num_points)
{

    static const int N_UNROLL = 4;

    lv_32fc_t acc0 = 0;
    lv_32fc_t acc1 = 0;
    lv_32fc_t acc2 = 0;
    lv_32fc_t acc3 = 0;

    unsigned i = 0;
    unsigned n = (num_points / N_UNROLL) * N_UNROLL;

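    // Four independent accumulators (comment added for clarity): unrolling by
    // four breaks the floating-point dependency chain between successive
    // multiply-accumulates; the partial sums are combined at the end.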
    for (i = 0; i < n; i += N_UNROLL) {
        acc0 += taps[i + 0] * (float)input[i + 0];
        acc1 += taps[i + 1] * (float)input[i + 1];
        acc2 += taps[i + 2] * (float)input[i + 2];
        acc3 += taps[i + 3] * (float)input[i + 3];
    }

    for (; i < num_points; i++) {
        acc0 += taps[i] * (float)input[i];
    }

    *result = acc0 + acc1 + acc2 + acc3;
}

#endif /*LV_HAVE_GENERIC*/
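
/*
 * Usage sketch (added for illustration; not part of the upstream header).
 * Callers normally go through the generated VOLK dispatcher, which picks the
 * fastest compiled implementation for the host CPU at runtime. Buffer sizes
 * and contents here are hypothetical.
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int num_points = 1024;
 *   short* input = (short*)volk_malloc(num_points * sizeof(short),
 *                                      volk_get_alignment());
 *   lv_32fc_t* taps = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t),
 *                                             volk_get_alignment());
 *   lv_32fc_t result;
 *   // ... fill input with samples and taps with complex coefficients ...
 *   volk_16i_32fc_dot_prod_32fc(&result, input, taps, num_points);
 *   volk_free(taps);
 *   volk_free(input);
 */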

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result,
                                                    const short* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{

    unsigned ii;
    unsigned quarter_points = num_points / 4;
    lv_32fc_t* tapsPtr = (lv_32fc_t*)taps;
    short* inputPtr = (short*)input;
    lv_32fc_t accumulator_vec[4];

    float32x4x2_t tapsVal, accumulator_val;
    int16x4_t input16;
    int32x4_t input32;
    float32x4_t input_float, prod_re, prod_im;

    accumulator_val.val[0] = vdupq_n_f32(0.0);
    accumulator_val.val[1] = vdupq_n_f32(0.0);

    for (ii = 0; ii < quarter_points; ++ii) {
        tapsVal = vld2q_f32((float*)tapsPtr);
        input16 = vld1_s16(inputPtr);
        // widen 16-bit int to 32-bit int
        input32 = vmovl_s16(input16);
        // convert 32-bit int to float (vcvtq_f32_s32 applies no scaling)
        input_float = vcvtq_f32_s32(input32);

        prod_re = vmulq_f32(input_float, tapsVal.val[0]);
        prod_im = vmulq_f32(input_float, tapsVal.val[1]);

        accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
        accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);

        tapsPtr += 4;
        inputPtr += 4;
    }
    vst2q_f32((float*)accumulator_vec, accumulator_val);
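    // (comment added for clarity) vst2q_f32 re-interleaves the separate real
    // and imaginary accumulators, so accumulator_vec now holds four partial
    // complex sums, which are reduced pairwise below.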
    accumulator_vec[0] += accumulator_vec[1];
    accumulator_vec[2] += accumulator_vec[3];
    accumulator_vec[0] += accumulator_vec[2];

    for (ii = quarter_points * 4; ii < num_points; ++ii) {
        accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
    }

    *result = accumulator_vec[0];
}

#endif /*LV_HAVE_NEON*/

#if LV_HAVE_SSE && LV_HAVE_MMX

static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result,
                                                     const short* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m64 m0, m1;
    __m128 f0, f1, f2, f3;
    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < eighthPoints; number++) {

        m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
        m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
        f0 = _mm_cvtpi16_ps(m0);
        f1 = _mm_cvtpi16_ps(m0);
        f2 = _mm_cvtpi16_ps(m1);
        f3 = _mm_cvtpi16_ps(m1);
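
        // (comment added for clarity) f0 == f1 and f2 == f3, so the unpack
        // pair below duplicates each input sample into [x0, x0, x1, x1]
        // order, lining each real sample up with both the real and imaginary
        // parts of its interleaved complex tap.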

        a0Val = _mm_unpacklo_ps(f0, f1);
        a1Val = _mm_unpackhi_ps(f0, f1);
        a2Val = _mm_unpacklo_ps(f2, f3);
        a3Val = _mm_unpackhi_ps(f2, f3);

        b0Val = _mm_loadu_ps(bPtr);
        b1Val = _mm_loadu_ps(bPtr + 4);
        b2Val = _mm_loadu_ps(bPtr + 8);
        b3Val = _mm_loadu_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

        aPtr += 8;
        bPtr += 16;
    }

    _mm_empty(); // clear the mmx technology state

    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];

    _mm_store_ps(dotProductVector,
                 dotProdVal0); // Store the results back into the dot product vector

    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}

#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/

#if LV_HAVE_AVX2 && LV_HAVE_FMA

static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
                                                          const short* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points)
{

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        m0 = _mm_loadu_si128((__m128i const*)aPtr);
        m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));

        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
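
        // (comment added for clarity) 256-bit unpacklo/unpackhi operate
        // within each 128-bit lane, leaving the duplicated samples in
        // lane-swizzled order; _mm256_permute2f128_ps reassembles them into
        // sequential order so they line up with the loaded taps.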

        b0Val = _mm256_loadu_ps(bPtr);
        b1Val = _mm256_loadu_ps(bPtr + 8);
        b2Val = _mm256_loadu_ps(bPtr + 16);
        b3Val = _mm256_loadu_ps(bPtr + 24);

        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 32;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}

#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/

#ifdef LV_HAVE_AVX2

static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;
    __m256 c0Val, c1Val, c2Val, c3Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        m0 = _mm_loadu_si128((__m128i const*)aPtr);
        m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));

        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_loadu_ps(bPtr);
        b1Val = _mm256_loadu_ps(bPtr + 8);
        b2Val = _mm256_loadu_ps(bPtr + 16);
        b3Val = _mm256_loadu_ps(bPtr + 24);

        c0Val = _mm256_mul_ps(a0Val, b0Val);
        c1Val = _mm256_mul_ps(a1Val, b1Val);
        c2Val = _mm256_mul_ps(a2Val, b2Val);
        c3Val = _mm256_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 32;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}

#endif /*LV_HAVE_AVX2*/

#if LV_HAVE_SSE && LV_HAVE_MMX

static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result,
                                                     const short* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m64 m0, m1;
    __m128 f0, f1, f2, f3;
    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < eighthPoints; number++) {

        m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
        m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
        f0 = _mm_cvtpi16_ps(m0);
        f1 = _mm_cvtpi16_ps(m0);
        f2 = _mm_cvtpi16_ps(m1);
        f3 = _mm_cvtpi16_ps(m1);

        a0Val = _mm_unpacklo_ps(f0, f1);
        a1Val = _mm_unpackhi_ps(f0, f1);
        a2Val = _mm_unpacklo_ps(f2, f3);
        a3Val = _mm_unpackhi_ps(f2, f3);

        b0Val = _mm_load_ps(bPtr);
        b1Val = _mm_load_ps(bPtr + 4);
        b2Val = _mm_load_ps(bPtr + 8);
        b3Val = _mm_load_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

        aPtr += 8;
        bPtr += 16;
    }

    _mm_empty(); // clear the mmx technology state

    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];

    _mm_store_ps(dotProductVector,
                 dotProdVal0); // Store the results back into the dot product vector

    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}

#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/

#ifdef LV_HAVE_AVX2

static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;
    __m256 c0Val, c1Val, c2Val, c3Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        m0 = _mm_load_si128((__m128i const*)aPtr);
        m1 = _mm_load_si128((__m128i const*)(aPtr + 8));

        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_load_ps(bPtr);
        b1Val = _mm256_load_ps(bPtr + 8);
        b2Val = _mm256_load_ps(bPtr + 16);
        b3Val = _mm256_load_ps(bPtr + 24);

        c0Val = _mm256_mul_ps(a0Val, b0Val);
        c1Val = _mm256_mul_ps(a1Val, b1Val);
        c2Val = _mm256_mul_ps(a2Val, b2Val);
        c3Val = _mm256_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 32;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}

#endif /*LV_HAVE_AVX2*/

#if LV_HAVE_AVX2 && LV_HAVE_FMA

static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
                                                          const short* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points)
{

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        m0 = _mm_load_si128((__m128i const*)aPtr);
        m1 = _mm_load_si128((__m128i const*)(aPtr + 8));

        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_load_ps(bPtr);
        b1Val = _mm256_load_ps(bPtr + 8);
        b2Val = _mm256_load_ps(bPtr + 16);
        b3Val = _mm256_load_ps(bPtr + 24);

        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 32;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}

#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void volk_16i_32fc_dot_prod_32fc_rvv(lv_32fc_t* result,
                                                   const short* input,
                                                   const lv_32fc_t* taps,
                                                   unsigned int num_points)
{
    vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vfloat32m4_t vsumi = vsumr;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)taps, vl);
        vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
        vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
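        // (comment added for clarity) each complex tap is loaded as one
        // 64-bit integer; narrowing shifts by 0 and 32 bits then split the
        // packed value into separate vectors of real and imaginary floats.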
        vfloat32m4_t v =
            __riscv_vfwcvt_f(__riscv_vle16_v_i16m2((const int16_t*)input, vl), vl);
        vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl);
        vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl);
    }
    size_t vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
    vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
#endif /*LV_HAVE_RVV*/

#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void volk_16i_32fc_dot_prod_32fc_rvvseg(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{
    vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vfloat32m4_t vsumi = vsumr;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)taps, vl);
        vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0);
        vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1);
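        // (comment added for clarity) the segment load deinterleaves the
        // complex taps directly: field 0 holds the real parts and field 1
        // the imaginary parts, replacing the narrowing-shift trick of the
        // plain RVV variant above.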
        vfloat32m4_t v =
            __riscv_vfwcvt_f(__riscv_vle16_v_i16m2((const int16_t*)input, vl), vl);
        vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl);
        vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl);
    }
    size_t vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
    vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
#endif /*LV_HAVE_RVVSEG*/

#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/
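
/* Benchmarking note (added for illustration): the volk_profile utility that
 * ships with VOLK times every compiled implementation of this kernel and
 * records the fastest one; restricting the run with its kernel-name filter
 * (e.g. `volk_profile -R 16i_32fc_dot_prod_32fc`) avoids profiling the whole
 * library. Exact option spelling may vary between versions. */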