Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_x2_dot_prod_16i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
44
45#ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H
46#define INCLUDED_volk_32f_x2_dot_prod_16i_H
47
48#include <stdio.h>
49#include <volk/volk_common.h>
50
51
52#ifdef LV_HAVE_GENERIC
53
54
55static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result,
56 const float* input,
57 const float* taps,
58 unsigned int num_points)
59{
60
61 float dotProduct = 0;
62 const float* aPtr = input;
63 const float* bPtr = taps;
64 unsigned int number = 0;
65
66 for (number = 0; number < num_points; number++) {
67 dotProduct += ((*aPtr++) * (*bPtr++));
68 }
69
70 *result = (int16_t)rintf(dotProduct);
71}
72
73#endif /*LV_HAVE_GENERIC*/
74
75
76#ifdef LV_HAVE_SSE
77
78static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result,
79 const float* input,
80 const float* taps,
81 unsigned int num_points)
82{
83
84 unsigned int number = 0;
85 const unsigned int sixteenthPoints = num_points / 16;
86
87 float dotProduct = 0;
88 const float* aPtr = input;
89 const float* bPtr = taps;
90
91 __m128 a0Val, a1Val, a2Val, a3Val;
92 __m128 b0Val, b1Val, b2Val, b3Val;
93 __m128 c0Val, c1Val, c2Val, c3Val;
94
95 __m128 dotProdVal0 = _mm_setzero_ps();
96 __m128 dotProdVal1 = _mm_setzero_ps();
97 __m128 dotProdVal2 = _mm_setzero_ps();
98 __m128 dotProdVal3 = _mm_setzero_ps();
99
100 for (; number < sixteenthPoints; number++) {
101
102 a0Val = _mm_load_ps(aPtr);
103 a1Val = _mm_load_ps(aPtr + 4);
104 a2Val = _mm_load_ps(aPtr + 8);
105 a3Val = _mm_load_ps(aPtr + 12);
106 b0Val = _mm_load_ps(bPtr);
107 b1Val = _mm_load_ps(bPtr + 4);
108 b2Val = _mm_load_ps(bPtr + 8);
109 b3Val = _mm_load_ps(bPtr + 12);
110
111 c0Val = _mm_mul_ps(a0Val, b0Val);
112 c1Val = _mm_mul_ps(a1Val, b1Val);
113 c2Val = _mm_mul_ps(a2Val, b2Val);
114 c3Val = _mm_mul_ps(a3Val, b3Val);
115
116 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
117 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
118 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
119 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
120
121 aPtr += 16;
122 bPtr += 16;
123 }
124
125 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
126 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
127 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
128
129 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
130
131 _mm_store_ps(dotProductVector,
132 dotProdVal0); // Store the results back into the dot product vector
133
134 dotProduct = dotProductVector[0];
135 dotProduct += dotProductVector[1];
136 dotProduct += dotProductVector[2];
137 dotProduct += dotProductVector[3];
138
139 number = sixteenthPoints * 16;
140 for (; number < num_points; number++) {
141 dotProduct += ((*aPtr++) * (*bPtr++));
142 }
143
144 *result = (short)rintf(dotProduct);
145}
146
147#endif /*LV_HAVE_SSE*/
148
149
150#if LV_HAVE_AVX2 && LV_HAVE_FMA
151
152static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result,
153 const float* input,
154 const float* taps,
155 unsigned int num_points)
156{
157
158 unsigned int number = 0;
159 const unsigned int thirtysecondPoints = num_points / 32;
160
161 float dotProduct = 0;
162 const float* aPtr = input;
163 const float* bPtr = taps;
164
165 __m256 a0Val, a1Val, a2Val, a3Val;
166 __m256 b0Val, b1Val, b2Val, b3Val;
167
168 __m256 dotProdVal0 = _mm256_setzero_ps();
169 __m256 dotProdVal1 = _mm256_setzero_ps();
170 __m256 dotProdVal2 = _mm256_setzero_ps();
171 __m256 dotProdVal3 = _mm256_setzero_ps();
172
173 for (; number < thirtysecondPoints; number++) {
174
175 a0Val = _mm256_load_ps(aPtr);
176 a1Val = _mm256_load_ps(aPtr + 8);
177 a2Val = _mm256_load_ps(aPtr + 16);
178 a3Val = _mm256_load_ps(aPtr + 24);
179 b0Val = _mm256_load_ps(bPtr);
180 b1Val = _mm256_load_ps(bPtr + 8);
181 b2Val = _mm256_load_ps(bPtr + 16);
182 b3Val = _mm256_load_ps(bPtr + 24);
183
184 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
185 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
186 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
187 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
188
189 aPtr += 32;
190 bPtr += 32;
191 }
192
193 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
194 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
195 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
196
197 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
198
199 _mm256_store_ps(dotProductVector,
200 dotProdVal0); // Store the results back into the dot product vector
201
202 dotProduct = dotProductVector[0];
203 dotProduct += dotProductVector[1];
204 dotProduct += dotProductVector[2];
205 dotProduct += dotProductVector[3];
206 dotProduct += dotProductVector[4];
207 dotProduct += dotProductVector[5];
208 dotProduct += dotProductVector[6];
209 dotProduct += dotProductVector[7];
210
211 number = thirtysecondPoints * 32;
212 for (; number < num_points; number++) {
213 dotProduct += ((*aPtr++) * (*bPtr++));
214 }
215
216 *result = (short)rintf(dotProduct);
217}
218
219#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
220
221
222#ifdef LV_HAVE_AVX
223
224static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result,
225 const float* input,
226 const float* taps,
227 unsigned int num_points)
228{
229
230 unsigned int number = 0;
231 const unsigned int thirtysecondPoints = num_points / 32;
232
233 float dotProduct = 0;
234 const float* aPtr = input;
235 const float* bPtr = taps;
236
237 __m256 a0Val, a1Val, a2Val, a3Val;
238 __m256 b0Val, b1Val, b2Val, b3Val;
239 __m256 c0Val, c1Val, c2Val, c3Val;
240
241 __m256 dotProdVal0 = _mm256_setzero_ps();
242 __m256 dotProdVal1 = _mm256_setzero_ps();
243 __m256 dotProdVal2 = _mm256_setzero_ps();
244 __m256 dotProdVal3 = _mm256_setzero_ps();
245
246 for (; number < thirtysecondPoints; number++) {
247
248 a0Val = _mm256_load_ps(aPtr);
249 a1Val = _mm256_load_ps(aPtr + 8);
250 a2Val = _mm256_load_ps(aPtr + 16);
251 a3Val = _mm256_load_ps(aPtr + 24);
252 b0Val = _mm256_load_ps(bPtr);
253 b1Val = _mm256_load_ps(bPtr + 8);
254 b2Val = _mm256_load_ps(bPtr + 16);
255 b3Val = _mm256_load_ps(bPtr + 24);
256
257 c0Val = _mm256_mul_ps(a0Val, b0Val);
258 c1Val = _mm256_mul_ps(a1Val, b1Val);
259 c2Val = _mm256_mul_ps(a2Val, b2Val);
260 c3Val = _mm256_mul_ps(a3Val, b3Val);
261
262 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
263 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
264 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
265 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
266
267 aPtr += 32;
268 bPtr += 32;
269 }
270
271 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
272 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
273 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
274
275 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
276
277 _mm256_store_ps(dotProductVector,
278 dotProdVal0); // Store the results back into the dot product vector
279
280 dotProduct = dotProductVector[0];
281 dotProduct += dotProductVector[1];
282 dotProduct += dotProductVector[2];
283 dotProduct += dotProductVector[3];
284 dotProduct += dotProductVector[4];
285 dotProduct += dotProductVector[5];
286 dotProduct += dotProductVector[6];
287 dotProduct += dotProductVector[7];
288
289 number = thirtysecondPoints * 32;
290 for (; number < num_points; number++) {
291 dotProduct += ((*aPtr++) * (*bPtr++));
292 }
293
294 *result = (short)rintf(dotProduct);
295}
296
297#endif /*LV_HAVE_AVX*/
298
299#ifdef LV_HAVE_AVX512F
300
301static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result,
302 const float* input,
303 const float* taps,
304 unsigned int num_points)
305{
306
307 unsigned int number = 0;
308 const unsigned int sixtyfourthPoints = num_points / 64;
309
310 float dotProduct = 0;
311 const float* aPtr = input;
312 const float* bPtr = taps;
313
314 __m512 a0Val, a1Val, a2Val, a3Val;
315 __m512 b0Val, b1Val, b2Val, b3Val;
316
317 __m512 dotProdVal0 = _mm512_setzero_ps();
318 __m512 dotProdVal1 = _mm512_setzero_ps();
319 __m512 dotProdVal2 = _mm512_setzero_ps();
320 __m512 dotProdVal3 = _mm512_setzero_ps();
321
322 for (; number < sixtyfourthPoints; number++) {
323
324 a0Val = _mm512_load_ps(aPtr);
325 a1Val = _mm512_load_ps(aPtr + 16);
326 a2Val = _mm512_load_ps(aPtr + 32);
327 a3Val = _mm512_load_ps(aPtr + 48);
328 b0Val = _mm512_load_ps(bPtr);
329 b1Val = _mm512_load_ps(bPtr + 16);
330 b2Val = _mm512_load_ps(bPtr + 32);
331 b3Val = _mm512_load_ps(bPtr + 48);
332
333 dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
334 dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
335 dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
336 dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
337
338 aPtr += 64;
339 bPtr += 64;
340 }
341
342 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
343 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
344 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
345
346 __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
347
348 _mm512_store_ps(dotProductVector,
349 dotProdVal0); // Store the results back into the dot product vector
350
351 dotProduct = dotProductVector[0];
352 dotProduct += dotProductVector[1];
353 dotProduct += dotProductVector[2];
354 dotProduct += dotProductVector[3];
355 dotProduct += dotProductVector[4];
356 dotProduct += dotProductVector[5];
357 dotProduct += dotProductVector[6];
358 dotProduct += dotProductVector[7];
359 dotProduct += dotProductVector[8];
360 dotProduct += dotProductVector[9];
361 dotProduct += dotProductVector[10];
362 dotProduct += dotProductVector[11];
363 dotProduct += dotProductVector[12];
364 dotProduct += dotProductVector[13];
365 dotProduct += dotProductVector[14];
366 dotProduct += dotProductVector[15];
367
368 number = sixtyfourthPoints * 64;
369 for (; number < num_points; number++) {
370 dotProduct += ((*aPtr++) * (*bPtr++));
371 }
372
373 *result = (short)rintf(dotProduct);
374}
375
376#endif /*LV_HAVE_AVX512F*/
377
378
379#ifdef LV_HAVE_SSE
380
381static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result,
382 const float* input,
383 const float* taps,
384 unsigned int num_points)
385{
386
387 unsigned int number = 0;
388 const unsigned int sixteenthPoints = num_points / 16;
389
390 float dotProduct = 0;
391 const float* aPtr = input;
392 const float* bPtr = taps;
393
394 __m128 a0Val, a1Val, a2Val, a3Val;
395 __m128 b0Val, b1Val, b2Val, b3Val;
396 __m128 c0Val, c1Val, c2Val, c3Val;
397
398 __m128 dotProdVal0 = _mm_setzero_ps();
399 __m128 dotProdVal1 = _mm_setzero_ps();
400 __m128 dotProdVal2 = _mm_setzero_ps();
401 __m128 dotProdVal3 = _mm_setzero_ps();
402
403 for (; number < sixteenthPoints; number++) {
404
405 a0Val = _mm_loadu_ps(aPtr);
406 a1Val = _mm_loadu_ps(aPtr + 4);
407 a2Val = _mm_loadu_ps(aPtr + 8);
408 a3Val = _mm_loadu_ps(aPtr + 12);
409 b0Val = _mm_loadu_ps(bPtr);
410 b1Val = _mm_loadu_ps(bPtr + 4);
411 b2Val = _mm_loadu_ps(bPtr + 8);
412 b3Val = _mm_loadu_ps(bPtr + 12);
413
414 c0Val = _mm_mul_ps(a0Val, b0Val);
415 c1Val = _mm_mul_ps(a1Val, b1Val);
416 c2Val = _mm_mul_ps(a2Val, b2Val);
417 c3Val = _mm_mul_ps(a3Val, b3Val);
418
419 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
420 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
421 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
422 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
423
424 aPtr += 16;
425 bPtr += 16;
426 }
427
428 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
429 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
430 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
431
432 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
433
434 _mm_store_ps(dotProductVector,
435 dotProdVal0); // Store the results back into the dot product vector
436
437 dotProduct = dotProductVector[0];
438 dotProduct += dotProductVector[1];
439 dotProduct += dotProductVector[2];
440 dotProduct += dotProductVector[3];
441
442 number = sixteenthPoints * 16;
443 for (; number < num_points; number++) {
444 dotProduct += ((*aPtr++) * (*bPtr++));
445 }
446
447 *result = (short)rintf(dotProduct);
448}
449
450#endif /*LV_HAVE_SSE*/
451
452
453#if LV_HAVE_AVX2 && LV_HAVE_FMA
454
455static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result,
456 const float* input,
457 const float* taps,
458 unsigned int num_points)
459{
460
461 unsigned int number = 0;
462 const unsigned int thirtysecondPoints = num_points / 32;
463
464 float dotProduct = 0;
465 const float* aPtr = input;
466 const float* bPtr = taps;
467
468 __m256 a0Val, a1Val, a2Val, a3Val;
469 __m256 b0Val, b1Val, b2Val, b3Val;
470
471 __m256 dotProdVal0 = _mm256_setzero_ps();
472 __m256 dotProdVal1 = _mm256_setzero_ps();
473 __m256 dotProdVal2 = _mm256_setzero_ps();
474 __m256 dotProdVal3 = _mm256_setzero_ps();
475
476 for (; number < thirtysecondPoints; number++) {
477
478 a0Val = _mm256_loadu_ps(aPtr);
479 a1Val = _mm256_loadu_ps(aPtr + 8);
480 a2Val = _mm256_loadu_ps(aPtr + 16);
481 a3Val = _mm256_loadu_ps(aPtr + 24);
482 b0Val = _mm256_loadu_ps(bPtr);
483 b1Val = _mm256_loadu_ps(bPtr + 8);
484 b2Val = _mm256_loadu_ps(bPtr + 16);
485 b3Val = _mm256_loadu_ps(bPtr + 24);
486
487 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
488 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
489 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
490 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
491
492 aPtr += 32;
493 bPtr += 32;
494 }
495
496 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
497 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
498 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
499
500 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
501
502 _mm256_store_ps(dotProductVector,
503 dotProdVal0); // Store the results back into the dot product vector
504
505 dotProduct = dotProductVector[0];
506 dotProduct += dotProductVector[1];
507 dotProduct += dotProductVector[2];
508 dotProduct += dotProductVector[3];
509 dotProduct += dotProductVector[4];
510 dotProduct += dotProductVector[5];
511 dotProduct += dotProductVector[6];
512 dotProduct += dotProductVector[7];
513
514 number = thirtysecondPoints * 32;
515 for (; number < num_points; number++) {
516 dotProduct += ((*aPtr++) * (*bPtr++));
517 }
518
519 *result = (short)rintf(dotProduct);
520}
521
522#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
523
524
525#ifdef LV_HAVE_AVX
526
527static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result,
528 const float* input,
529 const float* taps,
530 unsigned int num_points)
531{
532
533 unsigned int number = 0;
534 const unsigned int thirtysecondPoints = num_points / 32;
535
536 float dotProduct = 0;
537 const float* aPtr = input;
538 const float* bPtr = taps;
539
540 __m256 a0Val, a1Val, a2Val, a3Val;
541 __m256 b0Val, b1Val, b2Val, b3Val;
542 __m256 c0Val, c1Val, c2Val, c3Val;
543
544 __m256 dotProdVal0 = _mm256_setzero_ps();
545 __m256 dotProdVal1 = _mm256_setzero_ps();
546 __m256 dotProdVal2 = _mm256_setzero_ps();
547 __m256 dotProdVal3 = _mm256_setzero_ps();
548
549 for (; number < thirtysecondPoints; number++) {
550
551 a0Val = _mm256_loadu_ps(aPtr);
552 a1Val = _mm256_loadu_ps(aPtr + 8);
553 a2Val = _mm256_loadu_ps(aPtr + 16);
554 a3Val = _mm256_loadu_ps(aPtr + 24);
555 b0Val = _mm256_loadu_ps(bPtr);
556 b1Val = _mm256_loadu_ps(bPtr + 8);
557 b2Val = _mm256_loadu_ps(bPtr + 16);
558 b3Val = _mm256_loadu_ps(bPtr + 24);
559
560 c0Val = _mm256_mul_ps(a0Val, b0Val);
561 c1Val = _mm256_mul_ps(a1Val, b1Val);
562 c2Val = _mm256_mul_ps(a2Val, b2Val);
563 c3Val = _mm256_mul_ps(a3Val, b3Val);
564
565 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
566 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
567 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
568 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
569
570 aPtr += 32;
571 bPtr += 32;
572 }
573
574 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
575 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
576 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
577
578 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
579
580 _mm256_store_ps(dotProductVector,
581 dotProdVal0); // Store the results back into the dot product vector
582
583 dotProduct = dotProductVector[0];
584 dotProduct += dotProductVector[1];
585 dotProduct += dotProductVector[2];
586 dotProduct += dotProductVector[3];
587 dotProduct += dotProductVector[4];
588 dotProduct += dotProductVector[5];
589 dotProduct += dotProductVector[6];
590 dotProduct += dotProductVector[7];
591
592 number = thirtysecondPoints * 32;
593 for (; number < num_points; number++) {
594 dotProduct += ((*aPtr++) * (*bPtr++));
595 }
596
597 *result = (short)rintf(dotProduct);
598}
599
600#endif /*LV_HAVE_AVX*/
601
602#ifdef LV_HAVE_AVX512F
603
604static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result,
605 const float* input,
606 const float* taps,
607 unsigned int num_points)
608{
609
610 unsigned int number = 0;
611 const unsigned int sixtyfourthPoints = num_points / 64;
612
613 float dotProduct = 0;
614 const float* aPtr = input;
615 const float* bPtr = taps;
616
617 __m512 a0Val, a1Val, a2Val, a3Val;
618 __m512 b0Val, b1Val, b2Val, b3Val;
619
620 __m512 dotProdVal0 = _mm512_setzero_ps();
621 __m512 dotProdVal1 = _mm512_setzero_ps();
622 __m512 dotProdVal2 = _mm512_setzero_ps();
623 __m512 dotProdVal3 = _mm512_setzero_ps();
624
625 for (; number < sixtyfourthPoints; number++) {
626
627 a0Val = _mm512_loadu_ps(aPtr);
628 a1Val = _mm512_loadu_ps(aPtr + 16);
629 a2Val = _mm512_loadu_ps(aPtr + 32);
630 a3Val = _mm512_loadu_ps(aPtr + 48);
631 b0Val = _mm512_loadu_ps(bPtr);
632 b1Val = _mm512_loadu_ps(bPtr + 16);
633 b2Val = _mm512_loadu_ps(bPtr + 32);
634 b3Val = _mm512_loadu_ps(bPtr + 48);
635
636 dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
637 dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
638 dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
639 dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
640
641 aPtr += 64;
642 bPtr += 64;
643 }
644
645 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
646 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
647 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
648
649 __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
650
651 _mm512_storeu_ps(dotProductVector,
652 dotProdVal0); // Store the results back into the dot product vector
653
654 dotProduct = dotProductVector[0];
655 dotProduct += dotProductVector[1];
656 dotProduct += dotProductVector[2];
657 dotProduct += dotProductVector[3];
658 dotProduct += dotProductVector[4];
659 dotProduct += dotProductVector[5];
660 dotProduct += dotProductVector[6];
661 dotProduct += dotProductVector[7];
662 dotProduct += dotProductVector[8];
663 dotProduct += dotProductVector[9];
664 dotProduct += dotProductVector[10];
665 dotProduct += dotProductVector[11];
666 dotProduct += dotProductVector[12];
667 dotProduct += dotProductVector[13];
668 dotProduct += dotProductVector[14];
669 dotProduct += dotProductVector[15];
670
671 number = sixtyfourthPoints * 64;
672 for (; number < num_points; number++) {
673 dotProduct += ((*aPtr++) * (*bPtr++));
674 }
675
676 *result = (short)rintf(dotProduct);
677}
678
679#endif /*LV_HAVE_AVX512F*/
680
681#ifdef LV_HAVE_RVV
682#include <riscv_vector.h>
683
685
/*!
 * \brief RVV variant: obtains a float result from
 *        volk_32f_x2_dot_prod_32f_rvv, then rounds and saturates it to
 *        int16_t.
 *
 * The accumulation itself is delegated to volk_32f_x2_dot_prod_32f_rvv
 * (presumably the float dot-product kernel; defined outside this block).
 * Its result is rounded with rintf(); values outside the int16_t range
 * saturate, and a NaN is stored as 0.
 *
 * \param result     Output: receives the single rounded value.
 * \param input      First operand, forwarded unchanged to the float kernel.
 * \param taps       Second operand, forwarded unchanged to the float kernel.
 * \param num_points Element count, forwarded unchanged to the float kernel.
 */
static inline void volk_32f_x2_dot_prod_16i_rvv(int16_t* result,
                                                const float* input,
                                                const float* taps,
                                                unsigned int num_points)
{
    float fresult = 0;
    volk_32f_x2_dot_prod_32f_rvv(&fresult, input, taps, num_points);

    /* Converting an out-of-range (or NaN) float to int16_t is undefined
     * behavior, so clamp to the representable range before the cast. */
    const float rounded = rintf(fresult);
    if (rounded >= (float)INT16_MAX) {
        *result = INT16_MAX;
    } else if (rounded <= (float)INT16_MIN) {
        *result = INT16_MIN;
    } else if (rounded != rounded) {
        *result = 0; /* NaN: no meaningful integer value */
    } else {
        *result = (int16_t)rounded;
    }
}
695#endif /*LV_HAVE_RVV*/
696
697#endif /*INCLUDED_volk_32f_x2_dot_prod_16i_H*/
static float rintf(float x)
Definition config.h:45
static void volk_32f_x2_dot_prod_16i_u_sse(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition volk_32f_x2_dot_prod_16i.h:381
static void volk_32f_x2_dot_prod_16i_generic(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition volk_32f_x2_dot_prod_16i.h:55
static void volk_32f_x2_dot_prod_16i_a_avx(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition volk_32f_x2_dot_prod_16i.h:224
static void volk_32f_x2_dot_prod_16i_u_avx(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition volk_32f_x2_dot_prod_16i.h:527
static void volk_32f_x2_dot_prod_16i_a_sse(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition volk_32f_x2_dot_prod_16i.h:78
#define __VOLK_ATTR_ALIGNED(x)
Definition volk_common.h:62