58#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
59#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
/*!
 * \brief Portable scalar dot product of two float vectors.
 *
 * Multiplies input[i] by taps[i] for every i in [0, num_points) and writes the
 * running float sum to *result. With num_points == 0 the result is 0.
 *
 * \param result     Output: receives the accumulated sum.
 * \param input      First operand vector, num_points floats.
 * \param taps       Second operand vector, num_points floats.
 * \param num_points Number of element pairs to multiply and accumulate.
 */
static inline void volk_32f_x2_dot_prod_32f_generic(float* result,
                                                    const float* input,
                                                    const float* taps,
                                                    unsigned int num_points)
{
    float dotProduct = 0;
    const float* aPtr = input;
    const float* bPtr = taps;
    unsigned int number = 0;

    /* Strictly sequential accumulation, one product per iteration. */
    for (number = 0; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}
/*
 * Fragment of an SSE dot-product kernel that reads both operands with unaligned
 * loads (_mm_loadu_ps). The listing's index places volk_32f_x2_dot_prod_32f_u_sse
 * at line 92, so this is presumably that function.
 *
 * Visible behaviour: input[] and taps[] are multiplied 16 floats per vector
 * iteration into four independent __m128 accumulators, the accumulators are
 * folded to one scalar, the remaining elements are handled by a scalar loop,
 * and the sum is written to *result.
 *
 * NOTE(review): the embedded line numbers skip values (e.g. 133 to 139), so lines
 * are missing from this listing. The start of the signature, the braces, any
 * pointer advance inside the vector loop and the declaration of
 * dotProductVector are not visible here; confirm against the original header.
 */
95 unsigned int num_points)
98 unsigned int number = 0;
/* Number of complete 16-float groups consumed by the vector loop. */
99 const unsigned int sixteenthPoints = num_points / 16;
101 float dotProduct = 0;
102 const float* aPtr = input;
103 const float* bPtr = taps;
105 __m128 a0Val, a1Val, a2Val, a3Val;
106 __m128 b0Val, b1Val, b2Val, b3Val;
107 __m128 c0Val, c1Val, c2Val, c3Val;
/* Four separate accumulators, one per 4-float slice of each 16-float group. */
109 __m128 dotProdVal0 = _mm_setzero_ps();
110 __m128 dotProdVal1 = _mm_setzero_ps();
111 __m128 dotProdVal2 = _mm_setzero_ps();
112 __m128 dotProdVal3 = _mm_setzero_ps();
114 for (; number < sixteenthPoints; number++) {
/* Unaligned loads of 4 x 4 floats from each operand. */
116 a0Val = _mm_loadu_ps(aPtr);
117 a1Val = _mm_loadu_ps(aPtr + 4);
118 a2Val = _mm_loadu_ps(aPtr + 8);
119 a3Val = _mm_loadu_ps(aPtr + 12);
120 b0Val = _mm_loadu_ps(bPtr);
121 b1Val = _mm_loadu_ps(bPtr + 4);
122 b2Val = _mm_loadu_ps(bPtr + 8);
123 b3Val = _mm_loadu_ps(bPtr + 12);
/* Element-wise products. */
125 c0Val = _mm_mul_ps(a0Val, b0Val);
126 c1Val = _mm_mul_ps(a1Val, b1Val);
127 c2Val = _mm_mul_ps(a2Val, b2Val);
128 c3Val = _mm_mul_ps(a3Val, b3Val);
/* Accumulate each product vector into its own accumulator. */
130 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
131 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
132 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
133 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
/* Fold the four accumulators into dotProdVal0. */
139 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
140 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
141 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
/* The store call is cut off in this listing; its second argument is not visible.
 * _mm_store_ps needs a 16-byte aligned destination. */
145 _mm_store_ps(dotProductVector,
/* Horizontal sum of the four stored lanes. */
148 dotProduct = dotProductVector[0];
149 dotProduct += dotProductVector[1];
150 dotProduct += dotProductVector[2];
151 dotProduct += dotProductVector[3];
/* Scalar tail for the num_points % 16 elements the vector loop did not cover. */
153 number = sixteenthPoints * 16;
154 for (; number < num_points; number++) {
155 dotProduct += ((*aPtr++) * (*bPtr++));
158 *result = dotProduct;
165#include <pmmintrin.h>
/*
 * Fragment of a dot-product kernel that follows the <pmmintrin.h> include and
 * uses unaligned loads (_mm_loadu_ps). The listing's index places
 * volk_32f_x2_dot_prod_32f_u_sse3 at line 167, so this is presumably that
 * function.
 *
 * Visible behaviour: 16 floats of input[] and taps[] are multiplied per vector
 * iteration into four __m128 accumulators, which are then folded, summed
 * lane by lane, extended by a scalar tail loop and stored in *result.
 *
 * NOTE(review): lines are missing from this listing (the embedded numbers skip
 * values). The signature start, braces, any pointer advance in the vector loop
 * and the declaration of dotProductVector are not visible; confirm against
 * the original header.
 */
170 unsigned int num_points)
172 unsigned int number = 0;
/* Count of full 16-float groups. */
173 const unsigned int sixteenthPoints = num_points / 16;
175 float dotProduct = 0;
176 const float* aPtr = input;
177 const float* bPtr = taps;
179 __m128 a0Val, a1Val, a2Val, a3Val;
180 __m128 b0Val, b1Val, b2Val, b3Val;
181 __m128 c0Val, c1Val, c2Val, c3Val;
/* Independent partial sums, zero-initialised. */
183 __m128 dotProdVal0 = _mm_setzero_ps();
184 __m128 dotProdVal1 = _mm_setzero_ps();
185 __m128 dotProdVal2 = _mm_setzero_ps();
186 __m128 dotProdVal3 = _mm_setzero_ps();
188 for (; number < sixteenthPoints; number++) {
/* Unaligned loads: no alignment requirement on aPtr / bPtr. */
190 a0Val = _mm_loadu_ps(aPtr);
191 a1Val = _mm_loadu_ps(aPtr + 4);
192 a2Val = _mm_loadu_ps(aPtr + 8);
193 a3Val = _mm_loadu_ps(aPtr + 12);
194 b0Val = _mm_loadu_ps(bPtr);
195 b1Val = _mm_loadu_ps(bPtr + 4);
196 b2Val = _mm_loadu_ps(bPtr + 8);
197 b3Val = _mm_loadu_ps(bPtr + 12);
199 c0Val = _mm_mul_ps(a0Val, b0Val);
200 c1Val = _mm_mul_ps(a1Val, b1Val);
201 c2Val = _mm_mul_ps(a2Val, b2Val);
202 c3Val = _mm_mul_ps(a3Val, b3Val);
204 dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
205 dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
206 dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
207 dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
/* Combine the four partial-sum vectors. */
213 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
214 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
215 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
/* Truncated in this listing: the value being stored is not visible. */
218 _mm_store_ps(dotProductVector,
/* Reduce the four stored lanes to one scalar. */
221 dotProduct = dotProductVector[0];
222 dotProduct += dotProductVector[1];
223 dotProduct += dotProductVector[2];
224 dotProduct += dotProductVector[3];
/* Remaining elements past the last full 16-float group. */
226 number = sixteenthPoints * 16;
227 for (; number < num_points; number++) {
228 dotProduct += ((*aPtr++) * (*bPtr++));
231 *result = dotProduct;
238#include <smmintrin.h>
/*
 * SSE4.1 dot-product kernel using unaligned loads and the _mm_dp_ps
 * dot-product instruction. Four 4-float dot products are computed per
 * iteration, merged into one vector and accumulated; the accumulator is then
 * summed lane by lane, a scalar loop handles the remaining elements, and the
 * total is written to *result.
 *
 * NOTE(review): lines are missing from this listing (the embedded numbers skip
 * values). The input / taps parameters, the braces and the declaration of
 * dotProductVector are not visible. The four loads of each operand below all
 * show the same pointer; the skipped lines between them (262, 264, ...)
 * presumably advance it. Confirm against the original header.
 */
240static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(
float* result,
243 unsigned int num_points)
245 unsigned int number = 0;
/* Number of complete 16-float groups. */
246 const unsigned int sixteenthPoints = num_points / 16;
248 float dotProduct = 0;
249 const float* aPtr = input;
250 const float* bPtr = taps;
252 __m128 aVal1, bVal1, cVal1;
253 __m128 aVal2, bVal2, cVal2;
254 __m128 aVal3, bVal3, cVal3;
255 __m128 aVal4, bVal4, cVal4;
257 __m128 dotProdVal = _mm_setzero_ps();
259 for (; number < sixteenthPoints; number++) {
261 aVal1 = _mm_loadu_ps(aPtr);
263 aVal2 = _mm_loadu_ps(aPtr);
265 aVal3 = _mm_loadu_ps(aPtr);
267 aVal4 = _mm_loadu_ps(aPtr);
270 bVal1 = _mm_loadu_ps(bPtr);
272 bVal2 = _mm_loadu_ps(bPtr);
274 bVal3 = _mm_loadu_ps(bPtr);
276 bVal4 = _mm_loadu_ps(bPtr);
/* Mask high nibble 0xF: multiply all four lanes. Low nibble 1/2/4/8: write the
 * sum to output lane 0/1/2/3 respectively and zero the other lanes. */
279 cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
280 cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
281 cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
282 cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
/* Each result occupies a distinct lane, so OR merges them into one vector. */
284 cVal1 = _mm_or_ps(cVal1, cVal2);
285 cVal3 = _mm_or_ps(cVal3, cVal4);
286 cVal1 = _mm_or_ps(cVal1, cVal3);
288 dotProdVal = _mm_add_ps(dotProdVal, cVal1);
/* Truncated in this listing: the stored value is not visible. */
292 _mm_store_ps(dotProductVector,
/* Horizontal sum of the accumulator lanes. */
295 dotProduct = dotProductVector[0];
296 dotProduct += dotProductVector[1];
297 dotProduct += dotProductVector[2];
298 dotProduct += dotProductVector[3];
/* Scalar tail for the last num_points % 16 elements. */
300 number = sixteenthPoints * 16;
301 for (; number < num_points; number++) {
302 dotProduct += ((*aPtr++) * (*bPtr++));
305 *result = dotProduct;
312#include <immintrin.h>
/*
 * Fragment of an AVX dot-product kernel that uses unaligned 256-bit loads
 * (_mm256_loadu_ps). The listing's index places volk_32f_x2_dot_prod_32f_u_avx
 * at line 314, so this is presumably that function.
 *
 * Visible behaviour: 16 floats of input[] and taps[] are multiplied per vector
 * iteration into two __m256 accumulators, which are added together, summed
 * over their eight lanes, extended by a scalar tail loop and stored in *result.
 *
 * NOTE(review): lines are missing from this listing (the embedded numbers skip
 * values). The signature start, braces, the declarations of a0Val / a1Val /
 * b0Val / b1Val / c0Val / c1Val and dotProductVector, and any pointer advance
 * in the vector loop are not visible; confirm against the original header.
 */
317 unsigned int num_points)
320 unsigned int number = 0;
/* Number of complete 16-float groups (two 8-float vectors each). */
321 const unsigned int sixteenthPoints = num_points / 16;
323 float dotProduct = 0;
324 const float* aPtr = input;
325 const float* bPtr = taps;
331 __m256 dotProdVal0 = _mm256_setzero_ps();
332 __m256 dotProdVal1 = _mm256_setzero_ps();
334 for (; number < sixteenthPoints; number++) {
/* Unaligned loads of 2 x 8 floats from each operand. */
336 a0Val = _mm256_loadu_ps(aPtr);
337 a1Val = _mm256_loadu_ps(aPtr + 8);
338 b0Val = _mm256_loadu_ps(bPtr);
339 b1Val = _mm256_loadu_ps(bPtr + 8);
341 c0Val = _mm256_mul_ps(a0Val, b0Val);
342 c1Val = _mm256_mul_ps(a1Val, b1Val);
344 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
345 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
/* Merge the two accumulators. */
351 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
/* Truncated in this listing: the stored value is not visible. The unaligned
 * store places no alignment requirement on dotProductVector. */
355 _mm256_storeu_ps(dotProductVector,
/* Horizontal sum of the eight stored lanes. */
358 dotProduct = dotProductVector[0];
359 dotProduct += dotProductVector[1];
360 dotProduct += dotProductVector[2];
361 dotProduct += dotProductVector[3];
362 dotProduct += dotProductVector[4];
363 dotProduct += dotProductVector[5];
364 dotProduct += dotProductVector[6];
365 dotProduct += dotProductVector[7];
/* Scalar tail for the last num_points % 16 elements. */
367 number = sixteenthPoints * 16;
368 for (; number < num_points; number++) {
369 dotProduct += ((*aPtr++) * (*bPtr++));
372 *result = dotProduct;
377#if LV_HAVE_AVX2 && LV_HAVE_FMA
378#include <immintrin.h>
/*
 * AVX2 + FMA dot-product kernel using unaligned 256-bit loads. Each iteration
 * fuses the multiply and the accumulate for 8 floats into a single
 * __m256 accumulator; the eight lanes are then summed, a scalar loop handles
 * the remaining num_points % 8 elements, and the total is written to *result.
 *
 * NOTE(review): lines are missing from this listing (the embedded numbers skip
 * values). The input / taps parameters, the braces, the declarations of
 * number, aVal1, bVal1 and dotProductVector, and any pointer advance in the
 * vector loop are not visible; confirm against the original header.
 */
379static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(
float* result,
382 unsigned int num_points)
/* Number of complete 8-float groups. */
385 const unsigned int eighthPoints = num_points / 8;
387 const float* aPtr = input;
388 const float* bPtr = taps;
390 __m256 dotProdVal = _mm256_setzero_ps();
393 for (number = 0; number < eighthPoints; number++) {
395 aVal1 = _mm256_loadu_ps(aPtr);
396 bVal1 = _mm256_loadu_ps(bPtr);
/* dotProdVal = aVal1 * bVal1 + dotProdVal, computed as one fused operation. */
400 dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
/* Truncated in this listing: the stored value is not visible. */
404 _mm256_storeu_ps(dotProductVector,
/* Horizontal sum of the eight stored lanes. */
407 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
408 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
409 dotProductVector[6] + dotProductVector[7];
/* Scalar tail starting at the first element past the last full group. */
411 for (number = eighthPoints * 8; number < num_points; number++) {
412 dotProduct += ((*aPtr++) * (*bPtr++));
415 *result = dotProduct;
420#include <immintrin.h>
/*
 * AVX-512F dot-product kernel using unaligned 512-bit loads. Each iteration
 * performs a fused multiply-add over 16 floats into a single __m512
 * accumulator; the sixteen lanes are then summed, a scalar loop handles the
 * remaining num_points % 16 elements, and the total is written to *result.
 *
 * NOTE(review): lines are missing from this listing (the embedded numbers skip
 * values). The input / taps parameters, the braces, the declarations of
 * number, aVal1, bVal1 and dotProductVector, and any pointer advance in the
 * vector loop are not visible; confirm against the original header.
 */
421static inline void volk_32f_x2_dot_prod_32f_u_avx512f(
float* result,
424 unsigned int num_points)
/* Number of complete 16-float groups. */
427 const unsigned int sixteenthPoints = num_points / 16;
429 const float* aPtr = input;
430 const float* bPtr = taps;
432 __m512 dotProdVal = _mm512_setzero_ps();
435 for (number = 0; number < sixteenthPoints; number++) {
437 aVal1 = _mm512_loadu_ps(aPtr);
438 bVal1 = _mm512_loadu_ps(bPtr);
/* Fused multiply-add: dotProdVal = aVal1 * bVal1 + dotProdVal. */
442 dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
/* Truncated in this listing: the stored value is not visible. */
446 _mm512_storeu_ps(dotProductVector,
/* Horizontal sum of the sixteen stored lanes. */
449 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
450 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
451 dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
452 dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
453 dotProductVector[12] + dotProductVector[13] +
454 dotProductVector[14] + dotProductVector[15];
/* Scalar tail starting at the first element past the last full group. */
456 for (number = sixteenthPoints * 16; number < num_points; number++) {
457 dotProduct += ((*aPtr++) * (*bPtr++));
460 *result = dotProduct;
466#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
467#define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
/*
 * Fragment of an SSE dot-product kernel that reads both operands with aligned
 * loads (_mm_load_ps), so input and taps must be 16-byte aligned. The
 * listing's index places volk_32f_x2_dot_prod_32f_a_sse at line 476, so this
 * is presumably that function.
 *
 * Visible behaviour: 16 floats of input[] and taps[] are multiplied per vector
 * iteration into four __m128 accumulators, which are folded, summed lane by
 * lane, extended by a scalar tail loop and stored in *result.
 *
 * NOTE(review): lines are missing from this listing (the embedded numbers skip
 * values). The signature start, braces, any pointer advance in the vector loop
 * and the declaration of dotProductVector are not visible; confirm against
 * the original header.
 */
479 unsigned int num_points)
482 unsigned int number = 0;
/* Number of complete 16-float groups. */
483 const unsigned int sixteenthPoints = num_points / 16;
485 float dotProduct = 0;
486 const float* aPtr = input;
487 const float* bPtr = taps;
489 __m128 a0Val, a1Val, a2Val, a3Val;
490 __m128 b0Val, b1Val, b2Val, b3Val;
491 __m128 c0Val, c1Val, c2Val, c3Val;
/* Four independent partial-sum vectors. */
493 __m128 dotProdVal0 = _mm_setzero_ps();
494 __m128 dotProdVal1 = _mm_setzero_ps();
495 __m128 dotProdVal2 = _mm_setzero_ps();
496 __m128 dotProdVal3 = _mm_setzero_ps();
498 for (; number < sixteenthPoints; number++) {
/* Aligned loads: each address must be a multiple of 16 bytes. */
500 a0Val = _mm_load_ps(aPtr);
501 a1Val = _mm_load_ps(aPtr + 4);
502 a2Val = _mm_load_ps(aPtr + 8);
503 a3Val = _mm_load_ps(aPtr + 12);
504 b0Val = _mm_load_ps(bPtr);
505 b1Val = _mm_load_ps(bPtr + 4);
506 b2Val = _mm_load_ps(bPtr + 8);
507 b3Val = _mm_load_ps(bPtr + 12);
509 c0Val = _mm_mul_ps(a0Val, b0Val);
510 c1Val = _mm_mul_ps(a1Val, b1Val);
511 c2Val = _mm_mul_ps(a2Val, b2Val);
512 c3Val = _mm_mul_ps(a3Val, b3Val);
514 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
515 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
516 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
517 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
/* Fold the four accumulators into dotProdVal0. */
523 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
524 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
525 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
/* Truncated in this listing: the stored value is not visible. */
529 _mm_store_ps(dotProductVector,
/* Horizontal sum of the four stored lanes. */
532 dotProduct = dotProductVector[0];
533 dotProduct += dotProductVector[1];
534 dotProduct += dotProductVector[2];
535 dotProduct += dotProductVector[3];
/* Scalar tail for the last num_points % 16 elements. */
537 number = sixteenthPoints * 16;
538 for (; number < num_points; number++) {
539 dotProduct += ((*aPtr++) * (*bPtr++));
542 *result = dotProduct;
549#include <pmmintrin.h>
/*
 * Fragment of a dot-product kernel that follows the <pmmintrin.h> include and
 * uses aligned loads (_mm_load_ps), so input and taps must be 16-byte aligned.
 * The listing's index places volk_32f_x2_dot_prod_32f_a_sse3 at line 551, so
 * this is presumably that function.
 *
 * Visible behaviour: 16 floats of input[] and taps[] are multiplied per vector
 * iteration into four __m128 accumulators, which are folded, summed lane by
 * lane, extended by a scalar tail loop and stored in *result.
 *
 * NOTE(review): lines are missing from this listing (the embedded numbers skip
 * values). The signature start, braces, any pointer advance in the vector loop
 * and the declaration of dotProductVector are not visible; confirm against
 * the original header.
 */
554 unsigned int num_points)
556 unsigned int number = 0;
/* Count of full 16-float groups. */
557 const unsigned int sixteenthPoints = num_points / 16;
559 float dotProduct = 0;
560 const float* aPtr = input;
561 const float* bPtr = taps;
563 __m128 a0Val, a1Val, a2Val, a3Val;
564 __m128 b0Val, b1Val, b2Val, b3Val;
565 __m128 c0Val, c1Val, c2Val, c3Val;
567 __m128 dotProdVal0 = _mm_setzero_ps();
568 __m128 dotProdVal1 = _mm_setzero_ps();
569 __m128 dotProdVal2 = _mm_setzero_ps();
570 __m128 dotProdVal3 = _mm_setzero_ps();
572 for (; number < sixteenthPoints; number++) {
/* Aligned loads of 4 x 4 floats from each operand. */
574 a0Val = _mm_load_ps(aPtr);
575 a1Val = _mm_load_ps(aPtr + 4);
576 a2Val = _mm_load_ps(aPtr + 8);
577 a3Val = _mm_load_ps(aPtr + 12);
578 b0Val = _mm_load_ps(bPtr);
579 b1Val = _mm_load_ps(bPtr + 4);
580 b2Val = _mm_load_ps(bPtr + 8);
581 b3Val = _mm_load_ps(bPtr + 12);
583 c0Val = _mm_mul_ps(a0Val, b0Val);
584 c1Val = _mm_mul_ps(a1Val, b1Val);
585 c2Val = _mm_mul_ps(a2Val, b2Val);
586 c3Val = _mm_mul_ps(a3Val, b3Val);
588 dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
589 dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
590 dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
591 dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
/* Combine the four partial-sum vectors. */
597 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
598 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
599 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
/* Truncated in this listing: the stored value is not visible. */
602 _mm_store_ps(dotProductVector,
/* Reduce the four stored lanes to one scalar. */
605 dotProduct = dotProductVector[0];
606 dotProduct += dotProductVector[1];
607 dotProduct += dotProductVector[2];
608 dotProduct += dotProductVector[3];
/* Remaining elements past the last full 16-float group. */
610 number = sixteenthPoints * 16;
611 for (; number < num_points; number++) {
612 dotProduct += ((*aPtr++) * (*bPtr++));
615 *result = dotProduct;
622#include <smmintrin.h>
/*
 * SSE4.1 dot-product kernel using aligned loads (_mm_load_ps, 16-byte aligned
 * addresses required) and the _mm_dp_ps dot-product instruction. Four 4-float
 * dot products are computed per iteration, merged into one vector and
 * accumulated; the accumulator is then summed lane by lane, a scalar loop
 * handles the remaining elements, and the total is written to *result.
 *
 * NOTE(review): lines are missing from this listing (the embedded numbers skip
 * values). The input / taps parameters, the braces and the declaration of
 * dotProductVector are not visible. The four loads of each operand below all
 * show the same pointer; the skipped lines between them (646, 648, ...)
 * presumably advance it. Confirm against the original header.
 */
624static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(
float* result,
627 unsigned int num_points)
629 unsigned int number = 0;
/* Number of complete 16-float groups. */
630 const unsigned int sixteenthPoints = num_points / 16;
632 float dotProduct = 0;
633 const float* aPtr = input;
634 const float* bPtr = taps;
636 __m128 aVal1, bVal1, cVal1;
637 __m128 aVal2, bVal2, cVal2;
638 __m128 aVal3, bVal3, cVal3;
639 __m128 aVal4, bVal4, cVal4;
641 __m128 dotProdVal = _mm_setzero_ps();
643 for (; number < sixteenthPoints; number++) {
645 aVal1 = _mm_load_ps(aPtr);
647 aVal2 = _mm_load_ps(aPtr);
649 aVal3 = _mm_load_ps(aPtr);
651 aVal4 = _mm_load_ps(aPtr);
654 bVal1 = _mm_load_ps(bPtr);
656 bVal2 = _mm_load_ps(bPtr);
658 bVal3 = _mm_load_ps(bPtr);
660 bVal4 = _mm_load_ps(bPtr);
/* Mask high nibble 0xF: multiply all four lanes. Low nibble 1/2/4/8: write the
 * sum to output lane 0/1/2/3 respectively and zero the other lanes. */
663 cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
664 cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
665 cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
666 cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
/* Each result occupies a distinct lane, so OR merges them into one vector. */
668 cVal1 = _mm_or_ps(cVal1, cVal2);
669 cVal3 = _mm_or_ps(cVal3, cVal4);
670 cVal1 = _mm_or_ps(cVal1, cVal3);
672 dotProdVal = _mm_add_ps(dotProdVal, cVal1);
/* Truncated in this listing: the stored value is not visible. */
676 _mm_store_ps(dotProductVector,
/* Horizontal sum of the accumulator lanes. */
679 dotProduct = dotProductVector[0];
680 dotProduct += dotProductVector[1];
681 dotProduct += dotProductVector[2];
682 dotProduct += dotProductVector[3];
/* Scalar tail for the last num_points % 16 elements. */
684 number = sixteenthPoints * 16;
685 for (; number < num_points; number++) {
686 dotProduct += ((*aPtr++) * (*bPtr++));
689 *result = dotProduct;
696#include <immintrin.h>
/*
 * Fragment of an AVX dot-product kernel that uses aligned 256-bit loads
 * (_mm256_load_ps), so input and taps must be 32-byte aligned. The listing's
 * index places volk_32f_x2_dot_prod_32f_a_avx at line 698, so this is
 * presumably that function.
 *
 * Visible behaviour: 16 floats of input[] and taps[] are multiplied per vector
 * iteration into two __m256 accumulators, which are added together, summed
 * over their eight lanes, extended by a scalar tail loop and stored in *result.
 *
 * NOTE(review): lines are missing from this listing (the embedded numbers skip
 * values). The signature start, braces, the declarations of a0Val / a1Val /
 * b0Val / b1Val / c0Val / c1Val and dotProductVector, and any pointer advance
 * in the vector loop are not visible; confirm against the original header.
 */
701 unsigned int num_points)
704 unsigned int number = 0;
/* Number of complete 16-float groups (two 8-float vectors each). */
705 const unsigned int sixteenthPoints = num_points / 16;
707 float dotProduct = 0;
708 const float* aPtr = input;
709 const float* bPtr = taps;
715 __m256 dotProdVal0 = _mm256_setzero_ps();
716 __m256 dotProdVal1 = _mm256_setzero_ps();
718 for (; number < sixteenthPoints; number++) {
/* Aligned loads of 2 x 8 floats from each operand. */
720 a0Val = _mm256_load_ps(aPtr);
721 a1Val = _mm256_load_ps(aPtr + 8);
722 b0Val = _mm256_load_ps(bPtr);
723 b1Val = _mm256_load_ps(bPtr + 8);
725 c0Val = _mm256_mul_ps(a0Val, b0Val);
726 c1Val = _mm256_mul_ps(a1Val, b1Val);
728 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
729 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
/* Merge the two accumulators. */
735 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
/* Truncated in this listing: the stored value is not visible. The aligned
 * store needs a 32-byte aligned dotProductVector. */
739 _mm256_store_ps(dotProductVector,
/* Horizontal sum of the eight stored lanes. */
742 dotProduct = dotProductVector[0];
743 dotProduct += dotProductVector[1];
744 dotProduct += dotProductVector[2];
745 dotProduct += dotProductVector[3];
746 dotProduct += dotProductVector[4];
747 dotProduct += dotProductVector[5];
748 dotProduct += dotProductVector[6];
749 dotProduct += dotProductVector[7];
/* Scalar tail for the last num_points % 16 elements. */
751 number = sixteenthPoints * 16;
752 for (; number < num_points; number++) {
753 dotProduct += ((*aPtr++) * (*bPtr++));
756 *result = dotProduct;
761#if LV_HAVE_AVX2 && LV_HAVE_FMA
762#include <immintrin.h>
/*
 * AVX2 + FMA dot-product kernel using aligned 256-bit loads (32-byte aligned
 * addresses required). Each iteration fuses the multiply and the accumulate
 * for 8 floats into a single __m256 accumulator; the eight lanes are then
 * summed, a scalar loop handles the remaining num_points % 8 elements, and
 * the total is written to *result.
 *
 * NOTE(review): lines are missing from this listing (the embedded numbers skip
 * values). The input / taps parameters, the braces, the declarations of
 * number, aVal1, bVal1 and dotProductVector, and any pointer advance in the
 * vector loop are not visible; confirm against the original header.
 */
763static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(
float* result,
766 unsigned int num_points)
/* Number of complete 8-float groups. */
769 const unsigned int eighthPoints = num_points / 8;
771 const float* aPtr = input;
772 const float* bPtr = taps;
774 __m256 dotProdVal = _mm256_setzero_ps();
777 for (number = 0; number < eighthPoints; number++) {
779 aVal1 = _mm256_load_ps(aPtr);
780 bVal1 = _mm256_load_ps(bPtr);
/* dotProdVal = aVal1 * bVal1 + dotProdVal, computed as one fused operation. */
784 dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
/* Truncated in this listing: the stored value is not visible. */
788 _mm256_store_ps(dotProductVector,
/* Horizontal sum of the eight stored lanes. */
791 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
792 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
793 dotProductVector[6] + dotProductVector[7];
/* Scalar tail starting at the first element past the last full group. */
795 for (number = eighthPoints * 8; number < num_points; number++) {
796 dotProduct += ((*aPtr++) * (*bPtr++));
799 *result = dotProduct;
804#include <immintrin.h>
/*
 * AVX-512F dot-product kernel using aligned 512-bit loads (64-byte aligned
 * addresses required). Each iteration performs a fused multiply-add over 16
 * floats into a single __m512 accumulator; the sixteen lanes are then summed,
 * a scalar loop handles the remaining num_points % 16 elements, and the total
 * is written to *result.
 *
 * NOTE(review): lines are missing from this listing (the embedded numbers skip
 * values). The input / taps parameters, the braces, the declarations of
 * number, aVal1, bVal1 and dotProductVector, and any pointer advance in the
 * vector loop are not visible; confirm against the original header.
 */
805static inline void volk_32f_x2_dot_prod_32f_a_avx512f(
float* result,
808 unsigned int num_points)
/* Number of complete 16-float groups. */
811 const unsigned int sixteenthPoints = num_points / 16;
813 const float* aPtr = input;
814 const float* bPtr = taps;
816 __m512 dotProdVal = _mm512_setzero_ps();
819 for (number = 0; number < sixteenthPoints; number++) {
821 aVal1 = _mm512_load_ps(aPtr);
822 bVal1 = _mm512_load_ps(bPtr);
/* Fused multiply-add: dotProdVal = aVal1 * bVal1 + dotProdVal. */
826 dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
/* Truncated in this listing: the stored value is not visible. */
830 _mm512_store_ps(dotProductVector,
/* Horizontal sum of the sixteen stored lanes. */
833 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
834 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
835 dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
836 dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
837 dotProductVector[12] + dotProductVector[13] +
838 dotProductVector[14] + dotProductVector[15];
/* Scalar tail starting at the first element past the last full group. */
840 for (number = sixteenthPoints * 16; number < num_points; number++) {
841 dotProduct += ((*aPtr++) * (*bPtr++));
844 *result = dotProduct;
/*
 * Fragment of a NEON dot-product kernel that processes 16 floats per
 * iteration with four multiply-accumulate lanes. The listing's index places
 * volk_32f_x2_dot_prod_32f_neonopts at line 851, so this is presumably that
 * function.
 *
 * Visible behaviour: vld4q_f32 de-interleaves 16 floats of each operand into
 * four vectors; both operands are split the same way, so element pairing is
 * preserved. The four accumulators are added together, the four lanes are
 * summed, a scalar loop handles the remaining elements, and the total is
 * written to *result.
 *
 * NOTE(review): lines are missing from this listing (the embedded numbers skip
 * values). The signature start, braces, the declaration of the accumulator
 * scratch array and any pointer advance in the vector loop are not visible;
 * confirm against the original header.
 */
854 unsigned int num_points)
/* Despite the name, this counts groups of 16 floats, not quarters. */
857 unsigned int quarter_points = num_points / 16;
858 float dotProduct = 0;
859 const float* aPtr = input;
860 const float* bPtr = taps;
861 unsigned int number = 0;
863 float32x4x4_t a_val, b_val, accumulator0;
/* Zero all four partial-sum vectors. */
864 accumulator0.val[0] = vdupq_n_f32(0);
865 accumulator0.val[1] = vdupq_n_f32(0);
866 accumulator0.val[2] = vdupq_n_f32(0);
867 accumulator0.val[3] = vdupq_n_f32(0);
870 for (number = 0; number < quarter_points; ++number) {
871 a_val = vld4q_f32(aPtr);
872 b_val = vld4q_f32(bPtr);
/* accumulator += a * b, one multiply-accumulate per de-interleaved vector. */
873 accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
874 accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]);
875 accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]);
876 accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]);
/* Pairwise fold of the four accumulators into val[0]. */
880 accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]);
881 accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]);
882 accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]);
/* Spill to memory and sum the four lanes. */
884 vst1q_f32(accumulator, accumulator0.val[0]);
885 dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
/* Scalar tail for the last num_points % 16 elements. */
887 for (number = quarter_points * 16; number < num_points; number++) {
888 dotProduct += ((*aPtr++) * (*bPtr++));
891 *result = dotProduct;
/*
 * Fragment of a NEON dot-product kernel that processes 8 floats per iteration
 * with two multiply-accumulate lanes. The listing's index places
 * volk_32f_x2_dot_prod_32f_neon at line 898, so this is presumably that
 * function.
 *
 * Visible behaviour: vld2q_f32 de-interleaves 8 floats of each operand into
 * two vectors; both operands are split the same way, so element pairing is
 * preserved. The two accumulators are added together, the four lanes are
 * summed, a scalar loop handles the remaining elements, and the total is
 * written to *result.
 *
 * NOTE(review): lines are missing from this listing (the embedded numbers skip
 * values). The signature start, braces, the declaration of the accumulator
 * scratch array and any pointer advance in the vector loop are not visible;
 * confirm against the original header.
 */
901 unsigned int num_points)
/* Despite the name, this counts groups of 8 floats, not quarters. */
904 unsigned int quarter_points = num_points / 8;
905 float dotProduct = 0;
906 const float* aPtr = input;
907 const float* bPtr = taps;
908 unsigned int number = 0;
910 float32x4x2_t a_val, b_val, accumulator_val;
911 accumulator_val.val[0] = vdupq_n_f32(0);
912 accumulator_val.val[1] = vdupq_n_f32(0);
914 for (number = 0; number < quarter_points; ++number) {
915 a_val = vld2q_f32(aPtr);
916 b_val = vld2q_f32(bPtr);
/* accumulator += a * b for each of the two de-interleaved vectors. */
917 accumulator_val.val[0] =
918 vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
919 accumulator_val.val[1] =
920 vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
/* Fold the two accumulators into val[0]. */
924 accumulator_val.val[0] = vaddq_f32(accumulator_val.val[0], accumulator_val.val[1]);
/* Spill to memory and sum the four lanes. */
926 vst1q_f32(accumulator, accumulator_val.val[0]);
927 dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
/* Scalar tail for the last num_points % 8 elements. */
929 for (number = quarter_points * 8; number < num_points; number++) {
930 dotProduct += ((*aPtr++) * (*bPtr++));
933 *result = dotProduct;
/*
 * External declaration only: the implementation is not in this listing.
 * NOTE(review): by its name this is presumably a hand-written NEON assembly
 * kernel with the same (result, input, taps, count) contract as the inline
 * kernels in this file; confirm against the assembly source.
 */
939extern void volk_32f_x2_dot_prod_32f_a_neonasm(
float* cVector,
940 const float* aVector,
941 const float* bVector,
942 unsigned int num_points);
/*
 * External declaration only: the implementation is not in this listing.
 * NOTE(review): by its name this is presumably an optimised variant of the
 * NEON assembly kernel, taking an output pointer, two input vectors and an
 * element count; confirm against the assembly source.
 */
946extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(
float* cVector,
947 const float* aVector,
948 const float* bVector,
949 unsigned int num_points);
953#include <riscv_vector.h>
/*
 * RISC-V Vector dot-product kernel. A vector-length-agnostic loop
 * multiply-accumulates input[] and taps[] into an LMUL=8 accumulator, which is
 * then reduced to a single float and written to *result.
 *
 * NOTE(review): lines are missing from this listing (the embedded numbers skip
 * values). The input / taps parameters, the braces and the declaration of v
 * (line 970) are not visible. v is presumably produced by narrowing vsum to an
 * m1 register via the RISCV_SHRINK8 helper listed in this file's index;
 * confirm against the original header.
 */
956static inline void volk_32f_x2_dot_prod_32f_rvv(
float* result,
959 unsigned int num_points)
/* Accumulator zeroed across the maximum vector length for e32/m8. */
961 vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8());
962 size_t n = num_points;
/* Each pass handles vl elements, then advances both operand pointers by vl. */
963 for (
size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
/* vl = number of elements the hardware will process this pass (at most n). */
964 vl = __riscv_vsetvl_e32m8(n);
965 vfloat32m8_t v0 = __riscv_vle32_v_f32m8(input, vl);
966 vfloat32m8_t v1 = __riscv_vle32_v_f32m8(taps, vl);
/* Tail-undisturbed multiply-accumulate: lanes at or beyond vl keep the partial
 * sums from earlier passes instead of being clobbered on a short last pass. */
967 vsum = __riscv_vfmacc_tu(vsum, v0, v1, vl);
969 size_t vl = __riscv_vsetvlmax_e32m1();
/* Unordered sum reduction of v, seeded with 0, then extract the scalar. */
971 v = __riscv_vfredusum(v, __riscv_vfmv_s_f_f32m1(0, vl), vl);
972 *result = __riscv_vfmv_f(v);
static void volk_32f_x2_dot_prod_32f_a_avx(float *result, const float *input, const float *taps, unsigned int num_points)
Definition volk_32f_x2_dot_prod_32f.h:698
static void volk_32f_x2_dot_prod_32f_a_sse(float *result, const float *input, const float *taps, unsigned int num_points)
Definition volk_32f_x2_dot_prod_32f.h:476
static void volk_32f_x2_dot_prod_32f_u_sse(float *result, const float *input, const float *taps, unsigned int num_points)
Definition volk_32f_x2_dot_prod_32f.h:92
static void volk_32f_x2_dot_prod_32f_u_avx(float *result, const float *input, const float *taps, unsigned int num_points)
Definition volk_32f_x2_dot_prod_32f.h:314
static void volk_32f_x2_dot_prod_32f_generic(float *result, const float *input, const float *taps, unsigned int num_points)
Definition volk_32f_x2_dot_prod_32f.h:68
static void volk_32f_x2_dot_prod_32f_u_sse3(float *result, const float *input, const float *taps, unsigned int num_points)
Definition volk_32f_x2_dot_prod_32f.h:167
static void volk_32f_x2_dot_prod_32f_a_sse3(float *result, const float *input, const float *taps, unsigned int num_points)
Definition volk_32f_x2_dot_prod_32f.h:551
static void volk_32f_x2_dot_prod_32f_neonopts(float *result, const float *input, const float *taps, unsigned int num_points)
Definition volk_32f_x2_dot_prod_32f.h:851
static void volk_32f_x2_dot_prod_32f_neon(float *result, const float *input, const float *taps, unsigned int num_points)
Definition volk_32f_x2_dot_prod_32f.h:898
#define __VOLK_ATTR_ALIGNED(x)
Definition volk_common.h:62
#define RISCV_SHRINK8(op, T, S, v)
Definition volk_rvv_intrinsics.h:33