#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H

#include <string.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>
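
/*!
 * \page volk_32fc_x2_dot_prod_32fc
 *
 * \b Overview
 *
 * Computes the dot product (inner product) of two complex float vectors:
 *
 *   result = sum_{k=0}^{num_points-1} input[k] * taps[k]
 *
 * Minimal usage sketch (the buffer setup is illustrative, not from this
 * file; volk_malloc/volk_free and the unsuffixed dispatcher are the
 * standard VOLK entry points):
 *
 *   unsigned int N = 1000;
 *   lv_32fc_t* input = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), volk_get_alignment());
 *   lv_32fc_t* taps = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), volk_get_alignment());
 *   lv_32fc_t result;
 *   // ... fill input and taps ...
 *   volk_32fc_x2_dot_prod_32fc(&result, input, taps, N);
 *   volk_free(input);
 *   volk_free(taps);
 */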
#ifdef LV_HAVE_RISCV64
extern void volk_32fc_x2_dot_prod_32fc_sifive_u74(lv_32fc_t* result,
                                                  const lv_32fc_t* input,
                                                  const lv_32fc_t* taps,
                                                  unsigned int num_points);
#endif /*LV_HAVE_RISCV64*/
#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result,
                                                      const lv_32fc_t* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{
    float* res = (float*)result;
    const float* in = (const float*)input;
    const float* tp = (const float*)taps;
    unsigned int n_2_ccomplex_blocks = num_points / 2;
    unsigned int i = 0;

    float sum0[2] = { 0, 0 };
    float sum1[2] = { 0, 0 };

    // Accumulate two complex points per iteration into independent
    // accumulators so the floating-point adds can overlap.
    for (i = 0; i < n_2_ccomplex_blocks; ++i) {
        sum0[0] += in[0] * tp[0] - in[1] * tp[1];
        sum0[1] += in[0] * tp[1] + in[1] * tp[0];
        sum1[0] += in[2] * tp[2] - in[3] * tp[3];
        sum1[1] += in[2] * tp[3] + in[3] * tp[2];

        in += 4; // two complex points = four floats
        tp += 4;
    }

    res[0] = sum0[0] + sum1[0];
    res[1] = sum0[1] + sum1[1];

    // Cleanup for an odd number of points
    if (num_points & 1) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }
}

#endif /*LV_HAVE_GENERIC*/
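
/* Every variant below evaluates the same per-point complex multiply,
 *   (a + bi)(c + di) = (ac - bd) + (ad + bc)i,
 * accumulated over num_points; the kernels differ only in how each
 * instruction set arranges the real and imaginary lanes. */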
#ifdef LV_HAVE_SSE3

#include <pmmintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;
    unsigned int isodd = num_points & 1;

    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm_setzero_ps();

    for (; number < halfPoints; number++) {
        x = _mm_loadu_ps((float*)a); // ar,ai,br,bi
        y = _mm_loadu_ps((float*)b); // cr,ci,dr,di

        yl = _mm_moveldup_ps(y); // cr,cr,dr,dr
        yh = _mm_movehdup_ps(y); // ci,ci,di,di

        tmp1 = _mm_mul_ps(x, yl); // ar*cr, ai*cr, br*dr, bi*dr

        x = _mm_shuffle_ps(x, x, 0xB1); // rearrange to ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // ai*ci, ar*ci, bi*di, br*di

        z = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm_add_ps(dotProdVal, z); // accumulate the products

        a += 2;
        b += 2;
    }

    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];

    _mm_storeu_ps((float*)dotProductVector, dotProdVal);

    dotProduct += (dotProductVector[0] + dotProductVector[1]);

    if (isodd) {
        dotProduct += input[num_points - 1] * taps[num_points - 1];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_SSE3*/
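
/* The SSE3 sequence above is the classic complex-multiply idiom:
 * moveldup/movehdup broadcast the real and imaginary parts of the taps,
 * the 0xB1 shuffle swaps each re/im pair of the input, and addsub
 * subtracts in the even (real) lanes while adding in the odd (imaginary)
 * lanes, producing ac - bd and ad + bc directly. The AVX kernels below
 * apply the same pattern to 256-bit registers. */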
#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result,
                                                    const lv_32fc_t* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{
    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)a); // ar,ai,br,bi, ...
        y = _mm256_loadu_ps((float*)b); // cr,ci,dr,di, ...

        yl = _mm256_moveldup_ps(y); // cr,cr,dr,dr, ...
        yh = _mm256_movehdup_ps(y); // ci,ci,di,di, ...

        tmp1 = _mm256_mul_ps(x, yl); // ar*cr, ai*cr, br*dr, bi*dr, ...

        x = _mm256_shuffle_ps(x, x, 0xB1); // ai,ar,bi,br, ...

        tmp2 = _mm256_mul_ps(x, yh); // ai*ci, ar*ci, bi*di, br*di, ...

        z = _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, ...

        dotProdVal = _mm256_add_ps(dotProdVal, z); // accumulate the products

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_storeu_ps((float*)dotProductVector, dotProdVal);

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX*/
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result,
                                                        const lv_32fc_t* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points)
{
    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)a);
        y = _mm256_loadu_ps((float*)b);

        yl = _mm256_moveldup_ps(y);
        yh = _mm256_movehdup_ps(y);

        tmp1 = x; // keep the original ordering for the fused multiply

        x = _mm256_shuffle_ps(x, x, 0xB1);

        tmp2 = _mm256_mul_ps(x, yh);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); // tmp1*yl -/+ tmp2

        dotProdVal = _mm256_add_ps(dotProdVal, z);

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_storeu_ps((float*)dotProductVector, dotProdVal);

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX && LV_HAVE_FMA*/
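
/* The FMA variant folds the first multiply into the accumulate step:
 * _mm256_fmaddsub_ps(tmp1, yl, tmp2) computes tmp1*yl - tmp2 in the even
 * (real) lanes and tmp1*yl + tmp2 in the odd (imaginary) lanes, saving a
 * separate _mm256_mul_ps per iteration compared with the plain AVX path. */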
#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/

#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
#ifdef LV_HAVE_SSE3

#include <pmmintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    unsigned int isodd = num_points & 1;

    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int halfPoints = num_bytes >> 4; // two complex points per iteration

    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm_setzero_ps();

    for (; number < halfPoints; number++) {
        x = _mm_load_ps((float*)a); // aligned load: ar,ai,br,bi
        y = _mm_load_ps((float*)b); // aligned load: cr,ci,dr,di

        yl = _mm_moveldup_ps(y); // cr,cr,dr,dr
        yh = _mm_movehdup_ps(y); // ci,ci,di,di

        tmp1 = _mm_mul_ps(x, yl); // ar*cr, ai*cr, br*dr, bi*dr

        x = _mm_shuffle_ps(x, x, 0xB1); // ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // ai*ci, ar*ci, bi*di, br*di

        z = _mm_addsub_ps(tmp1, tmp2); // complex multiply results

        dotProdVal = _mm_add_ps(dotProdVal, z);

        a += 2;
        b += 2;
    }

    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];

    _mm_store_ps((float*)dotProductVector, dotProdVal);

    dotProduct += (dotProductVector[0] + dotProductVector[1]);

    if (isodd) {
        dotProduct += input[num_points - 1] * taps[num_points - 1];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result,
                                                   const lv_32fc_t* input,
                                                   const lv_32fc_t* taps,
                                                   unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t accum_result[4];

    // val[0] holds the real parts, val[1] the imaginary parts
    float32x4x2_t a_val, b_val, c_val, accumulator;
    float32x4x2_t tmp_real, tmp_imag;
    accumulator.val[0] = vdupq_n_f32(0);
    accumulator.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r, a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r, b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // real*real and imag*imag terms of the real result
        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);

        // cross terms of the imaginary result
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);

        c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);

        accumulator.val[0] = vaddq_f32(accumulator.val[0], c_val.val[0]);
        accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]);

        a_ptr += 4;
        b_ptr += 4;
    }
    vst2q_f32((float*)accum_result, accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/
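
/* vld2q_f32 deinterleaves as it loads: even-indexed floats (real parts)
 * land in val[0], odd-indexed floats (imaginary parts) in val[1], so the
 * complex arithmetic runs on planar vectors without any extra shuffles. */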
#ifdef LV_HAVE_NEON

static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result,
                                                            const lv_32fc_t* input,
                                                            const lv_32fc_t* taps,
                                                            unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t accum_result[4];

    float32x4x2_t a_val, b_val, accumulator;
    float32x4x2_t tmp_imag;
    accumulator.val[0] = vdupq_n_f32(0);
    accumulator.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr);
        b_val = vld2q_f32((float*)b_ptr);
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // first multiply
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);

        // multiply-accumulate/-subtract completes the complex product
        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);

        accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
        accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);

        a_ptr += 4;
        b_ptr += 4;
    }
    vst2q_f32((float*)accum_result, accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/
#ifdef LV_HAVE_NEON

static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result,
                                                          const lv_32fc_t* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t accum_result[4];

    float32x4x2_t a_val, b_val, accumulator1, accumulator2;
    accumulator1.val[0] = vdupq_n_f32(0);
    accumulator1.val[1] = vdupq_n_f32(0);
    accumulator2.val[0] = vdupq_n_f32(0);
    accumulator2.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr);
        b_val = vld2q_f32((float*)b_ptr);
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // two accumulators remove inter-instruction data dependencies
        accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
        accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);

        a_ptr += 4;
        b_ptr += 4;
    }
    accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
    accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);

    vst2q_f32((float*)accum_result, accumulator1);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/
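
/* vmlaq_f32/vmlsq_f32 combine the multiply and the accumulate, so the
 * loop body needs only four multiply-accumulate instructions per four
 * points; the two accumulator pairs keep the real and imaginary update
 * chains independent until the single add after the loop. */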
#ifdef LV_HAVE_NEON

static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result,
                                                                const lv_32fc_t* input,
                                                                const lv_32fc_t* taps,
                                                                unsigned int num_points)
{
    // processes 8 complex points per loop iteration
    unsigned int quarter_points = num_points / 8;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t accum_result[4];

    float32x4x4_t a_val, b_val, accumulator1, accumulator2;
    float32x4x2_t reduced_accumulator;
    accumulator1.val[0] = vdupq_n_f32(0);
    accumulator1.val[1] = vdupq_n_f32(0);
    accumulator1.val[2] = vdupq_n_f32(0);
    accumulator1.val[3] = vdupq_n_f32(0);
    accumulator2.val[0] = vdupq_n_f32(0);
    accumulator2.val[1] = vdupq_n_f32(0);
    accumulator2.val[2] = vdupq_n_f32(0);
    accumulator2.val[3] = vdupq_n_f32(0);

    // 8 input regs + 8 accumulators use 16 NEON registers
    for (number = 0; number < quarter_points; ++number) {
        a_val = vld4q_f32((float*)a_ptr); // r0|r2|r4|r6, i0|i2|i4|i6, r1|r3|r5|r7, i1|i3|i5|i7
        b_val = vld4q_f32((float*)b_ptr);
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // even-indexed complex points
        accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
        // odd-indexed complex points
        accumulator1.val[2] = vmlaq_f32(accumulator1.val[2], a_val.val[2], b_val.val[2]);
        accumulator1.val[3] = vmlaq_f32(accumulator1.val[3], a_val.val[2], b_val.val[3]);

        accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);

        accumulator2.val[2] = vmlsq_f32(accumulator2.val[2], a_val.val[3], b_val.val[3]);
        accumulator2.val[3] = vmlaq_f32(accumulator2.val[3], a_val.val[3], b_val.val[2]);

        a_ptr += 8;
        b_ptr += 8;
    }
    // reduce the eight partial accumulators to a single re/im pair
    accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator1.val[2]);
    accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator1.val[3]);
    accumulator2.val[0] = vaddq_f32(accumulator2.val[0], accumulator2.val[2]);
    accumulator2.val[1] = vaddq_f32(accumulator2.val[1], accumulator2.val[3]);
    reduced_accumulator.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
    reduced_accumulator.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);

    vst2q_f32((float*)accum_result, reduced_accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 8; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /*LV_HAVE_NEON*/
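
/* vld4q_f32 fetches eight complex points per iteration, splitting them
 * into even-indexed (val[0]/val[1]) and odd-indexed (val[2]/val[3])
 * real/imag vectors; the 2x unroll gives eight independent accumulators
 * to hide the multiply-accumulate latency. */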
#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result,
                                                    const lv_32fc_t* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{
    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)a); // aligned load: ar,ai,br,bi, ...
        y = _mm256_load_ps((float*)b); // aligned load: cr,ci,dr,di, ...

        yl = _mm256_moveldup_ps(y); // cr,cr,dr,dr, ...
        yh = _mm256_movehdup_ps(y); // ci,ci,di,di, ...

        tmp1 = _mm256_mul_ps(x, yl); // ar*cr, ai*cr, br*dr, bi*dr, ...

        x = _mm256_shuffle_ps(x, x, 0xB1); // ai,ar,bi,br, ...

        tmp2 = _mm256_mul_ps(x, yh); // ai*ci, ar*ci, bi*di, br*di, ...

        z = _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, ...

        dotProdVal = _mm256_add_ps(dotProdVal, z); // accumulate the products

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_store_ps((float*)dotProductVector, dotProdVal);

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX*/
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result,
                                                        const lv_32fc_t* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points)
{
    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)a);
        y = _mm256_load_ps((float*)b);

        yl = _mm256_moveldup_ps(y);
        yh = _mm256_movehdup_ps(y);

        tmp1 = x; // keep the original ordering for the fused multiply

        x = _mm256_shuffle_ps(x, x, 0xB1);

        tmp2 = _mm256_mul_ps(x, yh);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); // tmp1*yl -/+ tmp2

        dotProdVal = _mm256_add_ps(dotProdVal, z);

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_store_ps((float*)dotProductVector, dotProdVal);

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /*LV_HAVE_AVX && LV_HAVE_FMA*/
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void volk_32fc_x2_dot_prod_32fc_rvv(lv_32fc_t* result,
                                                  const lv_32fc_t* input,
                                                  const lv_32fc_t* taps,
                                                  unsigned int num_points)
{
    vfloat32m2_t vsumr = __riscv_vfmv_v_f_f32m2(0, __riscv_vsetvlmax_e32m2());
    vfloat32m2_t vsumi = vsumr;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        // load each complex point as one 64-bit word, then split it into
        // real (low 32 bits) and imaginary (high 32 bits) vectors
        vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)input, vl);
        vuint64m4_t vb = __riscv_vle64_v_u64m4((const uint64_t*)taps, vl);
        vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl));
        vfloat32m2_t vbr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 0, vl));
        vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl));
        vfloat32m2_t vbi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 32, vl));
        vfloat32m2_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
        vfloat32m2_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
        vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl);
        vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl);
    }
    size_t vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t vr = RISCV_SHRINK2(vfadd, f, 32, vsumr);
    vfloat32m1_t vi = RISCV_SHRINK2(vfadd, f, 32, vsumi);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
#endif /*LV_HAVE_RVV*/
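
/* The narrowing shift vnsrl by 0 extracts the low 32 bits (real part) and
 * by 32 the high bits (imaginary part) of each 64-bit complex word, which
 * deinterleaves without segment loads; the tail-undisturbed vfadd_tu keeps
 * accumulator elements beyond vl intact on the final short stripe. */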
#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

/* Same recurrence as the RVV kernel above, but vlseg2e32 performs the
 * real/imag deinterleave as part of the load itself. */
static inline void volk_32fc_x2_dot_prod_32fc_rvvseg(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{
    vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vfloat32m4_t vsumi = vsumr;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)input, vl);
        vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)taps, vl);
        vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
        vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1);
        vfloat32m4_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
        vfloat32m4_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
        vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl);
        vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl);
    }
    size_t vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
    vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
#endif /*LV_HAVE_RVVSEG*/

#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/