45#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
46#define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
57 unsigned int num_points)
60 static const int N_UNROLL = 4;
68 unsigned n = (num_points / N_UNROLL) * N_UNROLL;
70 for (
i = 0;
i < n;
i += N_UNROLL) {
71 acc0 += taps[
i + 0] * (float)input[
i + 0];
72 acc1 += taps[
i + 1] * (float)input[
i + 1];
73 acc2 += taps[
i + 2] * (float)input[
i + 2];
74 acc3 += taps[
i + 3] * (float)input[
i + 3];
77 for (;
i < num_points;
i++) {
78 acc0 += taps[
i] * (float)input[
i];
81 *result = acc0 + acc1 + acc2 + acc3;
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result,
                                                    const short* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{
    /* NEON kernel: 4 samples per iteration. The real int16 input is widened
     * to float and multiplied against the de-interleaved re/im tap planes. */
    unsigned ii;
    unsigned quarter_points = num_points / 4;
    lv_32fc_t* tapsPtr = (lv_32fc_t*)taps;
    short* inputPtr = (short*)input;
    lv_32fc_t accumulator_vec[4];

    float32x4x2_t tapsVal, accumulator_val;
    int16x4_t input16;
    int32x4_t input32;
    float32x4_t input_float, prod_re, prod_im;

    accumulator_val.val[0] = vdupq_n_f32(0.0); /* real accumulator */
    accumulator_val.val[1] = vdupq_n_f32(0.0); /* imag accumulator */

    for (ii = 0; ii < quarter_points; ++ii) {
        /* vld2q de-interleaves taps into val[0]=re, val[1]=im. */
        tapsVal = vld2q_f32((float*)tapsPtr);
        input16 = vld1_s16(inputPtr);
        /* Widen int16 -> int32 -> float. */
        input32 = vmovl_s16(input16);
        input_float = vcvtq_f32_s32(input32);

        prod_re = vmulq_f32(input_float, tapsVal.val[0]);
        prod_im = vmulq_f32(input_float, tapsVal.val[1]);

        accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
        accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);

        tapsPtr += 4;
        inputPtr += 4;
    }

    /* Re-interleave the lane sums back to complex and reduce 4 -> 1. */
    vst2q_f32((float*)accumulator_vec, accumulator_val);
    accumulator_vec[0] += accumulator_vec[1];
    accumulator_vec[2] += accumulator_vec[3];
    accumulator_vec[0] += accumulator_vec[2];

    /* Scalar tail for the remaining (num_points % 4) samples. */
    for (ii = quarter_points * 4; ii < num_points; ++ii) {
        accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
    }

    *result = accumulator_vec[0];
}
#endif /* LV_HAVE_NEON */
#if LV_HAVE_SSE && LV_HAVE_MMX

#include <mmintrin.h>
#include <xmmintrin.h>

static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result,
                                                     const short* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{
    /* Unaligned SSE kernel: 8 samples (= 16 tap floats) per iteration. */
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    const float* bPtr = (float*)taps; /* taps viewed as interleaved re/im */

    __m64 m0, m1;
    __m128 f0, f1, f2, f3;
    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < eighthPoints; number++) {

        m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
        m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
        f0 = _mm_cvtpi16_ps(m0);
        f1 = _mm_cvtpi16_ps(m0);
        f2 = _mm_cvtpi16_ps(m1);
        f3 = _mm_cvtpi16_ps(m1);

        /* Duplicate each sample (s,s) so it multiplies both the real and
         * imaginary parts of the interleaved taps. */
        a0Val = _mm_unpacklo_ps(f0, f1);
        a1Val = _mm_unpackhi_ps(f0, f1);
        a2Val = _mm_unpacklo_ps(f2, f3);
        a3Val = _mm_unpackhi_ps(f2, f3);

        b0Val = _mm_loadu_ps(bPtr);
        b1Val = _mm_loadu_ps(bPtr + 4);
        b2Val = _mm_loadu_ps(bPtr + 8);
        b3Val = _mm_loadu_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

        aPtr += 8;  /* 8 short samples */
        bPtr += 16; /* 8 complex taps = 16 floats */
    }

    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];

    _mm_store_ps(dotProductVector, dotProdVal0);

    /* Vector holds (re, im, re, im) partial sums. */
    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);

    /* Scalar tail for the remaining (num_points % 8) samples. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}

#endif /* LV_HAVE_SSE && LV_HAVE_MMX */
#if LV_HAVE_AVX2 && LV_HAVE_FMA

#include <immintrin.h>

static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
                                                          const short* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points)
{
    /* Unaligned AVX2+FMA kernel: 16 samples (= 32 tap floats) per iteration. */
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    const float* bPtr = (float*)taps; /* taps viewed as interleaved re/im */

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        m0 = _mm_loadu_si128((__m128i const*)aPtr);
        m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));

        /* Widen int16 -> int32 -> float. */
        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        /* Duplicate each sample (s,s) so it multiplies both the real and
         * imaginary parts of the interleaved taps. */
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        /* unpack{lo,hi} work per 128-bit lane; permute restores sample order. */
        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_loadu_ps(bPtr);
        b1Val = _mm256_loadu_ps(bPtr + 8);
        b2Val = _mm256_loadu_ps(bPtr + 16);
        b3Val = _mm256_loadu_ps(bPtr + 24);

        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);

        aPtr += 16; /* 16 short samples */
        bPtr += 32; /* 16 complex taps = 32 floats */
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector, dotProdVal0);

    /* Vector holds (re, im) pairs of partial sums. */
    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    /* Scalar tail for the remaining (num_points % 16) samples. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#if LV_HAVE_AVX2

#include <immintrin.h>

static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{
    /* Unaligned AVX2 kernel (mul+add, no FMA): 16 samples per iteration. */
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    const float* bPtr = (float*)taps; /* taps viewed as interleaved re/im */

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;
    __m256 c0Val, c1Val, c2Val, c3Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        m0 = _mm_loadu_si128((__m128i const*)aPtr);
        m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));

        /* Widen int16 -> int32 -> float. */
        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        /* Duplicate each sample (s,s) so it multiplies both the real and
         * imaginary parts of the interleaved taps. */
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        /* unpack{lo,hi} work per 128-bit lane; permute restores sample order. */
        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_loadu_ps(bPtr);
        b1Val = _mm256_loadu_ps(bPtr + 8);
        b2Val = _mm256_loadu_ps(bPtr + 16);
        b3Val = _mm256_loadu_ps(bPtr + 24);

        c0Val = _mm256_mul_ps(a0Val, b0Val);
        c1Val = _mm256_mul_ps(a1Val, b1Val);
        c2Val = _mm256_mul_ps(a2Val, b2Val);
        c3Val = _mm256_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);

        aPtr += 16; /* 16 short samples */
        bPtr += 32; /* 16 complex taps = 32 floats */
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector, dotProdVal0);

    /* Vector holds (re, im) pairs of partial sums. */
    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    /* Scalar tail for the remaining (num_points % 16) samples. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}

#endif /* LV_HAVE_AVX2 */
#if LV_HAVE_SSE && LV_HAVE_MMX

#include <mmintrin.h>
#include <xmmintrin.h>

static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result,
                                                     const short* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{
    /* Aligned SSE kernel: identical to the unaligned variant except the
     * tap loads use _mm_load_ps (taps must be 16-byte aligned). */
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    const float* bPtr = (float*)taps; /* taps viewed as interleaved re/im */

    __m64 m0, m1;
    __m128 f0, f1, f2, f3;
    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < eighthPoints; number++) {

        m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
        m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
        f0 = _mm_cvtpi16_ps(m0);
        f1 = _mm_cvtpi16_ps(m0);
        f2 = _mm_cvtpi16_ps(m1);
        f3 = _mm_cvtpi16_ps(m1);

        /* Duplicate each sample (s,s) so it multiplies both the real and
         * imaginary parts of the interleaved taps. */
        a0Val = _mm_unpacklo_ps(f0, f1);
        a1Val = _mm_unpackhi_ps(f0, f1);
        a2Val = _mm_unpacklo_ps(f2, f3);
        a3Val = _mm_unpackhi_ps(f2, f3);

        b0Val = _mm_load_ps(bPtr);
        b1Val = _mm_load_ps(bPtr + 4);
        b2Val = _mm_load_ps(bPtr + 8);
        b3Val = _mm_load_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

        aPtr += 8;  /* 8 short samples */
        bPtr += 16; /* 8 complex taps = 16 floats */
    }

    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];

    _mm_store_ps(dotProductVector, dotProdVal0);

    /* Vector holds (re, im, re, im) partial sums. */
    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);

    /* Scalar tail for the remaining (num_points % 8) samples. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}

#endif /* LV_HAVE_SSE && LV_HAVE_MMX */
#if LV_HAVE_AVX2

#include <immintrin.h>

static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{
    /* Aligned AVX2 kernel: identical to the unaligned variant except the
     * loads use aligned forms (input and taps must be 32-byte aligned). */
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    const float* bPtr = (float*)taps; /* taps viewed as interleaved re/im */

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;
    __m256 c0Val, c1Val, c2Val, c3Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        m0 = _mm_load_si128((__m128i const*)aPtr);
        m1 = _mm_load_si128((__m128i const*)(aPtr + 8));

        /* Widen int16 -> int32 -> float. */
        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        /* Duplicate each sample (s,s) so it multiplies both the real and
         * imaginary parts of the interleaved taps. */
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        /* unpack{lo,hi} work per 128-bit lane; permute restores sample order. */
        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_load_ps(bPtr);
        b1Val = _mm256_load_ps(bPtr + 8);
        b2Val = _mm256_load_ps(bPtr + 16);
        b3Val = _mm256_load_ps(bPtr + 24);

        c0Val = _mm256_mul_ps(a0Val, b0Val);
        c1Val = _mm256_mul_ps(a1Val, b1Val);
        c2Val = _mm256_mul_ps(a2Val, b2Val);
        c3Val = _mm256_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);

        aPtr += 16; /* 16 short samples */
        bPtr += 32; /* 16 complex taps = 32 floats */
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector, dotProdVal0);

    /* Vector holds (re, im) pairs of partial sums. */
    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    /* Scalar tail for the remaining (num_points % 16) samples. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}

#endif /* LV_HAVE_AVX2 */
#if LV_HAVE_AVX2 && LV_HAVE_FMA

#include <immintrin.h>

static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
                                                          const short* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points)
{
    /* Aligned AVX2+FMA kernel: 16 samples (= 32 tap floats) per iteration;
     * input and taps must be 32-byte aligned. */
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    const float* bPtr = (float*)taps; /* taps viewed as interleaved re/im */

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        m0 = _mm_load_si128((__m128i const*)aPtr);
        m1 = _mm_load_si128((__m128i const*)(aPtr + 8));

        /* Widen int16 -> int32 -> float. */
        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        /* Duplicate each sample (s,s) so it multiplies both the real and
         * imaginary parts of the interleaved taps. */
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        /* unpack{lo,hi} work per 128-bit lane; permute restores sample order. */
        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_load_ps(bPtr);
        b1Val = _mm256_load_ps(bPtr + 8);
        b2Val = _mm256_load_ps(bPtr + 16);
        b3Val = _mm256_load_ps(bPtr + 24);

        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);

        aPtr += 16; /* 16 short samples */
        bPtr += 32; /* 16 complex taps = 32 floats */
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector, dotProdVal0);

    /* Vector holds (re, im) pairs of partial sums. */
    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    /* Scalar tail for the remaining (num_points % 16) samples. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void volk_16i_32fc_dot_prod_32fc_rvv(lv_32fc_t* result,
                                                   const short* input,
                                                   const lv_32fc_t* taps,
                                                   unsigned int num_points)
{
    /* RVV kernel: strip-mined loop, vl complex samples per iteration. */
    vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vfloat32m4_t vsumi = vsumr;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        /* Load complex taps as 64-bit words; narrowing shifts split them
         * into real (low 32 bits) and imaginary (high 32 bits) planes. */
        vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)taps, vl);
        vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
        vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
        /* Widen the int16 input to float. */
        vfloat32m4_t v =
            __riscv_vfwcvt_f(__riscv_vle16_v_i16m2((const int16_t*)input, vl), vl);
        /* Tail-undisturbed FMAs keep lanes beyond vl intact across iterations. */
        vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl);
        vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl);
    }
    /* Shrink the m4 accumulators to m1, then reduce to scalars. */
    size_t vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
    vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
#endif /* LV_HAVE_RVV */
#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void volk_16i_32fc_dot_prod_32fc_rvvseg(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{
    /* RVV kernel using segmented loads to de-interleave the complex taps. */
    vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vfloat32m4_t vsumi = vsumr;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        /* vlseg2 splits the interleaved taps into re (field 0) / im (field 1). */
        vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)taps, vl);
        vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0);
        vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1);
        /* Widen the int16 input to float. */
        vfloat32m4_t v =
            __riscv_vfwcvt_f(__riscv_vle16_v_i16m2((const int16_t*)input, vl), vl);
        /* Tail-undisturbed FMAs keep lanes beyond vl intact across iterations. */
        vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl);
        vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl);
    }
    /* Shrink the m4 accumulators to m1, then reduce to scalars. */
    size_t vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
    vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
#endif /* LV_HAVE_RVVSEG */
#endif /* INCLUDED_volk_16i_32fc_dot_prod_32fc_H */

/* NOTE(review): the text below is documentation-generator residue
 * (Doxygen cross-reference links), not source code. It is preserved
 * here as a comment so the header remains compilable.
 *
 * static void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t *result, const short *input, const lv_32fc_t *taps, unsigned int num_points)
 * Definition volk_16i_32fc_dot_prod_32fc.h:88
 * static void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t *result, const short *input, const lv_32fc_t *taps, unsigned int num_points)
 * Definition volk_16i_32fc_dot_prod_32fc.h:54
 * #define __VOLK_ATTR_ALIGNED(x)
 * Definition volk_common.h:62
 * #define lv_cmake(r, i)
 * Definition volk_complex.h:77
 * float complex lv_32fc_t
 * Definition volk_complex.h:74
 * for i
 * Definition volk_config_fixed.tmpl.h:13
 * #define RISCV_SHRINK4(op, T, S, v)
 * Definition volk_rvv_intrinsics.h:24
 */