57#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
58#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * Unaligned AVX2+FMA kernel: c[i] = a[i] * b[i] for complex-float vectors.
 * Processes 4 complex samples (8 floats) per iteration; the scalar tail
 * loop handles the remaining num_points % 4 samples.
 */
static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector,
                                                         const lv_32fc_t* aVector,
                                                         const lv_32fc_t* bVector,
                                                         unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < quarterPoints; number++) {
        const __m256 x = _mm256_loadu_ps((float*)a); // ar0 ai0 ar1 ai1 ...
        const __m256 y = _mm256_loadu_ps((float*)b); // br0 bi0 br1 bi1 ...

        const __m256 yl = _mm256_moveldup_ps(y); // br0 br0 br1 br1 ...
        const __m256 yh = _mm256_movehdup_ps(y); // bi0 bi0 bi1 bi1 ...

        const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // ai0 ar0 ai1 ar1 ...

        const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // ai*bi, ar*bi ...

        // (ar*br - ai*bi, ar*bi + ai*br, ...): fused multiply add/sub
        const __m256 z = _mm256_fmaddsub_ps(x, yl, tmp2);

        _mm256_storeu_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    // scalar tail for the leftover 0..3 samples
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *c++ = (*a++) * (*b++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
119#include <immintrin.h>
125 unsigned int num_points)
127 unsigned int number = 0;
128 const unsigned int quarterPoints = num_points / 4;
135 for (; number < quarterPoints; number++) {
141 _mm256_storeu_ps((
float*)c, z);
148 number = quarterPoints * 4;
150 for (; number < num_points; number++) {
151 *c++ = (*a++) * (*b++);
158#include <pmmintrin.h>
164 unsigned int num_points)
166 unsigned int number = 0;
167 const unsigned int halfPoints = num_points / 2;
174 for (; number < halfPoints; number++) {
175 x = _mm_loadu_ps((
float*)a);
176 y = _mm_loadu_ps((
float*)b);
178 _mm_storeu_ps((
float*)c, z);
185 if ((num_points % 2) != 0) {
#ifdef LV_HAVE_GENERIC

/*!
 * Scalar reference implementation: element-wise complex multiply,
 * c[i] = a[i] * b[i] for i in [0, num_points).
 */
static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
212#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
213#define INCLUDED_volk_32fc_x2_multiply_32fc_a_H
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * Aligned AVX2+FMA kernel: c[i] = a[i] * b[i] for complex-float vectors.
 * Identical to the unaligned variant but uses aligned load/store, so all
 * three pointers must be 32-byte aligned.
 */
static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector,
                                                         const lv_32fc_t* aVector,
                                                         const lv_32fc_t* bVector,
                                                         unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < quarterPoints; number++) {
        const __m256 x = _mm256_load_ps((float*)a); // ar0 ai0 ar1 ai1 ...
        const __m256 y = _mm256_load_ps((float*)b); // br0 bi0 br1 bi1 ...

        const __m256 yl = _mm256_moveldup_ps(y); // br0 br0 br1 br1 ...
        const __m256 yh = _mm256_movehdup_ps(y); // bi0 bi0 bi1 bi1 ...

        const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // ai0 ar0 ai1 ar1 ...

        const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // ai*bi, ar*bi ...

        // (ar*br - ai*bi, ar*bi + ai*br, ...): fused multiply add/sub
        const __m256 z = _mm256_fmaddsub_ps(x, yl, tmp2);

        _mm256_store_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    // scalar tail for the leftover 0..3 samples
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *c++ = (*a++) * (*b++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
274#include <immintrin.h>
280 unsigned int num_points)
282 unsigned int number = 0;
283 const unsigned int quarterPoints = num_points / 4;
290 for (; number < quarterPoints; number++) {
291 x = _mm256_load_ps((
float*)a);
292 y = _mm256_load_ps((
float*)b);
294 _mm256_store_ps((
float*)c, z);
301 number = quarterPoints * 4;
303 for (; number < num_points; number++) {
304 *c++ = (*a++) * (*b++);
310#include <pmmintrin.h>
316 unsigned int num_points)
318 unsigned int number = 0;
319 const unsigned int halfPoints = num_points / 2;
326 for (; number < halfPoints; number++) {
327 x = _mm_load_ps((
float*)a);
328 y = _mm_load_ps((
float*)b);
330 _mm_store_ps((
float*)c, z);
337 if ((num_points % 2) != 0) {
static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector,
                                                   const lv_32fc_t* aVector,
                                                   const lv_32fc_t* bVector,
                                                   unsigned int num_points)
{
    /* NEON kernel: deinterleaved complex multiply, 4 samples per iteration.
     * vld2q_f32 splits interleaved (re,im) pairs into val[0]=real lanes,
     * val[1]=imag lanes. */
    const lv_32fc_t* a_ptr = aVector;
    const lv_32fc_t* b_ptr = bVector;
    unsigned int quarter_points = num_points / 4;
    float32x4x2_t a_val, b_val, c_val;
    float32x4x2_t tmp_real, tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // val[0]=ar..., val[1]=ai...
        b_val = vld2q_f32((float*)b_ptr);

        // real part: ar*br - ai*bi
        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);

        // imag part: ar*bi + ai*br
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);

        c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
        vst2q_f32((float*)cVector, c_val); // re-interleave on store

        a_ptr += 4;
        b_ptr += 4;
        cVector += 4;
    }

    // scalar tail for the leftover 0..3 samples
    for (number = quarter_points * 4; number < num_points; number++) {
        *cVector++ = (*a_ptr++) * (*b_ptr++);
    }
}
static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector,
                                                            const lv_32fc_t* aVector,
                                                            const lv_32fc_t* bVector,
                                                            unsigned int num_points)
{
    /* NEON kernel variant using fused multiply-accumulate (vmlaq/vmlsq)
     * instead of separate multiplies plus add/sub. */
    const lv_32fc_t* a_ptr = aVector;
    const lv_32fc_t* b_ptr = bVector;
    unsigned int quarter_points = num_points / 4;
    float32x4x2_t a_val, b_val;
    float32x4x2_t tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // val[0]=ar..., val[1]=ai...
        b_val = vld2q_f32((float*)b_ptr);

        // start with ai*br (imag) and ar*br (real)
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);

        // imag += ar*bi ; real -= ai*bi
        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);

        vst2q_f32((float*)cVector, tmp_imag); // re-interleave on store

        a_ptr += 4;
        b_ptr += 4;
        cVector += 4;
    }

    // scalar tail for the leftover 0..3 samples
    for (number = quarter_points * 4; number < num_points; number++) {
        *cVector++ = (*a_ptr++) * (*b_ptr++);
    }
}
439extern void volk_32fc_x2_multiply_32fc_a_neonasm(
lv_32fc_t* cVector,
442 unsigned int num_points);
448extern void volk_32fc_x2_multiply_32fc_a_orc_impl(
lv_32fc_t* cVector,
453static inline void volk_32fc_x2_multiply_32fc_u_orc(
lv_32fc_t* cVector,
456 unsigned int num_points)
458 volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
#include <riscv_vector.h>

/*!
 * RISC-V Vector kernel without segmented loads: each complex sample is
 * loaded as one 64-bit word; real/imag lanes are split with narrowing
 * shifts, multiplied, then re-packed with widening adds.
 */
static inline void volk_32fc_x2_multiply_32fc_rvv(lv_32fc_t* cVector,
                                                  const lv_32fc_t* aVector,
                                                  const lv_32fc_t* bVector,
                                                  unsigned int num_points)
{
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vuint64m8_t va = __riscv_vle64_v_u64m8((const uint64_t*)aVector, vl);
        vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)bVector, vl);
        // low 32 bits = real part, high 32 bits = imaginary part
        vfloat32m4_t var = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl));
        vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl));
        vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl));
        vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl));
        // vr = ar*br - ai*bi ; vi = ar*bi + ai*br
        vfloat32m4_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
        vfloat32m4_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
        vuint32m4_t vru = __riscv_vreinterpret_u32m4(vr);
        vuint32m4_t viu = __riscv_vreinterpret_u32m4(vi);
        // interleave back into (re, im) 64-bit words: lo=re, hi=im
        vuint64m8_t v =
            __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
        __riscv_vse64((uint64_t*)cVector, v, vl);
    }
}
#include <riscv_vector.h>

/*!
 * RISC-V Vector kernel using segmented loads/stores: vlseg2e32 directly
 * deinterleaves (re, im) pairs into two register groups, avoiding the
 * shift-based lane splitting of the plain rvv variant.
 */
static inline void volk_32fc_x2_multiply_32fc_rvvseg(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector,
                                                     unsigned int num_points)
{
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)aVector, vl);
        vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)bVector, vl);
        vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
        vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1);
        // vr = ar*br - ai*bi ; vi = ar*bi + ai*br
        vfloat32m4_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
        vfloat32m4_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
        __riscv_vsseg2e32_v_f32m4x2(
            (float*)cVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl);
    }
}
static void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition volk_32fc_x2_multiply_32fc.h:313
static void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition volk_32fc_x2_multiply_32fc.h:122
static void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition volk_32fc_x2_multiply_32fc.h:194
static void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition volk_32fc_x2_multiply_32fc.h:396
static void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition volk_32fc_x2_multiply_32fc.h:347
static void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition volk_32fc_x2_multiply_32fc.h:277
static void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition volk_32fc_x2_multiply_32fc.h:161
static __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
Definition volk_avx_intrinsics.h:57
#define __VOLK_PREFETCH(addr)
Definition volk_common.h:68
float complex lv_32fc_t
Definition volk_complex.h:74
static __m128 _mm_complexmul_ps(__m128 x, __m128 y)
Definition volk_sse3_intrinsics.h:19