61#ifndef INCLUDED_volk_32fc_x2_add_32fc_u_H
62#define INCLUDED_volk_32fc_x2_add_32fc_u_H
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Element-wise complex addition, cVector[i] = aVector[i] + bVector[i],
 *        using AVX with unaligned loads and stores.
 *
 * \param cVector    output buffer; receives num_points sums
 * \param aVector    first input buffer, num_points elements
 * \param bVector    second input buffer, num_points elements
 * \param num_points number of complex elements to add
 */
static inline void volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector,
                                               const lv_32fc_t* aVector,
                                               const lv_32fc_t* bVector,
                                               unsigned int num_points)
{
    unsigned int number = 0;
    /* One __m256 carries 8 floats, i.e. 4 complex (re, im) pairs. */
    const unsigned int quarterPoints = num_points / 4;

    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < quarterPoints; number++) {
        aVal = _mm256_loadu_ps((const float*)aPtr);
        bVal = _mm256_loadu_ps((const float*)bPtr);

        /* Complex addition is component-wise, so a plain float add is enough. */
        cVal = _mm256_add_ps(aVal, bVal);

        _mm256_storeu_ps((float*)cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the up-to-3 elements that did not fill a whole register. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Element-wise complex addition, cVector[i] = aVector[i] + bVector[i],
 *        using AVX with aligned loads and stores.
 *
 * The aligned load/store intrinsics used here require all three buffers to
 * be 32-byte aligned.
 *
 * \param cVector    output buffer; receives num_points sums
 * \param aVector    first input buffer, num_points elements
 * \param bVector    second input buffer, num_points elements
 * \param num_points number of complex elements to add
 */
static inline void volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector,
                                               const lv_32fc_t* aVector,
                                               const lv_32fc_t* bVector,
                                               unsigned int num_points)
{
    unsigned int number = 0;
    /* One __m256 carries 8 floats, i.e. 4 complex (re, im) pairs. */
    const unsigned int quarterPoints = num_points / 4;

    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < quarterPoints; number++) {
        aVal = _mm256_load_ps((const float*)aPtr);
        bVal = _mm256_load_ps((const float*)bPtr);

        /* Complex addition is component-wise, so a plain float add is enough. */
        cVal = _mm256_add_ps(aVal, bVal);

        _mm256_store_ps((float*)cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the up-to-3 elements that did not fill a whole register. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*!
 * \brief Element-wise complex addition, cVector[i] = aVector[i] + bVector[i],
 *        using SSE with unaligned loads and stores.
 *
 * \param cVector    output buffer; receives num_points sums
 * \param aVector    first input buffer, num_points elements
 * \param bVector    second input buffer, num_points elements
 * \param num_points number of complex elements to add
 */
static inline void volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector,
                                               const lv_32fc_t* aVector,
                                               const lv_32fc_t* bVector,
                                               unsigned int num_points)
{
    unsigned int number = 0;
    /* One __m128 carries 4 floats, i.e. 2 complex (re, im) pairs. */
    const unsigned int halfPoints = num_points / 2;

    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;

    __m128 aVal, bVal, cVal;
    for (; number < halfPoints; number++) {
        aVal = _mm_loadu_ps((const float*)aPtr);
        bVal = _mm_loadu_ps((const float*)bPtr);

        /* Complex addition is component-wise, so a plain float add is enough. */
        cVal = _mm_add_ps(aVal, bVal);

        _mm_storeu_ps((float*)cPtr, cVal);

        aPtr += 2;
        bPtr += 2;
        cPtr += 2;
    }

    /* Scalar tail: at most one element is left when num_points is odd. */
    number = halfPoints * 2;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC

/*!
 * \brief Portable scalar complex addition, cVector[i] = aVector[i] + bVector[i].
 *
 * \param cVector    output buffer; receives num_points sums
 * \param aVector    first input buffer, num_points elements
 * \param bVector    second input buffer, num_points elements
 * \param num_points number of complex elements to add
 */
static inline void volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector,
                                                 const lv_32fc_t* aVector,
                                                 const lv_32fc_t* bVector,
                                                 unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*!
 * \brief Element-wise complex addition, cVector[i] = aVector[i] + bVector[i],
 *        using SSE with aligned loads and stores.
 *
 * The aligned load/store intrinsics used here require all three buffers to
 * be 16-byte aligned.
 *
 * \param cVector    output buffer; receives num_points sums
 * \param aVector    first input buffer, num_points elements
 * \param bVector    second input buffer, num_points elements
 * \param num_points number of complex elements to add
 */
static inline void volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector,
                                               const lv_32fc_t* aVector,
                                               const lv_32fc_t* bVector,
                                               unsigned int num_points)
{
    unsigned int number = 0;
    /* One __m128 carries 4 floats, i.e. 2 complex (re, im) pairs. */
    const unsigned int halfPoints = num_points / 2;

    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;

    __m128 aVal, bVal, cVal;
    for (; number < halfPoints; number++) {
        aVal = _mm_load_ps((const float*)aPtr);
        bVal = _mm_load_ps((const float*)bPtr);

        /* Complex addition is component-wise, so a plain float add is enough. */
        cVal = _mm_add_ps(aVal, bVal);

        _mm_store_ps((float*)cPtr, cVal);

        aPtr += 2;
        bPtr += 2;
        cPtr += 2;
    }

    /* Scalar tail: at most one element is left when num_points is odd. */
    number = halfPoints * 2;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief Element-wise complex addition, cVector[i] = aVector[i] + bVector[i],
 *        using NEON 128-bit vectors.
 *
 * \param cVector    output buffer; receives num_points sums
 * \param aVector    first input buffer, num_points elements
 * \param bVector    second input buffer, num_points elements
 * \param num_points number of complex elements to add
 */
static inline void volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector,
                                                const lv_32fc_t* aVector,
                                                const lv_32fc_t* bVector,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    /* One float32x4_t carries 4 floats, i.e. 2 complex (re, im) pairs. */
    const unsigned int halfPoints = num_points / 2;

    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;

    float32x4_t aVal, bVal, cVal;
    for (number = 0; number < halfPoints; number++) {
        aVal = vld1q_f32((const float32_t*)(aPtr));
        bVal = vld1q_f32((const float32_t*)(bPtr));
        /* Hint the next iteration's inputs; has no effect on the result. */
        __VOLK_PREFETCH(aPtr + 2);
        __VOLK_PREFETCH(bPtr + 2);

        /* Complex addition is component-wise, so a plain float add is enough. */
        cVal = vaddq_f32(aVal, bVal);

        vst1q_f32((float*)(cPtr), cVal);

        aPtr += 2;
        bPtr += 2;
        cPtr += 2;
    }

    /* Scalar tail: at most one element is left when num_points is odd. */
    number = halfPoints * 2;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/*!
 * \brief Element-wise complex addition, cVector[i] = aVector[i] + bVector[i],
 *        using the RISC-V vector extension.
 *
 * The complex buffers are processed as flat float arrays of 2 * num_points
 * entries, since adding complex numbers is a component-wise float add.
 *
 * \param cVector    output buffer; receives num_points sums
 * \param aVector    first input buffer, num_points elements
 * \param bVector    second input buffer, num_points elements
 * \param num_points number of complex elements to add
 */
static inline void volk_32fc_x2_add_32fc_rvv(lv_32fc_t* cVector,
                                             const lv_32fc_t* aVector,
                                             const lv_32fc_t* bVector,
                                             unsigned int num_points)
{
    const float* ina = (const float*)aVector;
    const float* inb = (const float*)bVector;
    float* out = (float*)cVector;
    /* Widen before doubling so the float count cannot wrap in unsigned int. */
    size_t n = (size_t)num_points * 2;
    /* Each pass handles vl floats, where vl is what vsetvl grants for the
     * remaining count; the loop ends once nothing is left. */
    for (size_t vl; n > 0; n -= vl, ina += vl, inb += vl, out += vl) {
        vl = __riscv_vsetvl_e32m8(n);
        vfloat32m8_t va = __riscv_vle32_v_f32m8(ina, vl);
        vfloat32m8_t vb = __riscv_vle32_v_f32m8(inb, vl);
        __riscv_vse32(out, __riscv_vfadd(va, vb, vl), vl);
    }
}
#endif /* LV_HAVE_RVV */
/*
 * Cross-reference index left over from the generated source listing.
 * Each entry is a declaration followed by the location of its definition.
 *
 * static void volk_32fc_x2_add_32fc_a_sse(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
 *     Definition: volk_32fc_x2_add_32fc.h:202
 * static void volk_32fc_x2_add_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
 *     Definition: volk_32fc_x2_add_32fc.h:182
 * static void volk_32fc_x2_add_32fc_u_sse(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
 *     Definition: volk_32fc_x2_add_32fc.h:145
 * static void volk_32fc_x2_add_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
 *     Definition: volk_32fc_x2_add_32fc.h:67
 * static void volk_32fc_x2_add_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
 *     Definition: volk_32fc_x2_add_32fc.h:106
 * static void volk_32fc_x2_add_32fc_u_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
 *     Definition: volk_32fc_x2_add_32fc.h:239
 * #define __VOLK_PREFETCH(addr)
 *     Definition: volk_common.h:68
 * float complex lv_32fc_t
 *     Definition: volk_complex.h:74
 */