61#ifndef INCLUDED_volk_32f_x2_add_32f_u_H
62#define INCLUDED_volk_32f_x2_add_32f_u_H
/*
 * Element-wise sum of two float arrays:
 *     cVector[i] = aVector[i] + bVector[i]   for i in [0, num_points).
 *
 * Sixteen floats are processed per iteration using the unaligned AVX-512
 * load/store intrinsics; the remaining num_points % 16 elements are added
 * one at a time.
 *
 * cVector    - output buffer, written for num_points floats
 * aVector    - first input, read for num_points floats
 * bVector    - second input, read for num_points floats
 * num_points - number of elements to add
 */
static inline void volk_32f_x2_add_32f_u_avx512f(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_loadu_ps(aPtr);
        bVal = _mm512_loadu_ps(bPtr);

        cVal = _mm512_add_ps(aVal, bVal);

        _mm512_storeu_ps(cPtr, cVal);

        /* Step every cursor past the 16 floats just handled; without this
         * the loop would keep re-adding the first vector. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail: the cursors already point at element sixteenthPoints * 16. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
108#include <immintrin.h>
/*
 * Element-wise sum of two float arrays:
 *     cVector[i] = aVector[i] + bVector[i]   for i in [0, num_points).
 *
 * Eight floats are processed per iteration using the unaligned AVX
 * load/store intrinsics; the remaining num_points % 8 elements are added
 * one at a time.
 *
 * cVector    - output buffer, written for num_points floats
 * aVector    - first input, read for num_points floats
 * bVector    - second input, read for num_points floats
 * num_points - number of elements to add
 */
static inline void volk_32f_x2_add_32f_u_avx(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        bVal = _mm256_loadu_ps(bPtr);

        cVal = _mm256_add_ps(aVal, bVal);

        _mm256_storeu_ps(cPtr, cVal);

        /* Step every cursor past the 8 floats just handled. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the cursors already point at element eighthPoints * 8. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
145#include <xmmintrin.h>
/*
 * Element-wise sum of two float arrays:
 *     cVector[i] = aVector[i] + bVector[i]   for i in [0, num_points).
 *
 * Four floats are processed per iteration using the unaligned SSE
 * load/store intrinsics; the remaining num_points % 4 elements are added
 * one at a time.
 *
 * cVector    - output buffer, written for num_points floats
 * aVector    - first input, read for num_points floats
 * bVector    - second input, read for num_points floats
 * num_points - number of elements to add
 */
static inline void volk_32f_x2_add_32f_u_sse(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;
    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        bVal = _mm_loadu_ps(bPtr);

        cVal = _mm_add_ps(aVal, bVal);

        _mm_storeu_ps(cPtr, cVal);

        /* Step every cursor past the 4 floats just handled. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the cursors already point at element quarterPoints * 4. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
182#ifdef LV_HAVE_GENERIC
/*
 * Portable element-wise sum of two float arrays:
 *     cVector[i] = aVector[i] + bVector[i]   for i in [0, num_points).
 *
 * Uses plain scalar C only, one element per iteration.
 *
 * cVector    - output buffer, written for num_points floats
 * aVector    - first input, read for num_points floats
 * bVector    - second input, read for num_points floats
 * num_points - number of elements to add
 */
static inline void volk_32f_x2_add_32f_generic(float* cVector,
                                               const float* aVector,
                                               const float* bVector,
                                               unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
202#ifndef INCLUDED_volk_32f_x2_add_32f_a_H
203#define INCLUDED_volk_32f_x2_add_32f_a_H
208#ifdef LV_HAVE_AVX512F
209#include <immintrin.h>
/*
 * Element-wise sum of two float arrays:
 *     cVector[i] = aVector[i] + bVector[i]   for i in [0, num_points).
 *
 * Sixteen floats are processed per iteration with _mm512_load_ps /
 * _mm512_store_ps. Those are the aligned forms of the intrinsics, so all
 * three buffers must start on a 64-byte boundary. The remaining
 * num_points % 16 elements are added one at a time.
 *
 * cVector    - output buffer, written for num_points floats
 * aVector    - first input, read for num_points floats
 * bVector    - second input, read for num_points floats
 * num_points - number of elements to add
 */
static inline void volk_32f_x2_add_32f_a_avx512f(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_load_ps(aPtr);
        bVal = _mm512_load_ps(bPtr);

        cVal = _mm512_add_ps(aVal, bVal);

        _mm512_store_ps(cPtr, cVal);

        /* Step every cursor past the 16 floats just handled; a 16-float
         * stride keeps each cursor on a 64-byte boundary. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail: the cursors already point at element sixteenthPoints * 16. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
249#include <immintrin.h>
/*
 * Element-wise sum of two float arrays:
 *     cVector[i] = aVector[i] + bVector[i]   for i in [0, num_points).
 *
 * Eight floats are processed per iteration with _mm256_load_ps /
 * _mm256_store_ps. Those are the aligned forms of the intrinsics, so all
 * three buffers must start on a 32-byte boundary. The remaining
 * num_points % 8 elements are added one at a time.
 *
 * cVector    - output buffer, written for num_points floats
 * aVector    - first input, read for num_points floats
 * bVector    - second input, read for num_points floats
 * num_points - number of elements to add
 */
static inline void volk_32f_x2_add_32f_a_avx(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        bVal = _mm256_load_ps(bPtr);

        cVal = _mm256_add_ps(aVal, bVal);

        _mm256_store_ps(cPtr, cVal);

        /* Step every cursor past the 8 floats just handled; an 8-float
         * stride keeps each cursor on a 32-byte boundary. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the cursors already point at element eighthPoints * 8. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
286#include <xmmintrin.h>
/*
 * Element-wise sum of two float arrays:
 *     cVector[i] = aVector[i] + bVector[i]   for i in [0, num_points).
 *
 * Four floats are processed per iteration with _mm_load_ps / _mm_store_ps.
 * Those are the aligned forms of the intrinsics, so all three buffers must
 * start on a 16-byte boundary. The remaining num_points % 4 elements are
 * added one at a time.
 *
 * cVector    - output buffer, written for num_points floats
 * aVector    - first input, read for num_points floats
 * bVector    - second input, read for num_points floats
 * num_points - number of elements to add
 */
static inline void volk_32f_x2_add_32f_a_sse(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;
    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        bVal = _mm_load_ps(bPtr);

        cVal = _mm_add_ps(aVal, bVal);

        _mm_store_ps(cPtr, cVal);

        /* Step every cursor past the 4 floats just handled; a 4-float
         * stride keeps each cursor on a 16-byte boundary. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the cursors already point at element quarterPoints * 4. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
/*
 * Element-wise sum of two float arrays:
 *     cVector[i] = aVector[i] + bVector[i]   for i in [0, num_points).
 *
 * Four floats are processed per iteration with the NEON vld1q_f32 /
 * vaddq_f32 / vst1q_f32 intrinsics; the remaining num_points % 4 elements
 * are added one at a time.
 *
 * cVector    - output buffer, written for num_points floats
 * aVector    - first input, read for num_points floats
 * bVector    - second input, read for num_points floats
 * num_points - number of elements to add
 */
static inline void volk_32f_x2_add_32f_u_neon(float* cVector,
                                              const float* aVector,
                                              const float* bVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    float32x4_t aVal, bVal, cVal;
    for (number = 0; number < quarterPoints; number++) {
        aVal = vld1q_f32(aPtr);
        bVal = vld1q_f32(bPtr);
        /* NOTE(review): the listing this was recovered from skips a few
         * lines here and separately references __VOLK_PREFETCH; prefetch
         * hints on aPtr/bPtr may belong at this point - confirm upstream. */

        cVal = vaddq_f32(aVal, bVal);

        vst1q_f32(cPtr, cVal);

        /* Step every cursor past the 4 floats just handled. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the cursors already point at element quarterPoints * 4. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
/*
 * Prototype only: the definition is provided outside this header.
 * NOTE(review): the name suggests a hand-written NEON assembly version of
 * the float add kernel, presumably with the same contract as the inline
 * kernels here (cVector[i] = aVector[i] + bVector[i] for num_points
 * elements) - confirm against the implementation.
 */
extern void volk_32f_x2_add_32f_a_neonasm(float* cVector,
                                          const float* aVector,
                                          const float* bVector,
                                          unsigned int num_points);
/*
 * Prototype only: the definition is provided outside this header.
 * NOTE(review): the name suggests a software-pipelined NEON assembly
 * version of the float add kernel, presumably with the same contract as the
 * inline kernels here (cVector[i] = aVector[i] + bVector[i] for num_points
 * elements) - confirm against the implementation.
 */
extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector,
                                               const float* aVector,
                                               const float* bVector,
                                               unsigned int num_points);
/*
 * Prototype only: the definition is provided outside this header.
 * The only caller visible in this file passes four arguments
 * (cVector, aVector, bVector, num_points), which fixes the parameter list.
 * NOTE(review): presumably the ORC-generated implementation of the float
 * add kernel - confirm against the ORC source.
 */
extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector,
                                           const float* aVector,
                                           const float* bVector,
                                           unsigned int num_points);
/*
 * Thin wrapper: forwards all four arguments unchanged to
 * volk_32f_x2_add_32f_a_orc_impl and does no work of its own.
 *
 * cVector    - output buffer, passed through
 * aVector    - first input, passed through
 * bVector    - second input, passed through
 * num_points - element count, passed through
 */
static inline void volk_32f_x2_add_32f_u_orc(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
395#include <riscv_vector.h>
/*
 * Element-wise sum of two float arrays using RISC-V Vector intrinsics:
 *     cVector[i] = aVector[i] + bVector[i]   for i in [0, num_points).
 *
 * Each pass asks __riscv_vsetvl_e32m8 how many 32-bit elements (vl) to
 * handle for the remaining count, loads that many floats from both inputs,
 * adds them, and stores the result. There is no separate scalar tail: the
 * final pass simply runs with whatever vl is returned for the leftover
 * count.
 *
 * cVector    - output buffer, written for num_points floats
 * aVector    - first input, read for num_points floats
 * bVector    - second input, read for num_points floats
 * num_points - number of elements to add
 */
static inline void volk_32f_x2_add_32f_rvv(float* cVector,
                                           const float* aVector,
                                           const float* bVector,
                                           unsigned int num_points)
{
    size_t n = num_points;
    /* The parameters themselves serve as cursors and advance by vl per pass. */
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m8(n);
        vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl);
        vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector, vl);
        __riscv_vse32(cVector, __riscv_vfadd(va, vb, vl), vl);
    }
}
static void volk_32f_x2_add_32f_u_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition volk_32f_x2_add_32f.h:325
static void volk_32f_x2_add_32f_u_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition volk_32f_x2_add_32f.h:147
static void volk_32f_x2_add_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition volk_32f_x2_add_32f.h:251
static void volk_32f_x2_add_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition volk_32f_x2_add_32f.h:110
static void volk_32f_x2_add_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition volk_32f_x2_add_32f.h:288
static void volk_32f_x2_add_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition volk_32f_x2_add_32f.h:184
#define __VOLK_PREFETCH(addr)
Definition volk_common.h:68