Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_x2_add_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
60
61#ifndef INCLUDED_volk_32f_x2_add_32f_u_H
62#define INCLUDED_volk_32f_x2_add_32f_u_H
63
64#include <inttypes.h>
65#include <stdio.h>
66
67#ifdef LV_HAVE_AVX512F
68#include <immintrin.h>
69
static inline void volk_32f_x2_add_32f_u_avx512f(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    /* Unaligned AVX-512 kernel: cVector[i] = aVector[i] + bVector[i]. */
    const unsigned int vecIters = num_points / 16;

    const float* inA = aVector;
    const float* inB = bVector;
    float* out = cVector;

    /* Sixteen floats per iteration using unaligned 512-bit loads/stores. */
    for (unsigned int i = 0; i < vecIters; i++) {
        const __m512 va = _mm512_loadu_ps(inA);
        const __m512 vb = _mm512_loadu_ps(inB);
        _mm512_storeu_ps(out, _mm512_add_ps(va, vb));

        inA += 16;
        inB += 16;
        out += 16;
    }

    /* Scalar tail: handle the remaining 0-15 elements. */
    for (unsigned int i = vecIters * 16; i < num_points; i++) {
        *out++ = (*inA++) + (*inB++);
    }
}
103
104#endif /* LV_HAVE_AVX512F */
105
106
107#ifdef LV_HAVE_AVX
108#include <immintrin.h>
109
static inline void volk_32f_x2_add_32f_u_avx(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    /* Unaligned AVX kernel: cVector[i] = aVector[i] + bVector[i]. */
    const unsigned int vecIters = num_points / 8;

    const float* inA = aVector;
    const float* inB = bVector;
    float* out = cVector;

    /* Eight floats per iteration using unaligned 256-bit loads/stores. */
    for (unsigned int i = 0; i < vecIters; i++) {
        const __m256 va = _mm256_loadu_ps(inA);
        const __m256 vb = _mm256_loadu_ps(inB);
        _mm256_storeu_ps(out, _mm256_add_ps(va, vb));

        inA += 8;
        inB += 8;
        out += 8;
    }

    /* Scalar tail: handle the remaining 0-7 elements. */
    for (unsigned int i = vecIters * 8; i < num_points; i++) {
        *out++ = (*inA++) + (*inB++);
    }
}
141#endif /* LV_HAVE_AVX */
142
143
144#ifdef LV_HAVE_SSE
145#include <xmmintrin.h>
146
static inline void volk_32f_x2_add_32f_u_sse(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    /* Unaligned SSE kernel: cVector[i] = aVector[i] + bVector[i]. */
    const unsigned int vecIters = num_points / 4;

    const float* inA = aVector;
    const float* inB = bVector;
    float* out = cVector;

    /* Four floats per iteration using unaligned 128-bit loads/stores. */
    for (unsigned int i = 0; i < vecIters; i++) {
        const __m128 va = _mm_loadu_ps(inA);
        const __m128 vb = _mm_loadu_ps(inB);
        _mm_storeu_ps(out, _mm_add_ps(va, vb));

        inA += 4;
        inB += 4;
        out += 4;
    }

    /* Scalar tail: handle the remaining 0-3 elements. */
    for (unsigned int i = vecIters * 4; i < num_points; i++) {
        *out++ = (*inA++) + (*inB++);
    }
}
179#endif /* LV_HAVE_SSE */
180
181
182#ifdef LV_HAVE_GENERIC
183
static inline void volk_32f_x2_add_32f_generic(float* cVector,
                                               const float* aVector,
                                               const float* bVector,
                                               unsigned int num_points)
{
    /* Portable scalar fallback: cVector[i] = aVector[i] + bVector[i]. */
    for (unsigned int i = 0; i < num_points; i++) {
        cVector[i] = aVector[i] + bVector[i];
    }
}
198#endif /* LV_HAVE_GENERIC */
199
200
201#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */
202#ifndef INCLUDED_volk_32f_x2_add_32f_a_H
203#define INCLUDED_volk_32f_x2_add_32f_a_H
204
205#include <inttypes.h>
206#include <stdio.h>
207
208#ifdef LV_HAVE_AVX512F
209#include <immintrin.h>
210
static inline void volk_32f_x2_add_32f_a_avx512f(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    /* Aligned AVX-512 kernel: cVector[i] = aVector[i] + bVector[i].
     * All three buffers must be 64-byte aligned. */
    const unsigned int vecIters = num_points / 16;

    const float* inA = aVector;
    const float* inB = bVector;
    float* out = cVector;

    /* Sixteen floats per iteration using aligned 512-bit loads/stores. */
    for (unsigned int i = 0; i < vecIters; i++) {
        const __m512 va = _mm512_load_ps(inA);
        const __m512 vb = _mm512_load_ps(inB);
        _mm512_store_ps(out, _mm512_add_ps(va, vb));

        inA += 16;
        inB += 16;
        out += 16;
    }

    /* Scalar tail: handle the remaining 0-15 elements. */
    for (unsigned int i = vecIters * 16; i < num_points; i++) {
        *out++ = (*inA++) + (*inB++);
    }
}
244
245#endif /* LV_HAVE_AVX512F */
246
247
248#ifdef LV_HAVE_AVX
249#include <immintrin.h>
250
static inline void volk_32f_x2_add_32f_a_avx(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    /* Aligned AVX kernel: cVector[i] = aVector[i] + bVector[i].
     * All three buffers must be 32-byte aligned. */
    const unsigned int vecIters = num_points / 8;

    const float* inA = aVector;
    const float* inB = bVector;
    float* out = cVector;

    /* Eight floats per iteration using aligned 256-bit loads/stores. */
    for (unsigned int i = 0; i < vecIters; i++) {
        const __m256 va = _mm256_load_ps(inA);
        const __m256 vb = _mm256_load_ps(inB);
        _mm256_store_ps(out, _mm256_add_ps(va, vb));

        inA += 8;
        inB += 8;
        out += 8;
    }

    /* Scalar tail: handle the remaining 0-7 elements. */
    for (unsigned int i = vecIters * 8; i < num_points; i++) {
        *out++ = (*inA++) + (*inB++);
    }
}
283#endif /* LV_HAVE_AVX */
284
285#ifdef LV_HAVE_SSE
286#include <xmmintrin.h>
287
static inline void volk_32f_x2_add_32f_a_sse(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    /* Aligned SSE kernel: cVector[i] = aVector[i] + bVector[i].
     * All three buffers must be 16-byte aligned. */
    const unsigned int vecIters = num_points / 4;

    const float* inA = aVector;
    const float* inB = bVector;
    float* out = cVector;

    /* Four floats per iteration using aligned 128-bit loads/stores. */
    for (unsigned int i = 0; i < vecIters; i++) {
        const __m128 va = _mm_load_ps(inA);
        const __m128 vb = _mm_load_ps(inB);
        _mm_store_ps(out, _mm_add_ps(va, vb));

        inA += 4;
        inB += 4;
        out += 4;
    }

    /* Scalar tail: handle the remaining 0-3 elements. */
    for (unsigned int i = vecIters * 4; i < num_points; i++) {
        *out++ = (*inA++) + (*inB++);
    }
}
319#endif /* LV_HAVE_SSE */
320
321
322#ifdef LV_HAVE_NEON
323#include <arm_neon.h>
324
static inline void volk_32f_x2_add_32f_u_neon(float* cVector,
                                              const float* aVector,
                                              const float* bVector,
                                              unsigned int num_points)
{
    /* NEON kernel: cVector[i] = aVector[i] + bVector[i]. */
    const unsigned int vecIters = num_points / 4;

    const float* inA = aVector;
    const float* inB = bVector;
    float* out = cVector;

    /* Four floats per iteration; prefetch the next quadword of each input. */
    for (unsigned int i = 0; i < vecIters; i++) {
        const float32x4_t va = vld1q_f32(inA);
        const float32x4_t vb = vld1q_f32(inB);
        __VOLK_PREFETCH(inA + 4);
        __VOLK_PREFETCH(inB + 4);

        vst1q_f32(out, vaddq_f32(va, vb));

        inA += 4;
        inB += 4;
        out += 4;
    }

    /* Scalar tail: handle the remaining 0-3 elements. */
    for (unsigned int i = vecIters * 4; i < num_points; i++) {
        *out++ = (*inA++) + (*inB++);
    }
}
359
360#endif /* LV_HAVE_NEON */
361
362#ifdef LV_HAVE_NEONV7
363extern void volk_32f_x2_add_32f_a_neonasm(float* cVector,
364 const float* aVector,
365 const float* bVector,
366 unsigned int num_points);
367#endif /* LV_HAVE_NEONV7 */
368
369#ifdef LV_HAVE_NEONV7
370extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector,
371 const float* aVector,
372 const float* bVector,
373 unsigned int num_points);
374#endif /* LV_HAVE_NEONV7 */
375
376
377#ifdef LV_HAVE_ORC
378
379extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector,
380 const float* aVector,
381 const float* bVector,
382 int num_points);
383
/* Thin shim that forwards to the ORC-generated implementation.
 * Note the width narrowing: num_points is passed to an impl that takes
 * a signed int, matching the ORC-generated prototype. */
static inline void volk_32f_x2_add_32f_u_orc(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
391
392#endif /* LV_HAVE_ORC */
393
394#ifdef LV_HAVE_RVV
395#include <riscv_vector.h>
396
static inline void volk_32f_x2_add_32f_rvv(float* cVector,
                                           const float* aVector,
                                           const float* bVector,
                                           unsigned int num_points)
{
    /* RISC-V Vector kernel: cVector[i] = aVector[i] + bVector[i].
     * Strip-mining loop: each pass processes vl elements, where vl is
     * chosen by vsetvl for the remaining count (e32, LMUL=8). */
    size_t remaining = num_points;
    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e32m8(remaining);
        const vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl);
        const vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector, vl);
        __riscv_vse32(cVector, __riscv_vfadd(va, vb, vl), vl);

        remaining -= vl;
        aVector += vl;
        bVector += vl;
        cVector += vl;
    }
}
410#endif /*LV_HAVE_RVV*/
411
412#endif /* INCLUDED_volk_32f_x2_add_32f_a_H */
static void volk_32f_x2_add_32f_u_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition volk_32f_x2_add_32f.h:325
static void volk_32f_x2_add_32f_u_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition volk_32f_x2_add_32f.h:147
static void volk_32f_x2_add_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition volk_32f_x2_add_32f.h:251
static void volk_32f_x2_add_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition volk_32f_x2_add_32f.h:110
static void volk_32f_x2_add_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition volk_32f_x2_add_32f.h:288
static void volk_32f_x2_add_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition volk_32f_x2_add_32f.h:184
#define __VOLK_PREFETCH(addr)
Definition volk_common.h:68