Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_x2_multiply_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
57
58#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H
59#define INCLUDED_volk_32f_x2_multiply_32f_u_H
60
61#include <inttypes.h>
62#include <stdio.h>
63
64#ifdef LV_HAVE_SSE
65#include <xmmintrin.h>
66
/*
 * Element-wise product of two float arrays, SSE version for pointers with
 * no particular alignment (unaligned load/store intrinsics are used).
 *
 * cVector    - output array; element i receives aVector[i] * bVector[i]
 * aVector    - first input array
 * bVector    - second input array
 * num_points - number of elements to process
 */
static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    /* Number of elements covered by whole 4-float vectors. */
    const unsigned int simd_end = (num_points / 4) * 4;
    unsigned int idx = 0;

    /* Vector part: four products per iteration. */
    while (idx < simd_end) {
        const __m128 lhs = _mm_loadu_ps(aVector + idx);
        const __m128 rhs = _mm_loadu_ps(bVector + idx);
        _mm_storeu_ps(cVector + idx, _mm_mul_ps(lhs, rhs));
        idx += 4;
    }

    /* Scalar part: the remaining (at most three) elements. */
    while (idx < num_points) {
        cVector[idx] = aVector[idx] * bVector[idx];
        idx++;
    }
}
99#endif /* LV_HAVE_SSE */
100
101#ifdef LV_HAVE_AVX512F
102#include <immintrin.h>
103
/*
 * Element-wise product of two float arrays, AVX-512F version for pointers
 * with no particular alignment (unaligned load/store intrinsics are used).
 *
 * cVector    - output array; element i receives aVector[i] * bVector[i]
 * aVector    - first input array
 * bVector    - second input array
 * num_points - number of elements to process
 */
static inline void volk_32f_x2_multiply_32f_u_avx512f(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    /* Number of elements covered by whole 16-float vectors. */
    const unsigned int simd_end = (num_points / 16) * 16;
    unsigned int idx = 0;

    /* Vector part: sixteen products per iteration. */
    while (idx < simd_end) {
        const __m512 lhs = _mm512_loadu_ps(aVector + idx);
        const __m512 rhs = _mm512_loadu_ps(bVector + idx);
        _mm512_storeu_ps(cVector + idx, _mm512_mul_ps(lhs, rhs));
        idx += 16;
    }

    /* Scalar part: the remaining (at most fifteen) elements. */
    while (idx < num_points) {
        cVector[idx] = aVector[idx] * bVector[idx];
        idx++;
    }
}
136#endif /* LV_HAVE_AVX512F */
137
138#ifdef LV_HAVE_AVX
139#include <immintrin.h>
140
/*
 * Element-wise product of two float arrays, AVX version for pointers with
 * no particular alignment (unaligned load/store intrinsics are used).
 *
 * cVector    - output array; element i receives aVector[i] * bVector[i]
 * aVector    - first input array
 * bVector    - second input array
 * num_points - number of elements to process
 */
static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    /* Number of elements covered by whole 8-float vectors. */
    const unsigned int simd_end = (num_points / 8) * 8;
    unsigned int idx = 0;

    /* Vector part: eight products per iteration. */
    while (idx < simd_end) {
        const __m256 lhs = _mm256_loadu_ps(aVector + idx);
        const __m256 rhs = _mm256_loadu_ps(bVector + idx);
        _mm256_storeu_ps(cVector + idx, _mm256_mul_ps(lhs, rhs));
        idx += 8;
    }

    /* Scalar part: the remaining (at most seven) elements. */
    while (idx < num_points) {
        cVector[idx] = aVector[idx] * bVector[idx];
        idx++;
    }
}
173#endif /* LV_HAVE_AVX */
174
175
176#ifdef LV_HAVE_GENERIC
177
/*
 * Portable element-wise product of two float arrays.
 *
 * cVector    - output array; element i receives aVector[i] * bVector[i]
 * aVector    - first input array
 * bVector    - second input array
 * num_points - number of elements to process
 */
static inline void volk_32f_x2_multiply_32f_generic(float* cVector,
                                                    const float* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    for (unsigned int idx = 0; idx < num_points; ++idx) {
        cVector[idx] = aVector[idx] * bVector[idx];
    }
}
192#endif /* LV_HAVE_GENERIC */
193
194
195#endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */
196
197
198#ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H
199#define INCLUDED_volk_32f_x2_multiply_32f_a_H
200
201#include <inttypes.h>
202#include <stdio.h>
203
204#ifdef LV_HAVE_SSE
205#include <xmmintrin.h>
206
/*
 * Element-wise product of two float arrays, SSE version using the aligned
 * load/store intrinsics (_mm_load_ps / _mm_store_ps), which expect
 * 16-byte-aligned addresses.
 *
 * cVector    - output array; element i receives aVector[i] * bVector[i]
 * aVector    - first input array
 * bVector    - second input array
 * num_points - number of elements to process
 */
static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    /* Number of elements covered by whole 4-float vectors. */
    const unsigned int simd_end = (num_points / 4) * 4;
    unsigned int idx = 0;

    /* Vector part: four products per iteration. */
    while (idx < simd_end) {
        const __m128 lhs = _mm_load_ps(aVector + idx);
        const __m128 rhs = _mm_load_ps(bVector + idx);
        _mm_store_ps(cVector + idx, _mm_mul_ps(lhs, rhs));
        idx += 4;
    }

    /* Scalar part: the remaining (at most three) elements. */
    while (idx < num_points) {
        cVector[idx] = aVector[idx] * bVector[idx];
        idx++;
    }
}
239#endif /* LV_HAVE_SSE */
240
241#ifdef LV_HAVE_AVX512F
242#include <immintrin.h>
243
/*
 * Element-wise product of two float arrays, AVX-512F version using the
 * aligned load/store intrinsics (_mm512_load_ps / _mm512_store_ps), which
 * expect 64-byte-aligned addresses.
 *
 * cVector    - output array; element i receives aVector[i] * bVector[i]
 * aVector    - first input array
 * bVector    - second input array
 * num_points - number of elements to process
 */
static inline void volk_32f_x2_multiply_32f_a_avx512f(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    /* Number of elements covered by whole 16-float vectors. */
    const unsigned int simd_end = (num_points / 16) * 16;
    unsigned int idx = 0;

    /* Vector part: sixteen products per iteration. */
    while (idx < simd_end) {
        const __m512 lhs = _mm512_load_ps(aVector + idx);
        const __m512 rhs = _mm512_load_ps(bVector + idx);
        _mm512_store_ps(cVector + idx, _mm512_mul_ps(lhs, rhs));
        idx += 16;
    }

    /* Scalar part: the remaining (at most fifteen) elements. */
    while (idx < num_points) {
        cVector[idx] = aVector[idx] * bVector[idx];
        idx++;
    }
}
276#endif /* LV_HAVE_AVX512F */
277
278
279#ifdef LV_HAVE_AVX
280#include <immintrin.h>
281
/*
 * Element-wise product of two float arrays, AVX version using the aligned
 * load/store intrinsics (_mm256_load_ps / _mm256_store_ps), which expect
 * 32-byte-aligned addresses.
 *
 * cVector    - output array; element i receives aVector[i] * bVector[i]
 * aVector    - first input array
 * bVector    - second input array
 * num_points - number of elements to process
 */
static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    /* Number of elements covered by whole 8-float vectors. */
    const unsigned int simd_end = (num_points / 8) * 8;
    unsigned int idx = 0;

    /* Vector part: eight products per iteration. */
    while (idx < simd_end) {
        const __m256 lhs = _mm256_load_ps(aVector + idx);
        const __m256 rhs = _mm256_load_ps(bVector + idx);
        _mm256_store_ps(cVector + idx, _mm256_mul_ps(lhs, rhs));
        idx += 8;
    }

    /* Scalar part: the remaining (at most seven) elements. */
    while (idx < num_points) {
        cVector[idx] = aVector[idx] * bVector[idx];
        idx++;
    }
}
314#endif /* LV_HAVE_AVX */
315
316
317#ifdef LV_HAVE_NEON
318#include <arm_neon.h>
319
/*
 * Element-wise product of two float arrays using NEON intrinsics.
 *
 * cVector    - output array; element i receives aVector[i] * bVector[i]
 * aVector    - first input array
 * bVector    - second input array
 * num_points - number of elements to process
 */
static inline void volk_32f_x2_multiply_32f_neon(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    /* Number of elements covered by whole 4-float vectors. */
    const unsigned int simd_end = (num_points / 4) * 4;
    unsigned int idx = 0;

    /* Vector part: four products per iteration. */
    while (idx < simd_end) {
        const float32x4_t lhs = vld1q_f32(aVector + idx);
        const float32x4_t rhs = vld1q_f32(bVector + idx);
        vst1q_f32(cVector + idx, vmulq_f32(lhs, rhs));
        idx += 4;
    }

    /* Scalar part: the remaining (at most three) elements. */
    while (idx < num_points) {
        cVector[idx] = aVector[idx] * bVector[idx];
        ++idx;
    }
}
341#endif /* LV_HAVE_NEON */
342
343
344#ifdef LV_HAVE_ORC
/* Multiply kernel implemented outside this header; note the signed count. */
extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                int num_points);

/*
 * Thin adapter that forwards its arguments unchanged to
 * volk_32f_x2_multiply_32f_a_orc_impl.
 *
 * cVector    - output array, passed through
 * aVector    - first input array, passed through
 * bVector    - second input array, passed through
 * num_points - element count; converted to int for the callee
 */
static inline void volk_32f_x2_multiply_32f_u_orc(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    /* The callee takes int, so spell out the conversion the call performs. */
    const int count = (int)num_points;

    volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, count);
}
357#endif /* LV_HAVE_ORC */
358
359#ifdef LV_HAVE_RVV
360#include <riscv_vector.h>
361
/*
 * Element-wise product of two float arrays using RISC-V Vector intrinsics.
 * Each pass asks __riscv_vsetvl_e32m8 how many elements to handle and
 * advances by that amount until nothing is left.
 *
 * cVector    - output array; element i receives aVector[i] * bVector[i]
 * aVector    - first input array
 * bVector    - second input array
 * num_points - number of elements to process
 */
static inline void volk_32f_x2_multiply_32f_rvv(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points)
{
    size_t remaining = num_points;
    size_t offset = 0;

    while (remaining > 0) {
        /* Vector length granted for this pass. */
        size_t vl = __riscv_vsetvl_e32m8(remaining);

        vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector + offset, vl);
        vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector + offset, vl);
        __riscv_vse32(cVector + offset, __riscv_vfmul(va, vb, vl), vl);

        offset += vl;
        remaining -= vl;
    }
}
375#endif /*LV_HAVE_RVV*/
376
377#endif /* INCLUDED_volk_32f_x2_multiply_32f_a_H */
static void volk_32f_x2_multiply_32f_u_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition volk_32f_x2_multiply_32f.h:67
static void volk_32f_x2_multiply_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition volk_32f_x2_multiply_32f.h:178
static void volk_32f_x2_multiply_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition volk_32f_x2_multiply_32f.h:207
static void volk_32f_x2_multiply_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition volk_32f_x2_multiply_32f.h:282
static void volk_32f_x2_multiply_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition volk_32f_x2_multiply_32f.h:141
static void volk_32f_x2_multiply_32f_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition volk_32f_x2_multiply_32f.h:320