Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
volk_32fc_s32fc_multiply2_32fc.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

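/*!
 * \page volk_32fc_s32fc_multiply2_32fc
 *
 * \b Overview
 *
 * Multiplies each element of the complex input vector by a complex scalar
 * and writes the result to the output vector: c[i] = a[i] * (*scalar).
 *
 * Editor's note: the file's original overview block is omitted from this
 * listing. The prototype and example below are a reconstructed sketch of
 * typical usage, assuming VOLK's public API (<volk/volk.h>,
 * volk_get_alignment(), volk_malloc()/volk_free(), lv_cmake()); they are
 * not the file's own example.
 *
 * <b>Dispatcher Prototype</b>
 * \code
 * void volk_32fc_s32fc_multiply2_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* scalar, unsigned int num_points)
 * \endcode
 *
 * \b Example
 * \code
 *   unsigned int N = 1000;
 *   unsigned int alignment = volk_get_alignment();
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   lv_32fc_t scalar = lv_cmake(2.f, 0.5f);
 *
 *   for (unsigned int ii = 0; ii < N; ++ii) {
 *       in[ii] = lv_cmake((float)ii, -(float)ii);
 *   }
 *
 *   // Scale the whole vector by the same complex constant.
 *   volk_32fc_s32fc_multiply2_32fc(out, in, &scalar, N);
 *
 *   volk_free(in);
 *   volk_free(out);
 * \endcode
 */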
#ifndef INCLUDED_volk_32fc_s32fc_multiply2_32fc_u_H
#define INCLUDED_volk_32fc_s32fc_multiply2_32fc_u_H

#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

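/*
 * Editor's note: every kernel below computes the same complex product; with
 * a = ar + j*ai and scalar s = sr + j*si it is
 *
 *     a * s = (ar*sr - ai*si) + j*(ai*sr + ar*si)
 *
 * The x86 variants broadcast sr and si each into a full register, build the
 * cross terms ai*si and ar*si with a lane swap (shuffle), and combine the
 * partial products with an addsub (or fmaddsub), so the even lanes subtract
 * (real parts) and the odd lanes add (imaginary parts).
 */
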
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_multiply2_32fc_u_avx_fma(lv_32fc_t* cVector,
                                                            const lv_32fc_t* aVector,
                                                            const lv_32fc_t* scalar,
                                                            unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3;
    __m256 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Set up constant scalar vector
    yl = _mm256_set1_ps(lv_creal(*scalar));
    yh = _mm256_set1_ps(lv_cimag(*scalar));

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)a); // Load four complex values as ar,ai,br,bi,...

        tmp1 = x;

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,...

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*si,ar*si,bi*si,br*si,...

        z = _mm256_fmaddsub_ps(
            tmp1, yl, tmp2); // ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si, ...

        _mm256_storeu_ps((float*)c, z); // Store the results back into the C container

        a += 4;
        c += 4;
    }

    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) * (*scalar);
    }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
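
/*
 * Editor's note: _mm256_fmaddsub_ps(tmp1, yl, tmp2) fuses the final combine
 * above into one instruction: even (real) lanes get tmp1*yl - tmp2 and odd
 * (imaginary) lanes get tmp1*yl + tmp2, exactly the sign pattern the complex
 * product needs, so the separate _mm256_mul_ps/_mm256_addsub_ps pair of the
 * plain AVX kernel below collapses into a single FMA.
 */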

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32fc_s32fc_multiply2_32fc_u_avx(lv_32fc_t* cVector,
                                                        const lv_32fc_t* aVector,
                                                        const lv_32fc_t* scalar,
                                                        unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3;
    __m256 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Set up constant scalar vector
    yl = _mm256_set1_ps(lv_creal(*scalar));
    yh = _mm256_set1_ps(lv_cimag(*scalar));

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)a); // Load four complex values as ar,ai,br,bi,...

        tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*sr,ai*sr,br*sr,bi*sr,...

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,...

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*si,ar*si,bi*si,br*si,...

        z = _mm256_addsub_ps(
            tmp1, tmp2); // ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si, ...

        _mm256_storeu_ps((float*)c, z); // Store the results back into the C container

        a += 4;
        c += 4;
    }

    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) * (*scalar);
    }
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

static inline void volk_32fc_s32fc_multiply2_32fc_u_sse3(lv_32fc_t* cVector,
                                                         const lv_32fc_t* aVector,
                                                         const lv_32fc_t* scalar,
                                                         unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Set up constant scalar vector
    yl = _mm_set_ps1(lv_creal(*scalar));
    yh = _mm_set_ps1(lv_cimag(*scalar));

    for (; number < halfPoints; number++) {

        x = _mm_loadu_ps((float*)a); // Load two complex values as ar,ai,br,bi

        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*sr,ai*sr,br*sr,bi*sr

        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*si,ar*si,bi*si,br*si

        z = _mm_addsub_ps(tmp1,
                          tmp2); // ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si

        _mm_storeu_ps((float*)c, z); // Store the results back into the C container

        a += 2;
        c += 2;
    }

    if ((num_points % 2) != 0) {
        *c = (*a) * (*scalar);
    }
}
#endif /* LV_HAVE_SSE3 */
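
/*
 * Editor's note: the SSE3 requirement comes from _mm_addsub_ps, which was
 * introduced with SSE3 (hence <pmmintrin.h>); the loads, shuffles, and
 * multiplies are plain SSE.
 */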

#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_s32fc_multiply2_32fc_generic(lv_32fc_t* cVector,
                                                          const lv_32fc_t* aVector,
                                                          const lv_32fc_t* scalar,
                                                          unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    unsigned int number = num_points;

    // unrolled loop: process eight points per iteration
    while (number >= 8) {
        *cPtr++ = (*aPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) * (*scalar);
        number -= 8;
    }

    // clean up any remaining points
    while (number-- > 0)
        *cPtr++ = *aPtr++ * (*scalar);
}
#endif /* LV_HAVE_GENERIC */


#endif /* INCLUDED_volk_32fc_s32fc_multiply2_32fc_u_H */
#ifndef INCLUDED_volk_32fc_s32fc_multiply2_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_multiply2_32fc_a_H

#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_multiply2_32fc_a_avx_fma(lv_32fc_t* cVector,
                                                            const lv_32fc_t* aVector,
                                                            const lv_32fc_t* scalar,
                                                            unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3;
    __m256 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Set up constant scalar vector
    yl = _mm256_set1_ps(lv_creal(*scalar));
    yh = _mm256_set1_ps(lv_cimag(*scalar));

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)a); // Load four complex values as ar,ai,br,bi,...

        tmp1 = x;

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,...

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*si,ar*si,bi*si,br*si,...

        z = _mm256_fmaddsub_ps(
            tmp1, yl, tmp2); // ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si, ...

        _mm256_store_ps((float*)c, z); // Store the results back into the C container

        a += 4;
        c += 4;
    }

    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) * (*scalar);
    }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32fc_s32fc_multiply2_32fc_a_avx(lv_32fc_t* cVector,
                                                        const lv_32fc_t* aVector,
                                                        const lv_32fc_t* scalar,
                                                        unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3;
    __m256 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Set up constant scalar vector
    yl = _mm256_set1_ps(lv_creal(*scalar));
    yh = _mm256_set1_ps(lv_cimag(*scalar));

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)a); // Load four complex values as ar,ai,br,bi,...

        tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*sr,ai*sr,br*sr,bi*sr,...

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,...

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*si,ar*si,bi*si,br*si,...

        z = _mm256_addsub_ps(
            tmp1, tmp2); // ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si, ...

        _mm256_store_ps((float*)c, z); // Store the results back into the C container

        a += 4;
        c += 4;
    }

    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) * (*scalar);
    }
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

static inline void volk_32fc_s32fc_multiply2_32fc_a_sse3(lv_32fc_t* cVector,
                                                         const lv_32fc_t* aVector,
                                                         const lv_32fc_t* scalar,
                                                         unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Set up constant scalar vector
    yl = _mm_set_ps1(lv_creal(*scalar));
    yh = _mm_set_ps1(lv_cimag(*scalar));

    for (; number < halfPoints; number++) {

        x = _mm_load_ps((float*)a); // Load two complex values as ar,ai,br,bi

        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*sr,ai*sr,br*sr,bi*sr

        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*si,ar*si,bi*si,br*si

        z = _mm_addsub_ps(tmp1,
                          tmp2); // ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si

        _mm_store_ps((float*)c, z); // Store the results back into the C container

        a += 2;
        c += 2;
    }

    if ((num_points % 2) != 0) {
        *c = (*a) * (*scalar);
    }
}
#endif /* LV_HAVE_SSE3 */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32fc_s32fc_multiply2_32fc_neon(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t* scalar,
                                                       unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    unsigned int number = num_points;
    unsigned int quarter_points = num_points / 4;

    float32x4x2_t a_val, scalar_val;
    float32x4x2_t tmp_imag;

    // Broadcast the scalar's real and imaginary parts into separate vectors
    scalar_val.val[0] = vld1q_dup_f32((const float*)scalar);
    scalar_val.val[1] = vld1q_dup_f32(((const float*)scalar) + 1);
    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)aPtr); // de-interleave into real/imag vectors
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]); // ai*sr
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]); // ar*sr

        // imag = ai*sr + ar*si, real = ar*sr - ai*si
        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], scalar_val.val[1]);
        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], scalar_val.val[1]);

        vst2q_f32((float*)cPtr, tmp_imag); // re-interleave and store
        aPtr += 4;
        cPtr += 4;
    }

    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = *aPtr++ * (*scalar);
    }
}
#endif /* LV_HAVE_NEON */
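
/*
 * Editor's note: unlike the x86 kernels, which keep the data interleaved and
 * shuffle, the NEON kernel de-interleaves four complex values into separate
 * real and imaginary vectors with vld2q_f32, forms the product in planar form
 * with vmulq_f32/vmlaq_f32/vmlsq_f32, and re-interleaves on store with
 * vst2q_f32.
 */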

#endif /* INCLUDED_volk_32fc_s32fc_multiply2_32fc_a_H */