Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h
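This kernel computes c[i] = a[i] + conj(b[i]) * scalar for interleaved 32-bit float complex samples; the "add2" variant of the kernel passes the complex scalar by pointer. Below is a minimal usage sketch, assuming the VOLK dispatcher entry point generated for this kernel and the volk_malloc/volk_get_alignment helpers from the public VOLK API (buffer sizes and sample values are illustrative):

#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    unsigned int N = 1024;
    size_t alignment = volk_get_alignment();
    lv_32fc_t* a = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
    lv_32fc_t* b = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
    lv_32fc_t* c = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
    lv_32fc_t scalar = lv_cmake(0.5f, -1.0f);

    for (unsigned int i = 0; i < N; i++) {
        a[i] = lv_cmake((float)i, 0.0f);
        b[i] = lv_cmake(1.0f, (float)i);
    }

    // c[i] = a[i] + conj(b[i]) * scalar; note that the scalar is passed by pointer
    volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc(c, a, b, &scalar, N);

    printf("c[1] = %.1f %+.1fj\n", lv_creal(c[1]), lv_cimag(c[1]));

    volk_free(a);
    volk_free(b);
    volk_free(c);
    return 0;
}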
/* -*- c++ -*- */
/*
 * Copyright 2019 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H
#define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H

#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_GENERIC

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_generic(lv_32fc_t* cVector,
                                                        const lv_32fc_t* aVector,
                                                        const lv_32fc_t* bVector,
                                                        const lv_32fc_t* scalar,
                                                        unsigned int num_points)
{
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;
    lv_32fc_t* cPtr = cVector;
    unsigned int number = num_points;

    // manually unrolled loop: process 8 points per iteration
    while (number >= 8) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
        number -= 8;
    }

    // clean up any remaining points
    while (number-- > 0) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
    }
}
#endif /* LV_HAVE_GENERIC */

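The SIMD kernels below offload the per-element product to _mm256_complexconjugatemul_ps and _mm_complexconjugatemul_ps from VOLK's volk_avx_intrinsics.h and volk_sse3_intrinsics.h. For reference, a scalar sketch of the arithmetic each lane performs (the helper name below is illustrative, not part of this header):

// c = a + conj(b) * s
//   = a + (br - j*bi) * (sr + j*si)
//   = a + (br*sr + bi*si) + j*(br*si - bi*sr)
static inline lv_32fc_t conj_multiply_add_one(lv_32fc_t a, lv_32fc_t b, lv_32fc_t s)
{
    float re = lv_creal(a) + lv_creal(b) * lv_creal(s) + lv_cimag(b) * lv_cimag(s);
    float im = lv_cimag(a) + lv_creal(b) * lv_cimag(s) - lv_cimag(b) * lv_creal(s);
    return lv_cmake(re, im);
}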
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_avx(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      const lv_32fc_t* scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3;

    __m256 x, y, s, z;
    lv_32fc_t v_scalar[4] = { *scalar, *scalar, *scalar, *scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm256_loadu_ps((float*)v_scalar);

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)b);
        y = _mm256_loadu_ps((float*)a);
        z = _mm256_complexconjugatemul_ps(s, x); // scalar * conj(b)
        z = _mm256_add_ps(y, z);
        _mm256_storeu_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    // handle any remaining points (num_points not a multiple of 4)
    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) + lv_conj(*b++) * (*scalar);
    }
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_sse3(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t* bVector,
                                                       const lv_32fc_t* scalar,
                                                       unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, s, z;
    lv_32fc_t v_scalar[2] = { *scalar, *scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm_loadu_ps((float*)v_scalar);

    for (; number < halfPoints; number++) {
        x = _mm_loadu_ps((float*)b);
        y = _mm_loadu_ps((float*)a);
        z = _mm_complexconjugatemul_ps(s, x); // scalar * conj(b)
        z = _mm_add_ps(y, z);
        _mm_storeu_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    // handle the final point if num_points is odd
    if ((num_points % 2) != 0) {
        *c = *a + lv_conj(*b) * (*scalar);
    }
}
#endif /* LV_HAVE_SSE3 */

#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_avx(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      const lv_32fc_t* scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3;

    __m256 x, y, s, z;
    lv_32fc_t v_scalar[4] = { *scalar, *scalar, *scalar, *scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm256_loadu_ps((float*)v_scalar);

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)b);
        y = _mm256_load_ps((float*)a);
        z = _mm256_complexconjugatemul_ps(s, x); // scalar * conj(b)
        z = _mm256_add_ps(y, z);
        _mm256_store_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    // handle any remaining points (num_points not a multiple of 4)
    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) + lv_conj(*b++) * (*scalar);
    }
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_sse3(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t* bVector,
                                                       const lv_32fc_t* scalar,
                                                       unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, s, z;
    lv_32fc_t v_scalar[2] = { *scalar, *scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm_loadu_ps((float*)v_scalar);

    for (; number < halfPoints; number++) {
        x = _mm_load_ps((float*)b);
        y = _mm_load_ps((float*)a);
        z = _mm_complexconjugatemul_ps(s, x); // scalar * conj(b)
        z = _mm_add_ps(y, z);
        _mm_store_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    // handle the final point if num_points is odd
    if ((num_points % 2) != 0) {
        *c = *a + lv_conj(*b) * (*scalar);
    }
}
#endif /* LV_HAVE_SSE3 */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_neon(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector,
                                                     const lv_32fc_t* scalar,
                                                     unsigned int num_points)
{
    const lv_32fc_t* bPtr = bVector;
    const lv_32fc_t* aPtr = aVector;
    lv_32fc_t* cPtr = cVector;
    unsigned int number = num_points;
    unsigned int quarter_points = num_points / 4;

    float32x4x2_t a_val, b_val, c_val, scalar_val;
    float32x4x2_t tmp_val;

    // broadcast the scalar's real and imaginary parts
    scalar_val.val[0] = vld1q_dup_f32((const float*)scalar);
    scalar_val.val[1] = vld1q_dup_f32(((const float*)scalar) + 1);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)aPtr);
        b_val = vld2q_f32((float*)bPtr);
        b_val.val[1] = vnegq_f32(b_val.val[1]); // negate imaginary parts: conj(b)
        __VOLK_PREFETCH(aPtr + 8);
        __VOLK_PREFETCH(bPtr + 8);

        tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]);
        tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]);

        tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]);
        tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]);

        c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]);
        c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]);

        vst2q_f32((float*)cPtr, c_val);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    // handle any remaining points (num_points not a multiple of 4)
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar);
    }
}
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_rvv(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const lv_32fc_t* bVector,
                                                    const lv_32fc_t* scalar,
                                                    unsigned int num_points)
{
    // broadcast the scalar's real and imaginary parts
    vfloat32m2_t vbr =
        __riscv_vfmv_v_f_f32m2(lv_creal(*scalar), __riscv_vsetvlmax_e32m2());
    vfloat32m2_t vbi =
        __riscv_vfmv_v_f_f32m2(lv_cimag(*scalar), __riscv_vsetvlmax_e32m2());
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, bVector += vl, aVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        // load complex samples as 64-bit words, then split real/imag via narrowing shifts
        vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)bVector, vl);
        vuint64m4_t vc = __riscv_vle64_v_u64m4((const uint64_t*)aVector, vl);
        vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl));
        vfloat32m2_t vcr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vc, 0, vl));
        vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl));
        vfloat32m2_t vci = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vc, 32, vl));
        // conj(b) * scalar, then add a
        vfloat32m2_t vr = __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
        vfloat32m2_t vi = __riscv_vfnmsac(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
        vuint32m2_t vru = __riscv_vreinterpret_u32m2(__riscv_vfadd(vr, vcr, vl));
        vuint32m2_t viu = __riscv_vreinterpret_u32m2(__riscv_vfadd(vi, vci, vl));
        // re-interleave: real part in the low 32 bits, imaginary part in the high 32 bits
        vuint64m4_t v =
            __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
        __riscv_vse64((uint64_t*)cVector, v, vl);
    }
}
#endif /*LV_HAVE_RVV*/

#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_rvvseg(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t* bVector,
                                                       const lv_32fc_t* scalar,
                                                       unsigned int num_points)
{
    vfloat32m4_t vbr =
        __riscv_vfmv_v_f_f32m4(lv_creal(*scalar), __riscv_vsetvlmax_e32m4());
    vfloat32m4_t vbi =
        __riscv_vfmv_v_f_f32m4(lv_cimag(*scalar), __riscv_vsetvlmax_e32m4());
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)aVector, vl);
        vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)bVector, vl);
        vfloat32m4_t vcr = __riscv_vget_f32m4(vc, 0), vci = __riscv_vget_f32m4(vc, 1);
        vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
        vfloat32m4_t vr = __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
        vfloat32m4_t vi = __riscv_vfnmsac(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
        vr = __riscv_vfadd(vr, vcr, vl);
        vi = __riscv_vfadd(vi, vci, vl);
        __riscv_vsseg2e32_v_f32m4x2(
            (float*)cVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl);
    }
}
#endif /*LV_HAVE_RVVSEG*/

#endif /* INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H */
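A small sanity-check sketch (not part of the header): it compares the dispatched kernel against a direct evaluation of c[i] = a[i] + conj(b[i]) * scalar, using an odd length so the scalar tail paths of the SIMD variants are exercised. Buffer length, sample values, and tolerance are illustrative choices.

#include <volk/volk.h>
#include <assert.h>
#include <math.h>

static void check_multiply_conjugate_add2(void)
{
    enum { N = 7 }; // odd length also exercises the scalar tail of each variant
    lv_32fc_t a[N], b[N], out[N];
    lv_32fc_t scalar = lv_cmake(2.0f, -3.0f);
    for (int i = 0; i < N; i++) {
        a[i] = lv_cmake((float)i, (float)-i);
        b[i] = lv_cmake(1.0f, (float)i);
    }

    volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc(out, a, b, &scalar, N);

    for (int i = 0; i < N; i++) {
        lv_32fc_t ref = a[i] + lv_conj(b[i]) * scalar; // reference result
        assert(fabsf(lv_creal(out[i]) - lv_creal(ref)) < 1e-4f);
        assert(fabsf(lv_cimag(out[i]) - lv_cimag(ref)) < 1e-4f);
    }
}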