volk_32fc_x2_divide_32fc.h
/* -*- c++ -*- */
/*
 * Copyright 2016 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
#define INCLUDED_volk_32fc_x2_divide_32fc_u_H

#include <float.h>
#include <inttypes.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const lv_32fc_t* bVector,
                                                    unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;

    for (unsigned int number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
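
/*
 * Usage sketch (assumes the VOLK runtime headers, not just this file): the
 * generated dispatcher volk_32fc_x2_divide_32fc() selects the fastest kernel
 * for the running CPU; allocating with volk_malloc() yields aligned buffers
 * so the _a kernels can be used.
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int N = 1024;
 *   size_t alignment = volk_get_alignment();
 *   lv_32fc_t* num = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   lv_32fc_t* den = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *
 *   for (unsigned int i = 0; i < N; i++) {
 *       num[i] = lv_cmake(1.f, 2.f);
 *       den[i] = lv_cmake(3.f, 4.f);
 *   }
 *
 *   volk_32fc_x2_divide_32fc(out, num, den, N); // each out[i] == 0.44 + 0.08i
 *
 *   volk_free(num);
 *   volk_free(den);
 *   volk_free(out);
 */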


#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector,
                                                   const lv_32fc_t* numeratorVector,
                                                   const lv_32fc_t* denumeratorVector,
                                                   unsigned int num_points)
{
    /*
     * we'll do the "classical"
     *    a     a * b*
     *   --- = --------
     *    b     |b|^2
     */
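    /*
     * Worked instance of the identity above:
     *   (1+2i)/(3+4i) = (1+2i)(3-4i) / (3^2+4^2)
     *                 = (11+2i) / 25 = 0.44 + 0.08i
     */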
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 num01, num23, den01, den23, norm, result;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    for (; number < quarterPoints; number++) {
        num01 = _mm_loadu_ps((float*)a);                  // first pair
        den01 = _mm_loadu_ps((float*)b);                  // first pair
        num01 = _mm_complexconjugatemul_ps(num01, den01); // a * conj(b)
        a += 2;
        b += 2;

        num23 = _mm_loadu_ps((float*)a);                  // second pair
        den23 = _mm_loadu_ps((float*)b);                  // second pair
        num23 = _mm_complexconjugatemul_ps(num23, den23); // a * conj(b)
        a += 2;
        b += 2;

        norm = _mm_magnitudesquared_ps_sse3(den01, den23); // |b0|^2,|b1|^2,|b2|^2,|b3|^2
        den01 = _mm_unpacklo_ps(norm, norm); // duplicate the lower two floats
        den23 = _mm_unpackhi_ps(norm, norm); // duplicate the upper two floats

        result = _mm_div_ps(num01, den01);
        _mm_storeu_ps((float*)c, result); // store the results back into the output vector
        c += 2;
        result = _mm_div_ps(num23, den23);
        _mm_storeu_ps((float*)c, result); // store the results back into the output vector
        c += 2;
    }

    number *= 4;
    for (; number < num_points; number++) {
        *c = (*a) / (*b);
        a++;
        b++;
        c++;
    }
}
#endif /* LV_HAVE_SSE3 */


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector,
                                                  const lv_32fc_t* numeratorVector,
                                                  const lv_32fc_t* denumeratorVector,
                                                  unsigned int num_points)
{
    /*
     * we'll do the "classical"
     *    a     a * b*
     *   --- = --------
     *    b     |b|^2
     */
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    for (; number < quarterPoints; number++) {
        num = _mm256_loadu_ps((float*)a);   // load ar + ai, br + bi ... as ar,ai,br,bi ...
        denum = _mm256_loadu_ps((float*)b); // load cr + ci, dr + di ... as cr,ci,dr,di ...
        mul_conj = _mm256_complexconjugatemul_ps(num, denum);
        sq = _mm256_mul_ps(denum, denum);   // square each component
        mag_sq_un = _mm256_hadd_ps(sq, sq); // the squared magnitudes, but out of order
        mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // reorder so each |b|^2 is
                                                     // duplicated across its lanes
        // best guide I found on using these functions:
        // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
        div = _mm256_div_ps(mul_conj, mag_sq);

        _mm256_storeu_ps((float*)c, div); // store the results back into the output vector

        a += 4;
        b += 4;
        c += 4;
    }

    number = quarterPoints * 4;

    for (; number < num_points; number++) {
        *c++ = (*a++) / (*b++);
    }
}
#endif /* LV_HAVE_AVX */


#endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */


#ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
#define INCLUDED_volk_32fc_x2_divide_32fc_a_H

#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector,
                                                   const lv_32fc_t* numeratorVector,
                                                   const lv_32fc_t* denumeratorVector,
                                                   unsigned int num_points)
{
    /*
     * we'll do the "classical"
     *    a     a * b*
     *   --- = --------
     *    b     |b|^2
     */
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 num01, num23, den01, den23, norm, result;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    for (; number < quarterPoints; number++) {
        num01 = _mm_load_ps((float*)a);                   // first pair
        den01 = _mm_load_ps((float*)b);                   // first pair
        num01 = _mm_complexconjugatemul_ps(num01, den01); // a * conj(b)
        a += 2;
        b += 2;

        num23 = _mm_load_ps((float*)a);                   // second pair
        den23 = _mm_load_ps((float*)b);                   // second pair
        num23 = _mm_complexconjugatemul_ps(num23, den23); // a * conj(b)
        a += 2;
        b += 2;

        norm = _mm_magnitudesquared_ps_sse3(den01, den23);

        den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice
        den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice

        result = _mm_div_ps(num01, den01);
        _mm_store_ps((float*)c, result); // store the results back into the output vector
        c += 2;
        result = _mm_div_ps(num23, den23);
        _mm_store_ps((float*)c, result); // store the results back into the output vector
        c += 2;
    }

    number *= 4;
    for (; number < num_points; number++) {
        *c = (*a) / (*b);
        a++;
        b++;
        c++;
    }
}
#endif /* LV_HAVE_SSE3 */

#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector,
                                                  const lv_32fc_t* numeratorVector,
                                                  const lv_32fc_t* denumeratorVector,
                                                  unsigned int num_points)
{
    /*
     * Guide to AVX intrinsics:
     * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
     *
     * we'll do the "classical"
     *    a     a * b*
     *   --- = --------
     *    b     |b|^2
     */
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    const unsigned int eighthPoints = num_points / 8;

    __m256 num01, num23, denum01, denum23, complex_result, result0, result1;

    for (unsigned int number = 0; number < eighthPoints; number++) {
        // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
        num01 = _mm256_load_ps((float*)a);
        denum01 = _mm256_load_ps((float*)b);

        num01 = _mm256_complexconjugatemul_ps(num01, denum01);
        a += 4;
        b += 4;

        num23 = _mm256_load_ps((float*)a);
        denum23 = _mm256_load_ps((float*)b);
        num23 = _mm256_complexconjugatemul_ps(num23, denum23);
        a += 4;
        b += 4;

        // horizontal adds collect the squared magnitudes of all eight denominators
        complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
                                        _mm256_mul_ps(denum23, denum23));

        // duplicate each |b|^2 across the real/imaginary lanes of its quotient
        denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
        denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);

        result0 = _mm256_div_ps(num01, denum01);
        result1 = _mm256_div_ps(num23, denum23);

        _mm256_store_ps((float*)c, result0);
        c += 4;
        _mm256_store_ps((float*)c, result1);
        c += 4;
    }

    volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - eighthPoints * 8);
}
#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector,
                                                 const lv_32fc_t* aVector,
                                                 const lv_32fc_t* bVector,
                                                 unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;

    float32x4x2_t aVal, bVal, cVal;
    float32x4_t bAbs, bAbsInv;

    const unsigned int quarterPoints = num_points / 4;
    unsigned int number = 0;
    for (; number < quarterPoints; number++) {
        aVal = vld2q_f32((const float*)(aPtr)); // deinterleaved load: real/imag planes
        bVal = vld2q_f32((const float*)(bPtr));
        aPtr += 4;
        bPtr += 4;
        __VOLK_PREFETCH(aPtr + 4);
        __VOLK_PREFETCH(bPtr + 4);

        // |b|^2 = br*br + bi*bi
        bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
        bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);

        // reciprocal estimate of |b|^2, refined with two Newton-Raphson steps:
        // x' = x * (2 - d*x), where vrecpsq_f32 supplies the (2 - d*x) factor
        bAbsInv = vrecpeq_f32(bAbs);
        bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
        bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));

        // real part: (ar*br + ai*bi) / |b|^2
        cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
        cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
        cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);

        // imaginary part: (ai*br - ar*bi) / |b|^2
        cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
        cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
        cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);

        vst2q_f32((float*)(cPtr), cVal); // interleaved store
        cPtr += 4;
    }

    for (number = quarterPoints * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>


static inline void volk_32fc_x2_divide_32fc_rvv(lv_32fc_t* cVector,
                                                const lv_32fc_t* aVector,
                                                const lv_32fc_t* bVector,
                                                unsigned int num_points)
{
    uint64_t* out = (uint64_t*)cVector;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, out += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // load each complex as one 64-bit word, then narrow-shift to split it
        // into real (low 32 bits) and imaginary (high 32 bits) halves
        vuint64m8_t va = __riscv_vle64_v_u64m8((const uint64_t*)aVector, vl);
        vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)bVector, vl);
        vfloat32m4_t var = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl));
        vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl));
        vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl));
        vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl));
        // mul = 1 / |b|^2
        vfloat32m4_t mul = __riscv_vfrdiv(
            __riscv_vfmacc(__riscv_vfmul(vbi, vbi, vl), vbr, vbr, vl), 1.0f, vl);
        // real part: (ar*br + ai*bi) / |b|^2
        vfloat32m4_t vr = __riscv_vfmul(
            __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl), mul, vl);
        // imaginary part: (ai*br - ar*bi) / |b|^2
        vfloat32m4_t vi = __riscv_vfmul(
            __riscv_vfnmsac(__riscv_vfmul(vai, vbr, vl), var, vbi, vl), mul, vl);
        // re-interleave: vwaddu widens to r + i, vwmaccu adds (2^32 - 1) * i,
        // so each 64-bit word becomes r + 2^32 * i (real low, imaginary high)
        vuint32m4_t vru = __riscv_vreinterpret_u32m4(vr);
        vuint32m4_t viu = __riscv_vreinterpret_u32m4(vi);
        vuint64m8_t v =
            __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
        __riscv_vse64(out, v, vl);
    }
}
#endif /* LV_HAVE_RVV */

#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>

static inline void volk_32fc_x2_divide_32fc_rvvseg(lv_32fc_t* cVector,
                                                   const lv_32fc_t* aVector,
                                                   const lv_32fc_t* bVector,
                                                   unsigned int num_points)
{
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // segment loads deinterleave directly into real/imaginary registers
        vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)aVector, vl);
        vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)bVector, vl);
        vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
        vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1);
        // mul = 1 / |b|^2
        vfloat32m4_t mul = __riscv_vfrdiv(
            __riscv_vfmacc(__riscv_vfmul(vbi, vbi, vl), vbr, vbr, vl), 1.0f, vl);
        // real part: (ar*br + ai*bi) / |b|^2
        vfloat32m4_t vr = __riscv_vfmul(
            __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl), mul, vl);
        // imaginary part: (ai*br - ar*bi) / |b|^2
        vfloat32m4_t vi = __riscv_vfmul(
            __riscv_vfnmsac(__riscv_vfmul(vai, vbr, vl), var, vbi, vl), mul, vl);
        // segment store re-interleaves the two planes back to ar,ai ...
        __riscv_vsseg2e32_v_f32m4x2(
            (float*)cVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl);
    }
}

#endif /* LV_HAVE_RVVSEG */

#endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */