Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014, 2019 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
65
66#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
67#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
68
69#include <volk/volk_complex.h>
70
71
72static inline void calculate_scaled_distances(float* target,
73 const lv_32fc_t symbol,
74 const lv_32fc_t* points,
75 const float scalar,
76 const unsigned int num_points)
77{
78 lv_32fc_t diff;
79 for (unsigned int i = 0; i < num_points; ++i) {
80 /*
81 * Calculate: |y - x|^2 * SNR_lin
82 * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++);
83 */
84 diff = symbol - *points++;
85 *target++ =
86 scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
87 }
88}
89
90
91#ifdef LV_HAVE_AVX2
92#include <immintrin.h>
94
95static inline void
96volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target,
97 const lv_32fc_t* src0,
98 const lv_32fc_t* points,
99 float scalar,
100 unsigned int num_points)
101{
102 const unsigned int num_bytes = num_points * 8;
103 __m128 xmm9, xmm10;
104 __m256 xmm4, xmm6;
105 __m256 xmm_points0, xmm_points1, xmm_result;
106
107 const unsigned int bound = num_bytes >> 6;
108
109 // load complex value into all parts of the register.
110 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
111 const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
112
113 // Load scalar into all 8 parts of the register
114 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
115 const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
116
117 // Set permutation constant
118 const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
119
120 for (unsigned int i = 0; i < bound; ++i) {
121 xmm_points0 = _mm256_load_ps((float*)points);
122 xmm_points1 = _mm256_load_ps((float*)(points + 4));
123 points += 8;
124 __VOLK_PREFETCH(points);
125
127 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
128
129 _mm256_store_ps(target, xmm_result);
130 target += 8;
131 }
132
133 if (num_bytes >> 5 & 1) {
134 xmm_points0 = _mm256_load_ps((float*)points);
135
136 xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
137
138 points += 4;
139
140 xmm6 = _mm256_mul_ps(xmm4, xmm4);
141
142 xmm4 = _mm256_hadd_ps(xmm6, xmm6);
143 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
144
145 xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
146
147 xmm9 = _mm256_extractf128_ps(xmm_result, 1);
148 _mm_store_ps(target, xmm9);
149 target += 4;
150 }
151
152 if (num_bytes >> 4 & 1) {
153 xmm9 = _mm_load_ps((float*)points);
154
155 xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
156
157 points += 2;
158
159 xmm9 = _mm_mul_ps(xmm10, xmm10);
160
161 xmm10 = _mm_hadd_ps(xmm9, xmm9);
162
163 xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
164
165 _mm_storeh_pi((__m64*)target, xmm10);
166 target += 2;
167 }
168
169 calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
170}
171
172#endif /*LV_HAVE_AVX2*/
173
174
175#ifdef LV_HAVE_AVX
176#include <immintrin.h>
178
179static inline void
181 const lv_32fc_t* src0,
182 const lv_32fc_t* points,
183 float scalar,
184 unsigned int num_points)
185{
186 const int eightsPoints = num_points / 8;
187 const int remainder = num_points - 8 * eightsPoints;
188
189 __m256 xmm_points0, xmm_points1, xmm_result;
190
191 // load complex value into all parts of the register.
192 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
193
194 // Load scalar into all 8 parts of the register
195 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
196
197 for (int i = 0; i < eightsPoints; ++i) {
198 xmm_points0 = _mm256_load_ps((float*)points);
199 xmm_points1 = _mm256_load_ps((float*)(points + 4));
200 points += 8;
201
202 xmm_result = _mm256_scaled_norm_dist_ps(
203 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
204
205 _mm256_store_ps(target, xmm_result);
206 target += 8;
207 }
208
209 const lv_32fc_t symbol = *src0;
210 calculate_scaled_distances(target, symbol, points, scalar, remainder);
211}
212
213#endif /* LV_HAVE_AVX */
214
215
216#ifdef LV_HAVE_SSE3
217#include <pmmintrin.h>
219
220static inline void
222 const lv_32fc_t* src0,
223 const lv_32fc_t* points,
224 float scalar,
225 unsigned int num_points)
226{
227 __m128 xmm_points0, xmm_points1, xmm_result;
228
229 /*
230 * First do 4 values in every loop iteration.
231 * There may be up to 3 values left.
232 * leftovers0 indicates if at least 2 more are available for SSE execution.
233 * leftovers1 indicates if there is a single element left.
234 */
235 const int quarterPoints = num_points / 4;
236 const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
237 const int leftovers1 = num_points % 2;
238
239 // load complex value into both parts of the register.
240 const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
241
242 // Load scalar into all 4 parts of the register
243 const __m128 xmm_scalar = _mm_load1_ps(&scalar);
244
245 for (int i = 0; i < quarterPoints; ++i) {
246 xmm_points0 = _mm_load_ps((float*)points);
247 xmm_points1 = _mm_load_ps((float*)(points + 2));
248 points += 4;
249 __VOLK_PREFETCH(points);
250 // calculate distances
251 xmm_result = _mm_scaled_norm_dist_ps_sse3(
252 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
253
254 _mm_store_ps(target, xmm_result);
255 target += 4;
256 }
257
258 for (int i = 0; i < leftovers0; ++i) {
259 xmm_points0 = _mm_load_ps((float*)points);
260 points += 2;
261
262 xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
263 xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
264 xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
265 xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
266
267 _mm_storeh_pi((__m64*)target, xmm_result);
268 target += 2;
269 }
270
271 calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
272}
273
274#endif /*LV_HAVE_SSE3*/
275
276#ifdef LV_HAVE_SSE
278#include <xmmintrin.h>
279static inline void
281 const lv_32fc_t* src0,
282 const lv_32fc_t* points,
283 float scalar,
284 unsigned int num_points)
285{
286 const __m128 xmm_scalar = _mm_set1_ps(scalar);
287 const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
288
289 for (unsigned i = 0; i < num_points / 4; ++i) {
290 __m128 xmm_points0 = _mm_load_ps((float*)points);
291 __m128 xmm_points1 = _mm_load_ps((float*)(points + 2));
292 points += 4;
293 __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
294 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
295 _mm_store_ps((float*)target, xmm_result);
296 target += 4;
297 }
298
299 calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
300}
301#endif // LV_HAVE_SSE
302
303#ifdef LV_HAVE_GENERIC
304static inline void
306 const lv_32fc_t* src0,
307 const lv_32fc_t* points,
308 float scalar,
309 unsigned int num_points)
310{
311 const lv_32fc_t symbol = *src0;
312 calculate_scaled_distances(target, symbol, points, scalar, num_points);
313}
314
315#endif /*LV_HAVE_GENERIC*/
316
317
318#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/
319
320#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
321#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
322
323#include <volk/volk_complex.h>
324
325
326#ifdef LV_HAVE_AVX2
327#include <immintrin.h>
329
330static inline void
331volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target,
332 const lv_32fc_t* src0,
333 const lv_32fc_t* points,
334 float scalar,
335 unsigned int num_points)
336{
337 const unsigned int num_bytes = num_points * 8;
338 __m128 xmm9, xmm10;
339 __m256 xmm4, xmm6;
340 __m256 xmm_points0, xmm_points1, xmm_result;
341
342 const unsigned int bound = num_bytes >> 6;
343
344 // load complex value into all parts of the register.
345 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
346 const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
347
348 // Load scalar into all 8 parts of the register
349 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
350 const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
351
352 // Set permutation constant
353 const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
354
355 for (unsigned int i = 0; i < bound; ++i) {
356 xmm_points0 = _mm256_loadu_ps((float*)points);
357 xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
358 points += 8;
359 __VOLK_PREFETCH(points);
360
362 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
363
364 _mm256_storeu_ps(target, xmm_result);
365 target += 8;
366 }
367
368 if (num_bytes >> 5 & 1) {
369 xmm_points0 = _mm256_loadu_ps((float*)points);
370
371 xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
372
373 points += 4;
374
375 xmm6 = _mm256_mul_ps(xmm4, xmm4);
376
377 xmm4 = _mm256_hadd_ps(xmm6, xmm6);
378 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
379
380 xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
381
382 xmm9 = _mm256_extractf128_ps(xmm_result, 1);
383 _mm_storeu_ps(target, xmm9);
384 target += 4;
385 }
386
387 if (num_bytes >> 4 & 1) {
388 xmm9 = _mm_loadu_ps((float*)points);
389
390 xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
391
392 points += 2;
393
394 xmm9 = _mm_mul_ps(xmm10, xmm10);
395
396 xmm10 = _mm_hadd_ps(xmm9, xmm9);
397
398 xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
399
400 _mm_storeh_pi((__m64*)target, xmm10);
401 target += 2;
402 }
403
404 calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
405}
406
407#endif /*LV_HAVE_AVX2*/
408
409
410#ifdef LV_HAVE_AVX
411#include <immintrin.h>
413
414static inline void
416 const lv_32fc_t* src0,
417 const lv_32fc_t* points,
418 float scalar,
419 unsigned int num_points)
420{
421 const int eightsPoints = num_points / 8;
422 const int remainder = num_points - 8 * eightsPoints;
423
424 __m256 xmm_points0, xmm_points1, xmm_result;
425
426 // load complex value into all parts of the register.
427 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
428
429 // Load scalar into all 8 parts of the register
430 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
431
432 for (int i = 0; i < eightsPoints; ++i) {
433 xmm_points0 = _mm256_loadu_ps((float*)points);
434 xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
435 points += 8;
436
437 xmm_result = _mm256_scaled_norm_dist_ps(
438 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
439
440 _mm256_storeu_ps(target, xmm_result);
441 target += 8;
442 }
443
444 const lv_32fc_t symbol = *src0;
445 calculate_scaled_distances(target, symbol, points, scalar, remainder);
446}
447
448#endif /* LV_HAVE_AVX */
449
450
451#ifdef LV_HAVE_SSE3
452#include <pmmintrin.h>
454
455static inline void
457 const lv_32fc_t* src0,
458 const lv_32fc_t* points,
459 float scalar,
460 unsigned int num_points)
461{
462 __m128 xmm_points0, xmm_points1, xmm_result;
463
464 /*
465 * First do 4 values in every loop iteration.
466 * There may be up to 3 values left.
467 * leftovers0 indicates if at least 2 more are available for SSE execution.
468 * leftovers1 indicates if there is a single element left.
469 */
470 const int quarterPoints = num_points / 4;
471 const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
472 const int leftovers1 = num_points % 2;
473
474 // load complex value into both parts of the register.
475 const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
476
477 // Load scalar into all 4 parts of the register
478 const __m128 xmm_scalar = _mm_load1_ps(&scalar);
479
480 for (int i = 0; i < quarterPoints; ++i) {
481 xmm_points0 = _mm_loadu_ps((float*)points);
482 xmm_points1 = _mm_loadu_ps((float*)(points + 2));
483 points += 4;
484 __VOLK_PREFETCH(points);
485 // calculate distances
486 xmm_result = _mm_scaled_norm_dist_ps_sse3(
487 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
488
489 _mm_storeu_ps(target, xmm_result);
490 target += 4;
491 }
492
493 for (int i = 0; i < leftovers0; ++i) {
494 xmm_points0 = _mm_loadu_ps((float*)points);
495 points += 2;
496
497 xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
498 xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
499 xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
500 xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
501
502 _mm_storeh_pi((__m64*)target, xmm_result);
503 target += 2;
504 }
505
506 calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
507}
508
509#endif /*LV_HAVE_SSE3*/
510
511#ifdef LV_HAVE_SSE
513#include <xmmintrin.h>
514static inline void
516 const lv_32fc_t* src0,
517 const lv_32fc_t* points,
518 float scalar,
519 unsigned int num_points)
520{
521 const __m128 xmm_scalar = _mm_set1_ps(scalar);
522 const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
523
524 for (unsigned i = 0; i < num_points / 4; ++i) {
525 __m128 xmm_points0 = _mm_loadu_ps((float*)points);
526 __m128 xmm_points1 = _mm_loadu_ps((float*)(points + 2));
527 points += 4;
528 __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
529 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
530 _mm_storeu_ps((float*)target, xmm_result);
531 target += 4;
532 }
533
534 calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
535}
536#endif // LV_HAVE_SSE
537
538#ifdef LV_HAVE_RVV
539#include <riscv_vector.h>
540
/*
 * RISC-V Vector (RVV) implementation.
 * target[i] = scalar * |src0[0] - points[i]|^2
 */
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_rvv(float* target,
                                                  const lv_32fc_t* src0,
                                                  const lv_32fc_t* points,
                                                  float scalar,
                                                  unsigned int num_points)
{
    // Broadcast the symbol's real/imag parts and the scalar across full VL.
    size_t vlmax = __riscv_vsetvlmax_e32m4();
    vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax);
    vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax);
    vfloat32m4_t vscale = __riscv_vfmv_v_f_f32m4(scalar, vlmax);

    // Strip-mined loop: process vl elements per pass until none remain.
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, target += vl, points += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // Load each complex pair as one 64-bit word, then split it into
        // real (low 32 bits) and imag (high 32 bits) via narrowing shifts.
        vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)points, vl);
        vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl));
        vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl));
        // Component-wise difference: symbol - point.
        vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl);
        vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl);
        // |diff|^2 = re*re + im*im (fused multiply-accumulate).
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
        __riscv_vse32(target, __riscv_vfmul(v, vscale, vl), vl);
    }
}
565#endif /*LV_HAVE_RVV*/
566
567#ifdef LV_HAVE_RVVSEG
568#include <riscv_vector.h>
569
/*
 * RVV segmented-load implementation.
 * target[i] = scalar * |src0[0] - points[i]|^2
 * vlseg2e32 deinterleaves the complex stream directly into separate
 * real/imag vector registers (no shift-based unpacking needed).
 */
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_rvvseg(float* target,
                                                     const lv_32fc_t* src0,
                                                     const lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    // Broadcast the symbol's real/imag parts and the scalar across full VL.
    size_t vlmax = __riscv_vsetvlmax_e32m4();
    vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax);
    vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax);
    vfloat32m4_t vscale = __riscv_vfmv_v_f_f32m4(scalar, vlmax);

    // Strip-mined loop: process vl elements per pass until none remain.
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, target += vl, points += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // Segment load splits interleaved re/im pairs into two registers.
        vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)points, vl);
        vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0);
        vfloat32m4_t vbi = __riscv_vget_f32m4(vb, 1);
        // Component-wise difference: symbol - point.
        vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl);
        vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl);
        // |diff|^2 = re*re + im*im (fused multiply-accumulate).
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
        __riscv_vse32(target, __riscv_vfmul(v, vscale, vl), vl);
    }
}
594#endif /*LV_HAVE_RVVSEG*/
595
596#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H*/
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:456
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:305
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:180
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:515
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:280
static void calculate_scaled_distances(float *target, const lv_32fc_t symbol, const lv_32fc_t *points, const float scalar, const unsigned int num_points)
Definition volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:72
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:221
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:415
static __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition volk_avx2_intrinsics.h:107
static __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition volk_avx_intrinsics.h:113
#define __VOLK_PREFETCH(addr)
Definition volk_common.h:68
#define lv_cimag(x)
Definition volk_complex.h:98
#define lv_creal(x)
Definition volk_complex.h:96
float complex lv_32fc_t
Definition volk_complex.h:74
for i
Definition volk_config_fixed.tmpl.h:13
static __m128 _mm_scaled_norm_dist_ps_sse3(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)
Definition volk_sse3_intrinsics.h:50
static __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)
Definition volk_sse_intrinsics.h:74