66#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
67#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
76 const unsigned int num_points)
79 for (
unsigned int i = 0;
i < num_points; ++
i) {
84 diff = symbol - *points++;
96volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(
float* target,
100 unsigned int num_points)
102 const unsigned int num_bytes = num_points * 8;
105 __m256 xmm_points0, xmm_points1, xmm_result;
107 const unsigned int bound = num_bytes >> 6;
110 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((
const double*)src0));
111 const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
114 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
115 const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
118 const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
120 for (
unsigned int i = 0;
i < bound; ++
i) {
121 xmm_points0 = _mm256_load_ps((
float*)points);
122 xmm_points1 = _mm256_load_ps((
float*)(points + 4));
127 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
129 _mm256_store_ps(target, xmm_result);
133 if (num_bytes >> 5 & 1) {
134 xmm_points0 = _mm256_load_ps((
float*)points);
136 xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
140 xmm6 = _mm256_mul_ps(xmm4, xmm4);
142 xmm4 = _mm256_hadd_ps(xmm6, xmm6);
143 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
145 xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
147 xmm9 = _mm256_extractf128_ps(xmm_result, 1);
148 _mm_store_ps(target, xmm9);
152 if (num_bytes >> 4 & 1) {
153 xmm9 = _mm_load_ps((
float*)points);
155 xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
159 xmm9 = _mm_mul_ps(xmm10, xmm10);
161 xmm10 = _mm_hadd_ps(xmm9, xmm9);
163 xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
165 _mm_storeh_pi((__m64*)target, xmm10);
176#include <immintrin.h>
184 unsigned int num_points)
186 const int eightsPoints = num_points / 8;
187 const int remainder = num_points - 8 * eightsPoints;
189 __m256 xmm_points0, xmm_points1, xmm_result;
192 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((
const double*)src0));
195 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
197 for (
int i = 0;
i < eightsPoints; ++
i) {
198 xmm_points0 = _mm256_load_ps((
float*)points);
199 xmm_points1 = _mm256_load_ps((
float*)(points + 4));
203 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
205 _mm256_store_ps(target, xmm_result);
217#include <pmmintrin.h>
225 unsigned int num_points)
227 __m128 xmm_points0, xmm_points1, xmm_result;
235 const int quarterPoints = num_points / 4;
236 const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
237 const int leftovers1 = num_points % 2;
240 const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((
const double*)src0));
243 const __m128 xmm_scalar = _mm_load1_ps(&scalar);
245 for (
int i = 0;
i < quarterPoints; ++
i) {
246 xmm_points0 = _mm_load_ps((
float*)points);
247 xmm_points1 = _mm_load_ps((
float*)(points + 2));
252 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
254 _mm_store_ps(target, xmm_result);
258 for (
int i = 0;
i < leftovers0; ++
i) {
259 xmm_points0 = _mm_load_ps((
float*)points);
262 xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
263 xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
264 xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
265 xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
267 _mm_storeh_pi((__m64*)target, xmm_result);
278#include <xmmintrin.h>
284 unsigned int num_points)
286 const __m128 xmm_scalar = _mm_set1_ps(scalar);
287 const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((
const double*)src0));
289 for (
unsigned i = 0;
i < num_points / 4; ++
i) {
290 __m128 xmm_points0 = _mm_load_ps((
float*)points);
291 __m128 xmm_points1 = _mm_load_ps((
float*)(points + 2));
294 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
295 _mm_store_ps((
float*)target, xmm_result);
303#ifdef LV_HAVE_GENERIC
309 unsigned int num_points)
320#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
321#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
327#include <immintrin.h>
331volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(
float* target,
335 unsigned int num_points)
337 const unsigned int num_bytes = num_points * 8;
340 __m256 xmm_points0, xmm_points1, xmm_result;
342 const unsigned int bound = num_bytes >> 6;
345 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((
const double*)src0));
346 const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
349 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
350 const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
353 const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
355 for (
unsigned int i = 0;
i < bound; ++
i) {
356 xmm_points0 = _mm256_loadu_ps((
float*)points);
357 xmm_points1 = _mm256_loadu_ps((
float*)(points + 4));
362 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
364 _mm256_storeu_ps(target, xmm_result);
368 if (num_bytes >> 5 & 1) {
369 xmm_points0 = _mm256_loadu_ps((
float*)points);
371 xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
375 xmm6 = _mm256_mul_ps(xmm4, xmm4);
377 xmm4 = _mm256_hadd_ps(xmm6, xmm6);
378 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
380 xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
382 xmm9 = _mm256_extractf128_ps(xmm_result, 1);
383 _mm_storeu_ps(target, xmm9);
387 if (num_bytes >> 4 & 1) {
388 xmm9 = _mm_loadu_ps((
float*)points);
390 xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
394 xmm9 = _mm_mul_ps(xmm10, xmm10);
396 xmm10 = _mm_hadd_ps(xmm9, xmm9);
398 xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
400 _mm_storeh_pi((__m64*)target, xmm10);
411#include <immintrin.h>
419 unsigned int num_points)
421 const int eightsPoints = num_points / 8;
422 const int remainder = num_points - 8 * eightsPoints;
424 __m256 xmm_points0, xmm_points1, xmm_result;
427 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((
const double*)src0));
430 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
432 for (
int i = 0;
i < eightsPoints; ++
i) {
433 xmm_points0 = _mm256_loadu_ps((
float*)points);
434 xmm_points1 = _mm256_loadu_ps((
float*)(points + 4));
438 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
440 _mm256_storeu_ps(target, xmm_result);
452#include <pmmintrin.h>
460 unsigned int num_points)
462 __m128 xmm_points0, xmm_points1, xmm_result;
470 const int quarterPoints = num_points / 4;
471 const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
472 const int leftovers1 = num_points % 2;
475 const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((
const double*)src0));
478 const __m128 xmm_scalar = _mm_load1_ps(&scalar);
480 for (
int i = 0;
i < quarterPoints; ++
i) {
481 xmm_points0 = _mm_loadu_ps((
float*)points);
482 xmm_points1 = _mm_loadu_ps((
float*)(points + 2));
487 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
489 _mm_storeu_ps(target, xmm_result);
493 for (
int i = 0;
i < leftovers0; ++
i) {
494 xmm_points0 = _mm_loadu_ps((
float*)points);
497 xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
498 xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
499 xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
500 xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
502 _mm_storeh_pi((__m64*)target, xmm_result);
513#include <xmmintrin.h>
519 unsigned int num_points)
521 const __m128 xmm_scalar = _mm_set1_ps(scalar);
522 const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((
const double*)src0));
524 for (
unsigned i = 0;
i < num_points / 4; ++
i) {
525 __m128 xmm_points0 = _mm_loadu_ps((
float*)points);
526 __m128 xmm_points1 = _mm_loadu_ps((
float*)(points + 2));
529 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
530 _mm_storeu_ps((
float*)target, xmm_result);
539#include <riscv_vector.h>
/*!
 * \brief RISC-V Vector: target[i] = scalar * |src0[0] - points[i]|^2.
 *
 * Loads each complex point as a 64-bit word and splits real/imaginary
 * parts with narrowing shifts.
 *
 * \param target     output buffer of num_points floats
 * \param src0       pointer to the single reference symbol
 * \param points     input vector of complex points
 * \param scalar     multiplier applied to every squared distance
 * \param num_points number of points (and outputs)
 */
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_rvv(float* target,
                                                  const lv_32fc_t* src0,
                                                  const lv_32fc_t* points,
                                                  float scalar,
                                                  unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m4();
    /* Broadcast symbol real part, imaginary part, and the scalar. */
    vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax);
    vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax);
    vfloat32m4_t vscale = __riscv_vfmv_v_f_f32m4(scalar, vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, target += vl, points += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        /* Load complex points as 64-bit words, then split re/im lanes. */
        vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)points, vl);
        vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl));
        vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl));
        /* Squared distance, fused multiply-accumulate of both components. */
        vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl);
        vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl);
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
        __riscv_vse32(target, __riscv_vfmul(v, vscale, vl), vl);
    }
}
568#include <riscv_vector.h>
/*!
 * \brief RISC-V Vector (segment loads):
 *        target[i] = scalar * |src0[0] - points[i]|^2.
 *
 * Uses a 2-field segment load to deinterleave real/imaginary parts
 * directly instead of narrowing shifts.
 *
 * \param target     output buffer of num_points floats
 * \param src0       pointer to the single reference symbol
 * \param points     input vector of complex points
 * \param scalar     multiplier applied to every squared distance
 * \param num_points number of points (and outputs)
 */
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_rvvseg(float* target,
                                                     const lv_32fc_t* src0,
                                                     const lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m4();
    /* Broadcast symbol real part, imaginary part, and the scalar. */
    vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax);
    vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax);
    vfloat32m4_t vscale = __riscv_vfmv_v_f_f32m4(scalar, vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, target += vl, points += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        /* Segment load deinterleaves re (field 0) and im (field 1). */
        vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)points, vl);
        vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0);
        vfloat32m4_t vbi = __riscv_vget_f32m4(vb, 1);
        /* Squared distance, fused multiply-accumulate of both components. */
        vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl);
        vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl);
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
        __riscv_vse32(target, __riscv_vfmul(v, vscale, vl), vl);
    }
}
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:456
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:305
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:180
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:515
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:280
static void calculate_scaled_distances(float *target, const lv_32fc_t symbol, const lv_32fc_t *points, const float scalar, const unsigned int num_points)
Definition volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:72
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:221
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float *target, const lv_32fc_t *src0, const lv_32fc_t *points, float scalar, unsigned int num_points)
Definition volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:415
static __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition volk_avx2_intrinsics.h:107
static __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition volk_avx_intrinsics.h:113
#define __VOLK_PREFETCH(addr)
Definition volk_common.h:68
#define lv_cimag(x)
Definition volk_complex.h:98
#define lv_creal(x)
Definition volk_complex.h:96
float complex lv_32fc_t
Definition volk_complex.h:74
for i
Definition volk_config_fixed.tmpl.h:13
static __m128 _mm_scaled_norm_dist_ps_sse3(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)
Definition volk_sse3_intrinsics.h:50
static __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)
Definition volk_sse_intrinsics.h:74