#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
#define INCLUDED_volk_32fc_x2_square_dist_32f_a_H

#include <volk/volk_complex.h>
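/*
 * Each kernel below computes, for every entry of the points vector, the squared
 * Euclidean distance to the single reference point src0[0]:
 *
 *   target[i] = (lv_creal(src0[0]) - lv_creal(points[i]))^2
 *             + (lv_cimag(src0[0]) - lv_cimag(points[i]))^2
 */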
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_32fc_x2_square_dist_32f_a_avx2(float* target,
                                                       const lv_32fc_t* src0,
                                                       const lv_32fc_t* points,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm0, xmm9, xmm10;
    __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    int bound = num_bytes >> 6;            // number of full 8-point blocks
    int leftovers0 = (num_bytes >> 5) & 1; // one remaining 4-point block?
    int leftovers1 = (num_bytes >> 4) & 1; // one remaining 2-point block?
    int leftovers2 = (num_bytes >> 3) & 1; // one remaining single point?
    int i = 0;

    // Permutation that puts the hadd output back into point order
    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    // Broadcast the reference point src0[0] into every (re, im) lane pair
    xmm1 = _mm256_setzero_ps();
    xmm0 = _mm_load_ps((float*)src0);
    xmm0 = _mm_permute_ps(xmm0, 0b01000100);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
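    // Main loop: 8 complex points per iteration. Each _mm256_hadd_ps pairs
    // re^2 + im^2 into one squared distance; the permute restores point order
    // before the eight results are stored.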
    for (; i < bound; ++i) {
        xmm2 = _mm256_load_ps((float*)&points[0]);
        xmm3 = _mm256_load_ps((float*)&points[4]);

        xmm4 = _mm256_sub_ps(xmm1, xmm2);
        xmm5 = _mm256_sub_ps(xmm1, xmm3);
        xmm6 = _mm256_mul_ps(xmm4, xmm4);
        xmm7 = _mm256_mul_ps(xmm5, xmm5);

        xmm4 = _mm256_hadd_ps(xmm6, xmm7);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        _mm256_store_ps(target, xmm4);

        points += 8;
        target += 8;
    }
    for (i = 0; i < leftovers0; ++i) {
        xmm2 = _mm256_load_ps((float*)&points[0]);

        xmm4 = _mm256_sub_ps(xmm1, xmm2);
        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        // after the permute the four results sit in order in the upper half
        xmm9 = _mm256_extractf128_ps(xmm4, 1);
        _mm_store_ps(target, xmm9);

        points += 4;
        target += 4;
    }
    for (i = 0; i < leftovers1; ++i) {
        xmm9 = _mm_load_ps((float*)&points[0]);

        xmm10 = _mm_sub_ps(xmm0, xmm9);
        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        // the two results are in the upper 64 bits
        _mm_storeh_pi((__m64*)target, xmm10);

        points += 2;
        target += 2;
    }
    for (i = 0; i < leftovers2; ++i) {
        diff = src0[0] - points[0];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;
    }
}

#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target,
                                                       const lv_32fc_t* src0,
                                                       const lv_32fc_t* points,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;

    __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    int bound = num_bytes >> 5; // number of full 4-point blocks
    int i = 0;

    // Broadcast the reference point into both halves: (re, im, re, im)
    xmm1 = _mm_setzero_ps();
    xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
    xmm1 = _mm_movelh_ps(xmm1, xmm1);
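    // Main loop: 4 complex points per iteration; _mm_hadd_ps pairs re^2 + im^2,
    // so the four squared distances come out already in point order.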
    for (; i < bound; ++i) {
        xmm2 = _mm_load_ps((float*)&points[0]);
        xmm4 = _mm_sub_ps(xmm1, xmm2);
        xmm3 = _mm_load_ps((float*)&points[2]);
        xmm5 = _mm_sub_ps(xmm1, xmm3);

        points += 4;

        xmm6 = _mm_mul_ps(xmm4, xmm4);
        xmm7 = _mm_mul_ps(xmm5, xmm5);

        xmm4 = _mm_hadd_ps(xmm6, xmm7);

        _mm_store_ps(target, xmm4);

        target += 4;
    }
    if (num_bytes >> 4 & 1) {
        xmm2 = _mm_load_ps((float*)&points[0]);

        xmm4 = _mm_sub_ps(xmm1, xmm2);

        points += 2;

        xmm6 = _mm_mul_ps(xmm4, xmm4);

        xmm4 = _mm_hadd_ps(xmm6, xmm6);

        _mm_storeh_pi((__m64*)target, xmm4);

        target += 2;
    }
    if (num_bytes >> 3 & 1) {
        diff = src0[0] - points[0];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;
    }
}

#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32fc_x2_square_dist_32f_neon(float* target,
                                                     const lv_32fc_t* src0,
                                                     const lv_32fc_t* points,
                                                     unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    unsigned int number;

    float32x4x2_t a_vec, b_vec;
    float32x4x2_t diff_vec;
    float32x4_t tmp, tmp1, dist_sq;

    // Broadcast the real and imaginary parts of the reference point
    a_vec.val[0] = vdupq_n_f32(lv_creal(src0[0]));
    a_vec.val[1] = vdupq_n_f32(lv_cimag(src0[0]));
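    // Four points per iteration: vld2q_f32 de-interleaves the real and
    // imaginary parts, so the distance is a plain sub/mul/add per lane.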
    for (number = 0; number < quarter_points; ++number) {
        b_vec = vld2q_f32((float*)points);
        diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
        diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
        tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
        tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);

        dist_sq = vaddq_f32(tmp, tmp1);
        vst1q_f32(target, dist_sq);
        points += 4;
        target += 4;
    }
    // Scalar tail for the remaining (num_points % 4) points
    for (number = quarter_points * 4; number < num_points; ++number) {
        lv_32fc_t diff = src0[0] - *points++;
        *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
    }
}

#endif /*LV_HAVE_NEON*/
#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_x2_square_dist_32f_generic(float* target,
                                                        const lv_32fc_t* src0,
                                                        const lv_32fc_t* points,
                                                        unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;

    lv_32fc_t diff;
    unsigned int i = 0;

    for (; i < (num_bytes >> 3); ++i) {
        diff = src0[0] - points[i];
        target[i] = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
    }
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/
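/*
 * Illustrative sketch (buffer names and constellation values are made up for
 * illustration, not taken from this header): computing the squared distance
 * from one received symbol to each point of a small constellation with the
 * generic kernel above.
 *
 *   lv_32fc_t symbol = lv_cmake(0.7f, -0.7f);
 *   lv_32fc_t constellation[4] = { lv_cmake(1.f, 1.f), lv_cmake(1.f, -1.f),
 *                                  lv_cmake(-1.f, 1.f), lv_cmake(-1.f, -1.f) };
 *   float dist[4];
 *   volk_32fc_x2_square_dist_32f_generic(dist, &symbol, constellation, 4);
 */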
#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H
#define INCLUDED_volk_32fc_x2_square_dist_32f_u_H

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target,
                                                       const lv_32fc_t* src0,
                                                       const lv_32fc_t* points,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;

    __m128 xmm0, xmm9;
    __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    int bound = num_bytes >> 6;               // number of full 8-point blocks
    int leftovers1 = (num_bytes >> 3) & 0b11; // up to three remaining points
    int i = 0;

    // Permutation that puts the hadd output back into point order
    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    // Broadcast the reference point src0[0] into every (re, im) lane pair
    xmm1 = _mm256_setzero_ps();
    xmm0 = _mm_loadu_ps((float*)src0);
    xmm0 = _mm_permute_ps(xmm0, 0b01000100);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
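    // Same scheme as the aligned AVX2 kernel above, but with unaligned
    // loads/stores; up to three leftover points are handled in the scalar tail.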
    for (; i < bound; ++i) {
        xmm2 = _mm256_loadu_ps((float*)&points[0]);
        xmm3 = _mm256_loadu_ps((float*)&points[4]);

        xmm4 = _mm256_sub_ps(xmm1, xmm2);
        xmm5 = _mm256_sub_ps(xmm1, xmm3);
        xmm6 = _mm256_mul_ps(xmm4, xmm4);
        xmm7 = _mm256_mul_ps(xmm5, xmm5);

        xmm4 = _mm256_hadd_ps(xmm6, xmm7);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        _mm256_storeu_ps(target, xmm4);

        points += 8;
        target += 8;
    }
    if (num_bytes >> 5 & 1) {
        xmm2 = _mm256_loadu_ps((float*)&points[0]);

        xmm4 = _mm256_sub_ps(xmm1, xmm2);
        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        // after the permute the four results sit in order in the upper half
        xmm9 = _mm256_extractf128_ps(xmm4, 1);
        _mm_storeu_ps(target, xmm9);

        points += 4;
        target += 4;
    }
    for (i = 0; i < leftovers1; ++i) {
        diff = src0[0] - points[0];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;

        points += 1;
        target += 1;
    }
}

#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

static inline void volk_32fc_x2_square_dist_32f_rvv(float* target,
                                                    const lv_32fc_t* src0,
                                                    const lv_32fc_t* points,
                                                    unsigned int num_points)
{
    // Splat the real and imaginary parts of the reference point
    size_t vlmax = __riscv_vsetvlmax_e32m4();
    vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax);
    vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax);
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, target += vl, points += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // Load each complex point as a 64-bit word, then split into real/imag
        vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)points, vl);
        vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl));
        vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl));
        vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl);
        vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl);
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
        __riscv_vse32(target, v, vl);
    }
}

#endif /*LV_HAVE_RVV*/
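/* The RVV kernel above de-interleaves by loading each complex value as a
 * 64-bit word and narrowing out its low (real) and high (imaginary) 32 bits;
 * the variant below uses a segmented load to let the hardware de-interleave. */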
#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>

static inline void volk_32fc_x2_square_dist_32f_rvvseg(float* target,
                                                       const lv_32fc_t* src0,
                                                       const lv_32fc_t* points,
                                                       unsigned int num_points)
{
    // Splat the real and imaginary parts of the reference point
    size_t vlmax = __riscv_vsetvlmax_e32m4();
    vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax);
    vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax);
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, target += vl, points += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // Segmented load de-interleaves the complex data: index 0 = real, 1 = imag
        vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)points, vl);
        vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0);
        vfloat32m4_t vbi = __riscv_vget_f32m4(vb, 1);
        vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl);
        vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl);
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
        __riscv_vse32(target, v, vl);
    }
}

#endif /*LV_HAVE_RVVSEG*/

#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_u_H*/