#ifndef INCLUDED_volk_32fc_index_max_32u_a_H
#define INCLUDED_volk_32fc_index_max_32u_a_H

#include <inttypes.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>
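/*!
 * \page volk_32fc_index_max_32u
 *
 * \b Overview
 *
 * Returns Argmax_i mag(x[i]): finds and returns the index holding the
 * maximum-magnitude complex point in the given vector. The kernels below
 * compare squared magnitudes (re*re + im*im), which preserves the argmax and
 * avoids the square root.
 *
 * \b Dispatcher Prototype
 * \code
 * void volk_32fc_index_max_32u(uint32_t* target, const lv_32fc_t* src0, uint32_t num_points)
 * \endcode
 *
 * \b Example
 * A minimal usage sketch (the ramp fill is illustrative only, so the argmax
 * is known to be N - 1):
 * \code
 *   uint32_t N = 1024;
 *   uint32_t alignment = volk_get_alignment();
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   uint32_t* index = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
 *
 *   for (uint32_t ii = 0; ii < N; ++ii) {
 *       in[ii] = lv_cmake((float)ii / N, 0.f);
 *   }
 *
 *   volk_32fc_index_max_32u(index, in, N);
 *   printf("maximum magnitude at index %u\n", *index);
 *
 *   volk_free(in);
 *   volk_free(index);
 * \endcode
 */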
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_32u_a_avx2_variant_0(uint32_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /* The helper computes the squared magnitudes in a lane order permuted by
     * the 128-bit halves of in0/in1, so the starting indices are permuted to
     * match. */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // Determine the overall maximum value and index from the per-lane results.
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // Handle the tail not processed by the vectorized loop.
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}
#endif /*LV_HAVE_AVX2*/
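/*
 * For orientation: one call to the vector_32fc_index_max_variant0/1 helpers
 * (defined in volk/volk_avx2_intrinsics.h) updates eight running maxima at
 * once. Inferred from the helper signature, the scalar equivalent of a single
 * call is roughly:
 *
 *   for (int lane = 0; lane < 8; lane++) {
 *       float mag2 = re[lane] * re[lane] + im[lane] * im[lane];
 *       if (mag2 > max_values[lane]) {
 *           max_values[lane] = mag2;
 *           max_indices[lane] = current_indices[lane];
 *       }
 *       current_indices[lane] += 8; // indices_increment
 *   }
 */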
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_32u_a_avx2_variant_1(uint32_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /* See variant 0 for the reasoning behind this permuted index order. */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // Determine the overall maximum value and index from the per-lane results.
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // Handle the tail not processed by the vectorized loop.
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}
#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32fc_index_max_32u_a_sse3(uint32_t* target,
                                                  const lv_32fc_t* src0,
                                                  uint32_t num_points)
{
    const uint32_t num_bytes = num_points * 8;

    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    int bound = num_bytes >> 5; // number of full 4-sample (32-byte) blocks
    int i = 0;

    xmm8 = _mm_setr_epi32(0, 1, 2, 3);  // running sample indices
    xmm9 = _mm_setzero_si128();         // indices of the per-lane maxima
    xmm10 = _mm_setr_epi32(4, 4, 4, 4); // index increment per iteration
    xmm3 = _mm_setzero_ps();            // per-lane maxima so far

    for (; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)src0);
        xmm2 = _mm_load_ps((float*)&src0[2]);

        src0 += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        // horizontal add of (re^2, im^2) pairs yields |z|^2 for 4 samples
        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        // keep the new index where the lane improved, the old index otherwise
        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    if (num_bytes >> 4 & 1) { // two complex samples remain
        xmm2 = _mm_load_ps((float*)src0);

        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        src0 += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    if (num_bytes >> 3 & 1) { // one complex sample remains
        sq_dist =
            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        xmm3 = _mm_max_ss(xmm3, xmm2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    // final scalar reduction across the four lanes
    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
}
#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_index_max_32u_generic(uint32_t* target,
                                                   const lv_32fc_t* src0,
                                                   uint32_t num_points)
{
    const uint32_t num_bytes = num_points * 8;

    float sq_dist = 0.0;
    float max = 0.0;
    uint32_t index = 0;

    uint32_t i = 0;

    for (; i < (num_bytes >> 3); ++i) {
        sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) +
                  lv_cimag(src0[i]) * lv_cimag(src0[i]);

        if (sq_dist > max) {
            index = i;
            max = sq_dist;
        }
    }
    target[0] = index;
}
#endif /*LV_HAVE_GENERIC*/
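/*
 * Tie behavior of the generic kernel: the strict '>' comparison keeps the
 * first index that attains the maximum. For example, squared magnitudes
 * { 1.0, 4.0, 4.0, 2.0 } yield index 1.
 */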
#endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/


#ifndef INCLUDED_volk_32fc_index_max_32u_u_H
#define INCLUDED_volk_32fc_index_max_32u_u_H

#include <inttypes.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_32u_u_avx2_variant_0(uint32_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /* See the aligned variant 0 for the reasoning behind this index order. */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        // unaligned loads; otherwise identical to the aligned kernel
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // Determine the overall maximum value and index from the per-lane results.
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // Handle the tail not processed by the vectorized loop.
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}
#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_32u_u_avx2_variant_1(uint32_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /* See the aligned variant 0 for the reasoning behind this index order. */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // Determine the overall maximum value and index from the per-lane results.
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // Handle the tail not processed by the vectorized loop.
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}
#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <float.h>
#include <volk/volk_neon_intrinsics.h>

static inline void volk_32fc_index_max_32u_neon(uint32_t* target,
                                                const lv_32fc_t* src0,
                                                uint32_t num_points)
{
    unsigned int number = 0;
    const uint32_t quarter_points = num_points / 4;
    const lv_32fc_t* src0Ptr = src0;

    uint32_t indices[4] = { 0, 1, 2, 3 };
    const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
    uint32x4_t vec_indices = vld1q_u32(indices);
    uint32x4_t vec_max_indices = vec_indices;

    if (num_points) {
        float max = FLT_MIN;
        uint32_t index = 0;

        float32x4_t vec_max = vdupq_n_f32(FLT_MIN);

        for (; number < quarter_points; number++) {
            // load 4 complex samples deinterleaved and compute |z|^2
            const float32x4_t vec_mag2 =
                _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
            __VOLK_PREFETCH(src0Ptr += 4);
            // lanes where the new magnitude beats the running maximum
            const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max);
            vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max);
            vec_max_indices = vbslq_u32(gt_mask, vec_indices, vec_max_indices);
            vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
        }
        uint32_t tmp_max_indices[4];
        float tmp_max[4];
        vst1q_u32(tmp_max_indices, vec_max_indices);
        vst1q_f32(tmp_max, vec_max);

        for (int i = 0; i < 4; i++) {
            if (tmp_max[i] > max) {
                max = tmp_max[i];
                index = tmp_max_indices[i];
            }
        }

        // deal with the rest
        for (number = quarter_points * 4; number < num_points; number++) {
            const float re = lv_creal(*src0Ptr);
            const float im = lv_cimag(*src0Ptr);
            const float sq_dist = re * re + im * im;
            if (sq_dist > max) {
                max = sq_dist;
                index = number;
            }
            src0Ptr++;
        }
        *target = index;
    }
}
#endif /*LV_HAVE_NEON*/
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void
volk_32fc_index_max_32u_rvv(uint32_t* target, const lv_32fc_t* src0, uint32_t num_points)
{
    vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vmaxi = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, src0 += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // load complex samples as 64-bit words; narrowing shifts split re/im
        vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)src0, vl);
        vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
        vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
        // |z|^2 = re*re + im*im
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmflt(vmax, v, vl);
        vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
        vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
    }
    size_t vl = __riscv_vsetvlmax_e32m4();
    float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
                                                __riscv_vfmv_v_f_f32m1(0, 1),
                                                __riscv_vsetvlmax_e32m1()));
    vbool8_t m = __riscv_vmfeq(vmax, max, vl);
    *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl));
}
#endif /*LV_HAVE_RVV*/
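/*
 * Epilogue note for the RVV kernel above and the segmented-load variant
 * below: __riscv_vfredmax collapses the per-lane maxima into one scalar,
 * __riscv_vmfeq then marks the lanes equal to that scalar, __riscv_vfirst
 * picks the first such lane, and __riscv_vslidedown/__riscv_vmv_x extract
 * the element index that lane saved.
 */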
#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void volk_32fc_index_max_32u_rvvseg(uint32_t* target,
                                                  const lv_32fc_t* src0,
                                                  uint32_t num_points)
{
    vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vmaxi = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, src0 += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // segmented load deinterleaves the complex samples into re/im parts
        vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)src0, vl);
        vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1);
        // |z|^2 = re*re + im*im
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmflt(vmax, v, vl);
        vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
        vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
    }
    size_t vl = __riscv_vsetvlmax_e32m4();
    float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
                                                __riscv_vfmv_v_f_f32m1(0, 1),
                                                __riscv_vsetvlmax_e32m1()));
    vbool8_t m = __riscv_vmfeq(vmax, max, vl);
    *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl));
}
#endif /*LV_HAVE_RVVSEG*/

#endif /*INCLUDED_volk_32fc_index_max_32u_u_H*/