Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
volk_32fc_index_min_32u.h
/* -*- c++ -*- */
/*
 * Copyright 2021 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

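/*!
 * \b Overview
 *
 * A minimal documentation sketch, assuming the usual VOLK conventions for
 * this kernel family (the full Doxygen block is omitted from this listing):
 * the kernel returns the index of the sample with the smallest magnitude,
 * i.e. argmin over |x_i|, for a vector of complex floats.
 *
 * <b>Dispatcher Prototype</b>
 * \code
 * void volk_32fc_index_min_32u(uint32_t* target, const lv_32fc_t* source, uint32_t num_points)
 * \endcode
 *
 * \b Inputs
 * \li source: the complex input vector.
 * \li num_points: the number of samples.
 *
 * \b Outputs
 * \li target: the index of the sample with minimum magnitude.
 *
 * \b Example
 * Illustrative usage (fills a buffer whose magnitudes decrease, so the last
 * sample has the smallest magnitude):
 * \code
 *   int N = 10;
 *   uint32_t alignment = volk_get_alignment();
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   uint32_t* min_index = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
 *
 *   for (int ii = 0; ii < N; ++ii) {
 *       in[ii] = lv_cmake((float)(N - ii), (float)(N - ii));
 *   }
 *
 *   volk_32fc_index_min_32u(min_index, in, N);
 *   printf("index of min magnitude sample = %u\n", *min_index);
 *
 *   volk_free(in);
 *   volk_free(min_index);
 * \endcode
 */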

#ifndef INCLUDED_volk_32fc_index_min_32u_a_H
#define INCLUDED_volk_32fc_index_min_32u_a_H

#include <float.h> /* FLT_MAX */
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. The reason for the odd ordering
     * is explained in the implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
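    /*
     * Illustrative note, assuming the usual hadd-based helper in
     * volk_avx2_intrinsics.h: _mm256_set_epi32 lists elements from most to
     * least significant, so the initializer above corresponds to the element
     * order {0, 1, 4, 5, 2, 3, 6, 7}. That matches the per-128-bit-lane
     * pairing of _mm256_hadd_ps, which presumably forms the squared
     * magnitudes: lane 0 then holds samples 0, 1, 4, 5 and lane 1 holds
     * samples 2, 3, 6, 7.
     */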

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle the tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. The reason for the odd ordering
     * is explained in the implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle the tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32fc_index_min_32u_a_sse3(uint32_t* target,
                                                  const lv_32fc_t* source,
                                                  uint32_t num_points)
{
    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    xmm8 = _mm_setr_epi32(0, 1, 2, 3);  // indices of the current four samples
    xmm9 = _mm_setzero_si128();         // indices of the per-lane minima
    xmm10 = _mm_setr_epi32(4, 4, 4, 4); // index increment per iteration
    xmm3 = _mm_set_ps1(FLT_MAX);        // per-lane running minima

    int bound = num_points >> 2;

    for (int i = 0; i < bound; ++i) {
        // squared magnitudes of four complex samples
        xmm1 = _mm_load_ps((float*)source);
        xmm2 = _mm_load_ps((float*)&source[2]);

        source += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        // update the running minima and select the corresponding indices
        xmm3 = _mm_min_ps(xmm1, xmm3);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3); // lanes that keep the old minimum
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); // lanes that take the new value

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // two remaining complex samples
    if (num_points >> 1 & 1) {
        xmm2 = _mm_load_ps((float*)source);

        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        source += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_min_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // one remaining complex sample
    if (num_points & 1) {
        sq_dist = lv_creal(source[0]) * lv_creal(source[0]) +
                  lv_cimag(source[0]) * lv_cimag(source[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        xmm3 = _mm_min_ss(xmm3, xmm2);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    // horizontal reduction of the four per-lane minima
    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_index_min_32u_generic(uint32_t* target,
                                                   const lv_32fc_t* source,
                                                   uint32_t num_points)
{
    float sq_dist = 0.0;
    float min = FLT_MAX;
    uint32_t index = 0;

    for (uint32_t i = 0; i < num_points; ++i) {
        sq_dist = lv_creal(source[i]) * lv_creal(source[i]) +
                  lv_cimag(source[i]) * lv_cimag(source[i]);

        if (sq_dist < min) {
            index = i;
            min = sq_dist;
        }
    }
    target[0] = index;
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_32fc_index_min_32u_a_H*/

#ifndef INCLUDED_volk_32fc_index_min_32u_u_H
#define INCLUDED_volk_32fc_index_min_32u_u_H

#include <float.h> /* FLT_MAX */
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. The reason for the odd ordering
     * is explained in the implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle the tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. The reason for the odd ordering
     * is explained in the implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle the tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>

static inline void volk_32fc_index_min_32u_neon(uint32_t* target,
                                                const lv_32fc_t* source,
                                                uint32_t num_points)
{
    const uint32_t quarter_points = num_points / 4;
    const lv_32fc_t* sourcePtr = source;

    uint32_t indices[4] = { 0, 1, 2, 3 };
    const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
    uint32x4_t vec_indices = vld1q_u32(indices);
    uint32x4_t vec_min_indices = vec_indices;

    if (num_points) {
        float min = FLT_MAX;
        uint32_t index = 0;

        float32x4_t vec_min = vdupq_n_f32(FLT_MAX);

        for (uint32_t number = 0; number < quarter_points; number++) {
            // Load complex and compute magnitude squared
            const float32x4_t vec_mag2 =
                _vmagnitudesquaredq_f32(vld2q_f32((float*)sourcePtr));
            __VOLK_PREFETCH(sourcePtr += 4);
            // a < b?
            const uint32x4_t lt_mask = vcltq_f32(vec_mag2, vec_min);
            vec_min = vbslq_f32(lt_mask, vec_mag2, vec_min);
            vec_min_indices = vbslq_u32(lt_mask, vec_indices, vec_min_indices);
            vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
        }
        uint32_t tmp_min_indices[4];
        float tmp_min[4];
        vst1q_u32(tmp_min_indices, vec_min_indices);
        vst1q_f32(tmp_min, vec_min);

        for (int i = 0; i < 4; i++) {
            if (tmp_min[i] < min) {
                min = tmp_min[i];
                index = tmp_min_indices[i];
            }
        }

        // Deal with the rest
        for (uint32_t number = quarter_points * 4; number < num_points; number++) {
            const float re = lv_creal(*sourcePtr);
            const float im = lv_cimag(*sourcePtr);
            const float sq_dist = re * re + im * im;
            if (sq_dist < min) {
                min = sq_dist;
                index = number;
            }
            sourcePtr++;
        }
        *target = index;
    }
}

#endif /*LV_HAVE_NEON*/

#ifdef LV_HAVE_RVV
#include <float.h>
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h> /* RISCV_SHRINK4 */

static inline void volk_32fc_index_min_32u_rvv(uint32_t* target,
                                               const lv_32fc_t* source,
                                               uint32_t num_points)
{
    vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vmini = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, source += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // load complex samples as 64-bit words and split into re/im halves
        vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)source, vl);
        vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
        vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
        // squared magnitude, then update the running minima and their indices
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmfgt(vmin, v, vl);
        vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
        vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
    }
    // reduce the per-lane minima to a scalar and pick the first matching index
    size_t vl = __riscv_vsetvlmax_e32m4();
    float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
                                                __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
                                                __riscv_vsetvlmax_e32m1()));
    vbool8_t m = __riscv_vmfeq(vmin, min, vl);
    *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl));
}
#endif /*LV_HAVE_RVV*/

#ifdef LV_HAVE_RVVSEG
#include <float.h>
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h> /* RISCV_SHRINK4 */

static inline void volk_32fc_index_min_32u_rvvseg(uint32_t* target,
                                                  const lv_32fc_t* source,
                                                  uint32_t num_points)
{
    vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vmini = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, source += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // segmented load deinterleaves real and imaginary parts directly
        vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)source, vl);
        vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1);
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmfgt(vmin, v, vl);
        vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
        vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
    }
    // reduce the per-lane minima to a scalar and pick the first matching index
    size_t vl = __riscv_vsetvlmax_e32m4();
    float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
                                                __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
                                                __riscv_vsetvlmax_e32m1()));
    vbool8_t m = __riscv_vmfeq(vmin, min, vl);
    *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl));
}
#endif /*LV_HAVE_RVVSEG*/

#endif /*INCLUDED_volk_32fc_index_min_32u_u_H*/