Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32fc_index_min_16u.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2021 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
62
63#ifndef INCLUDED_volk_32fc_index_min_16u_a_H
64#define INCLUDED_volk_32fc_index_min_16u_a_H
65
#include <float.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>
71
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*
 * Writes to *target the index of the complex sample in source that has the
 * smallest squared magnitude (re^2 + im^2).
 *
 * target:     receives the index of the minimum (0 when num_points is 0).
 * source:     input samples; read with aligned 256-bit loads, so the buffer
 *             must satisfy the alignment _mm256_load_ps requires.
 * num_points: number of samples; values above USHRT_MAX are clamped so the
 *             resulting index always fits in 16 bits.
 */
static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    // Eight complex samples (two 256-bit registers) per iteration.
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((const float*)source);
        __m256 in1 = _mm256_load_ps((const float*)(source + 4));
        // Helper from volk_avx2_intrinsics.h; it receives the running minima,
        // their indices and the current index vector by pointer.
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    // index < num_points <= USHRT_MAX, so the narrowing is lossless.
    *target = (uint16_t)index;
}

#endif /*LV_HAVE_AVX2*/
131
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*
 * Writes to *target the index of the complex sample in source that has the
 * smallest squared magnitude (re^2 + im^2). The vectorized loop delegates to
 * vector_32fc_index_min_variant1().
 *
 * target:     receives the index of the minimum (0 when num_points is 0).
 * source:     input samples; read with aligned 256-bit loads, so the buffer
 *             must satisfy the alignment _mm256_load_ps requires.
 * num_points: number of samples; values above USHRT_MAX are clamped so the
 *             resulting index always fits in 16 bits.
 */
static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    // Eight complex samples (two 256-bit registers) per iteration.
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((const float*)source);
        __m256 in1 = _mm256_load_ps((const float*)(source + 4));
        // Helper from volk_avx2_intrinsics.h; it receives the running minima,
        // their indices and the current index vector by pointer.
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    // index < num_points <= USHRT_MAX, so the narrowing is lossless.
    *target = (uint16_t)index;
}

#endif /*LV_HAVE_AVX2*/
191
192#ifdef LV_HAVE_SSE3
193#include <pmmintrin.h>
194#include <xmmintrin.h>
195
196static inline void volk_32fc_index_min_16u_a_sse3(uint16_t* target,
197 const lv_32fc_t* source,
198 uint32_t num_points)
199{
200 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
201
202 union bit128 holderf;
203 union bit128 holderi;
204 float sq_dist = 0.0;
205
206 union bit128 xmm5, xmm4;
207 __m128 xmm1, xmm2, xmm3;
208 __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
209
210 xmm5.int_vec = _mm_setzero_si128();
211 xmm4.int_vec = _mm_setzero_si128();
212 holderf.int_vec = _mm_setzero_si128();
213 holderi.int_vec = _mm_setzero_si128();
214
215 xmm8 = _mm_setr_epi32(0, 1, 2, 3);
216 xmm9 = _mm_setzero_si128();
217 xmm10 = _mm_setr_epi32(4, 4, 4, 4);
218 xmm3 = _mm_set_ps1(FLT_MAX);
219
220 int bound = num_points >> 2;
221
222 for (int i = 0; i < bound; ++i) {
223 xmm1 = _mm_load_ps((float*)source);
224 xmm2 = _mm_load_ps((float*)&source[2]);
225
226 source += 4;
227
228 xmm1 = _mm_mul_ps(xmm1, xmm1);
229 xmm2 = _mm_mul_ps(xmm2, xmm2);
230
231 xmm1 = _mm_hadd_ps(xmm1, xmm2);
232
233 xmm3 = _mm_min_ps(xmm1, xmm3);
234
235 xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
236 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
237
238 xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
239 xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
240
241 xmm9 = _mm_add_epi32(xmm11, xmm12);
242
243 xmm8 = _mm_add_epi32(xmm8, xmm10);
244 }
245
246 if (num_points >> 1 & 1) {
247 xmm2 = _mm_load_ps((float*)source);
248
249 xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
250 xmm8 = bit128_p(&xmm1)->int_vec;
251
252 xmm2 = _mm_mul_ps(xmm2, xmm2);
253
254 source += 2;
255
256 xmm1 = _mm_hadd_ps(xmm2, xmm2);
257
258 xmm3 = _mm_min_ps(xmm1, xmm3);
259
260 xmm10 = _mm_setr_epi32(2, 2, 2, 2);
261
262 xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
263 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
264
265 xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
266 xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
267
268 xmm9 = _mm_add_epi32(xmm11, xmm12);
269
270 xmm8 = _mm_add_epi32(xmm8, xmm10);
271 }
272
273 if (num_points & 1) {
274 sq_dist = lv_creal(source[0]) * lv_creal(source[0]) +
275 lv_cimag(source[0]) * lv_cimag(source[0]);
276
277 xmm2 = _mm_load1_ps(&sq_dist);
278
279 xmm1 = xmm3;
280
281 xmm3 = _mm_min_ss(xmm3, xmm2);
282
283 xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
284 xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
285
286 xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
287
288 xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
289 xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
290
291 xmm9 = _mm_add_epi32(xmm11, xmm12);
292 }
293
294 _mm_store_ps((float*)&(holderf.f), xmm3);
295 _mm_store_si128(&(holderi.int_vec), xmm9);
296
297 target[0] = holderi.i[0];
298 sq_dist = holderf.f[0];
299 target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
300 sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
301 target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
302 sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
303 target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
304 sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
305}
306
307#endif /*LV_HAVE_SSE3*/
308
309#ifdef LV_HAVE_GENERIC
310static inline void volk_32fc_index_min_16u_generic(uint16_t* target,
311 const lv_32fc_t* source,
312 uint32_t num_points)
313{
314 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
315
316 float sq_dist = 0.0;
317 float min = FLT_MAX;
318 uint16_t index = 0;
319
320 for (uint32_t i = 0; i < num_points; ++i) {
321 sq_dist = lv_creal(source[i]) * lv_creal(source[i]) +
322 lv_cimag(source[i]) * lv_cimag(source[i]);
323
324 if (sq_dist < min) {
325 index = i;
326 min = sq_dist;
327 }
328 }
329 target[0] = index;
330}
331
332#endif /*LV_HAVE_GENERIC*/
333
334#endif /*INCLUDED_volk_32fc_index_min_16u_a_H*/
335
336#ifndef INCLUDED_volk_32fc_index_min_16u_u_H
337#define INCLUDED_volk_32fc_index_min_16u_u_H
338
339#include <inttypes.h>
340#include <limits.h>
341#include <stdio.h>
342#include <volk/volk_common.h>
343#include <volk/volk_complex.h>
344
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*
 * Writes to *target the index of the complex sample in source that has the
 * smallest squared magnitude (re^2 + im^2).
 *
 * target:     receives the index of the minimum (0 when num_points is 0).
 * source:     input samples; read with unaligned 256-bit loads.
 * num_points: number of samples; values above USHRT_MAX are clamped so the
 *             resulting index always fits in 16 bits.
 */
static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    // Eight complex samples (two 256-bit registers) per iteration.
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((const float*)source);
        __m256 in1 = _mm256_loadu_ps((const float*)(source + 4));
        // Helper from volk_avx2_intrinsics.h; it receives the running minima,
        // their indices and the current index vector by pointer.
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    // index < num_points <= USHRT_MAX, so the narrowing is lossless.
    *target = (uint16_t)index;
}

#endif /*LV_HAVE_AVX2*/
404
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*
 * Writes to *target the index of the complex sample in source that has the
 * smallest squared magnitude (re^2 + im^2). The vectorized loop delegates to
 * vector_32fc_index_min_variant1().
 *
 * target:     receives the index of the minimum (0 when num_points is 0).
 * source:     input samples; read with unaligned 256-bit loads.
 * num_points: number of samples; values above USHRT_MAX are clamped so the
 *             resulting index always fits in 16 bits.
 */
static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    // Eight complex samples (two 256-bit registers) per iteration.
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((const float*)source);
        __m256 in1 = _mm256_loadu_ps((const float*)(source + 4));
        // Helper from volk_avx2_intrinsics.h; it receives the running minima,
        // their indices and the current index vector by pointer.
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    // index < num_points <= USHRT_MAX, so the narrowing is lossless.
    *target = (uint16_t)index;
}

#endif /*LV_HAVE_AVX2*/
464
465#ifdef LV_HAVE_RVV
466#include <float.h>
467#include <riscv_vector.h>
468
469static inline void volk_32fc_index_min_16u_rvv(uint16_t* target,
470 const lv_32fc_t* source,
471 uint32_t num_points)
472{
473 vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
474 vuint16m2_t vmini = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
475 vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
476 size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
477 for (size_t vl; n > 0; n -= vl, source += vl) {
478 vl = __riscv_vsetvl_e32m4(n);
479 vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)source, vl);
480 vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
481 vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
482 vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
483 vbool8_t m = __riscv_vmfgt(vmin, v, vl);
484 vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
485 vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
486 vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4());
487 }
488 size_t vl = __riscv_vsetvlmax_e32m4();
489 float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
490 __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
491 __riscv_vsetvlmax_e32m1()));
492 vbool8_t m = __riscv_vmfeq(vmin, min, vl);
493 *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl));
494}
495#endif /*LV_HAVE_RVV*/
496
497#ifdef LV_HAVE_RVVSEG
498#include <float.h>
499#include <riscv_vector.h>
500
501static inline void volk_32fc_index_min_16u_rvvseg(uint16_t* target,
502 const lv_32fc_t* source,
503 uint32_t num_points)
504{
505 vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
506 vuint16m2_t vmini = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
507 vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
508 size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
509 for (size_t vl; n > 0; n -= vl, source += vl) {
510 vl = __riscv_vsetvl_e32m4(n);
511 vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)source, vl);
512 vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1);
513 vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
514 vbool8_t m = __riscv_vmfgt(vmin, v, vl);
515 vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
516 vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
517 vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4());
518 }
519 size_t vl = __riscv_vsetvlmax_e32m4();
520 float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
521 __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
522 __riscv_vsetvlmax_e32m1()));
523 vbool8_t m = __riscv_vmfeq(vmin, min, vl);
524 *target = __riscv_vmv_x(__riscv_vslidedown(vmini, __riscv_vfirst(m, vl), vl));
525}
526#endif /*LV_HAVE_RVVSEG*/
527
528#endif /*INCLUDED_volk_32fc_index_min_16u_u_H*/
Definition volk_common.h:116
float f[4]
Definition volk_common.h:120
__m128i int_vec
Definition volk_common.h:128
uint32_t i[4]
Definition volk_common.h:119
__m128 float_vec
Definition volk_common.h:124
static void volk_32fc_index_min_16u_generic(uint16_t *target, const lv_32fc_t *source, uint32_t num_points)
Definition volk_32fc_index_min_16u.h:310
static void volk_32fc_index_min_16u_a_sse3(uint16_t *target, const lv_32fc_t *source, uint32_t num_points)
Definition volk_32fc_index_min_16u.h:196
static void vector_32fc_index_min_variant0(__m256 in0, __m256 in1, __m256 *min_values, __m256i *min_indices, __m256i *current_indices, __m256i indices_increment)
Definition volk_avx2_intrinsics.h:253
static void vector_32fc_index_min_variant1(__m256 in0, __m256 in1, __m256 *min_values, __m256i *min_indices, __m256i *current_indices, __m256i indices_increment)
Definition volk_avx2_intrinsics.h:315
#define bit128_p(x)
Definition volk_common.h:147
#define __VOLK_ATTR_ALIGNED(x)
Definition volk_common.h:62
#define lv_cimag(x)
Definition volk_complex.h:98
#define lv_creal(x)
Definition volk_complex.h:96
float complex lv_32fc_t
Definition volk_complex.h:74
for i
Definition volk_config_fixed.tmpl.h:13
#define RISCV_SHRINK4(op, T, S, v)
Definition volk_rvv_intrinsics.h:24