Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
volk_32fc_index_max_16u.h
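This header implements volk_32fc_index_max_16u: given a vector of complex floats (lv_32fc_t), it returns the index of the sample with the largest magnitude squared, written to a uint16_t output. The aligned-path header guard provides AVX2 (two variants), SSE3, and generic C implementations; the unaligned-path guard provides AVX2 (two variants) and RISC-V Vector (RVV and segment-load) implementations.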
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014-2016, 2018-2020 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */


#ifndef INCLUDED_volk_32fc_index_max_16u_a_H
#define INCLUDED_volk_32fc_index_max_16u_a_H

#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_a_avx2_variant_0(uint16_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
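    // The output index is a uint16_t, so only the first USHRT_MAX points are searched.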

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32fc_index_max_16u_a_sse3(uint16_t* target,
                                                  const lv_32fc_t* src0,
                                                  uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    const uint32_t num_bytes = num_points * 8;

    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    int bound = num_bytes >> 5;
    int i = 0;

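    // xmm8 tracks the indices of the four samples handled in the current iteration,
    // xmm10 is the per-iteration index increment, xmm9 holds the index of the running
    // maximum in each lane, and xmm3 holds the running per-lane maximum of |z|^2.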
    xmm8 = _mm_setr_epi32(0, 1, 2, 3);
    xmm9 = _mm_setzero_si128();
    xmm10 = _mm_setr_epi32(4, 4, 4, 4);
    xmm3 = _mm_setzero_ps();

    for (; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)src0);
        xmm2 = _mm_load_ps((float*)&src0[2]);

        src0 += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

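    // If a pair of complex samples (16 bytes) remains, process it with the same
    // per-lane bookkeeping, duplicating the two magnitudes across the upper lanes.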
    if (num_bytes >> 4 & 1) {
        xmm2 = _mm_load_ps((float*)src0);

        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        src0 += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

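    // If a single complex sample remains, compare its squared magnitude against lane 0 only.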
    if (num_bytes >> 3 & 1) {
        sq_dist =
            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        xmm3 = _mm_max_ss(xmm3, xmm2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

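    // Reduce the four per-lane maxima and their indices to the final result.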
    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_index_max_16u_generic(uint16_t* target,
                                                   const lv_32fc_t* src0,
                                                   uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const uint32_t num_bytes = num_points * 8;

    float sq_dist = 0.0;
    float max = 0.0;
    uint16_t index = 0;

    uint32_t i = 0;

    for (; i < (num_bytes >> 3); ++i) {
        sq_dist =
            lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);

        if (sq_dist > max) {
            index = i;
            max = sq_dist;
        }
    }
    target[0] = index;
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/

#ifndef INCLUDED_volk_32fc_index_max_16u_u_H
#define INCLUDED_volk_32fc_index_max_16u_u_H

#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_u_avx2_variant_0(uint16_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_RVV
#include <float.h>
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void
volk_32fc_index_max_16u_rvv(uint16_t* target, const lv_32fc_t* src0, uint32_t num_points)
{
    vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vuint16m2_t vmaxi = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
    vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
    size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
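    // Each iteration loads complex samples as 64-bit words, splits them into real and
    // imaginary parts with narrowing shifts, and keeps the per-element running maximum
    // of |z|^2 in vmax and the corresponding 16-bit sample index in vmaxi.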
    for (size_t vl; n > 0; n -= vl, src0 += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)src0, vl);
        vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
        vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmflt(vmax, v, vl);
        vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
        vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m2());
    }
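    // Reduce the per-element maxima to a scalar and report the stored index of the
    // first element that attains it.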
    size_t vl = __riscv_vsetvlmax_e32m4();
    float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
                                                __riscv_vfmv_v_f_f32m1(0, 1),
                                                __riscv_vsetvlmax_e32m1()));
    vbool8_t m = __riscv_vmfeq(vmax, max, vl);
    *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl));
}
#endif /*LV_HAVE_RVV*/

#ifdef LV_HAVE_RVVSEG
#include <float.h>
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void volk_32fc_index_max_16u_rvvseg(uint16_t* target,
                                                  const lv_32fc_t* src0,
                                                  uint32_t num_points)
{
    vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vuint16m2_t vmaxi = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
    vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
    size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
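    // Same bookkeeping as the RVV version, but segment loads deinterleave the real and
    // imaginary parts directly instead of using 64-bit loads with narrowing shifts.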
    for (size_t vl; n > 0; n -= vl, src0 += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)src0, vl);
        vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1);
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmflt(vmax, v, vl);
        vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
        vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m2());
    }
    size_t vl = __riscv_vsetvlmax_e32m4();
    float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
                                                __riscv_vfmv_v_f_f32m1(0, 1),
                                                __riscv_vsetvlmax_e32m1()));
    vbool8_t m = __riscv_vmfeq(vmax, max, vl);
    *target = __riscv_vmv_x(__riscv_vslidedown(vmaxi, __riscv_vfirst(m, vl), vl));
}
#endif /*LV_HAVE_RVVSEG*/

#endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/
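
A minimal usage sketch (not part of the header itself), assuming the generated dispatcher volk_32fc_index_max_16u() and the volk_malloc()/volk_get_alignment()/volk_free() helpers from the VOLK runtime, plus lv_cmake() from volk_complex.h:

#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const uint32_t num_points = 1024;
    // Aligned allocation so the aligned (_a_) implementations may be dispatched.
    lv_32fc_t* in =
        (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t), volk_get_alignment());
    for (uint32_t i = 0; i < num_points; ++i) {
        in[i] = lv_cmake((float)i * 0.01f, 0.0f); // magnitude grows with the index
    }

    uint16_t max_index = 0;
    volk_32fc_index_max_16u(&max_index, in, num_points);
    printf("largest magnitude at index %u\n", (unsigned)max_index);

    volk_free(in);
    return 0;
}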