60#ifndef INCLUDED_volk_32f_s32f_x2_convert_8u_u_H
61#define INCLUDED_volk_32f_s32f_x2_convert_8u_u_H
67 const float min_val = 0.0f;
68 const float max_val = UINT8_MAX;
70 *out = (uint8_t)(max_val);
71 }
else if (in < min_val) {
72 *out = (uint8_t)(min_val);
74 *out = (uint8_t)(
rintf(in));
/* Portable scalar kernel (presumably volk_32f_s32f_x2_convert_8u_generic,
 * going by the cross-reference list at the end of this listing).
 * NOTE(review): this listing omits several original lines (the first
 * signature line, the scale/bias parameters, braces and the per-sample
 * store); only the visible lines are documented here. */
82 const float* inputVector,
85 unsigned int num_points)
/* Read cursor over the input samples. */
87 const float* inputVectorPtr = inputVector;
/* One iteration per input sample. */
89 for (
unsigned int number = 0; number < num_points; number++) {
/* Scale and offset the next sample; the cursor advances by one float. */
90 const float r = *inputVectorPtr++ * scale + bias;
98#if LV_HAVE_AVX2 && LV_HAVE_FMA
/* Unaligned AVX2+FMA kernel: computes in * scale + bias, clamps to
 * [0, UINT8_MAX] and converts to uint8_t, 32 samples per vector iteration,
 * with a scalar loop for the remaining num_points % 32 samples.
 * NOTE(review): this listing omits some original lines (scale/bias
 * parameters, braces, and the statements that advance inputVectorPtr
 * between loads); confirm against the original file. */
101static inline void volk_32f_s32f_x2_convert_8u_u_avx2_fma(uint8_t* outputVector,
102 const float* inputVector,
105 unsigned int num_points)
/* Number of full 32-sample groups handled by the vector loop. */
107 const unsigned int thirtysecondPoints = num_points / 32;
109 const float* inputVectorPtr = (
const float*)inputVector;
110 uint8_t* outputVectorPtr = outputVector;
/* Saturation bounds of the uint8_t output, broadcast to all lanes. */
112 const float min_val = 0.0f;
113 const float max_val = UINT8_MAX;
114 const __m256 vmin_val = _mm256_set1_ps(min_val);
115 const __m256 vmax_val = _mm256_set1_ps(max_val);
/* Broadcast scale and bias so they can be applied lane-wise. */
117 const __m256 vScale = _mm256_set1_ps(scale);
118 const __m256 vBias = _mm256_set1_ps(bias);
120 for (
unsigned int number = 0; number < thirtysecondPoints; number++) {
/* Four unaligned loads of 8 floats each.
 * NOTE(review): the pointer advance after each load is not visible in
 * this listing — presumably inputVectorPtr += 8; verify. */
121 __m256 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
123 __m256 inputVal2 = _mm256_loadu_ps(inputVectorPtr);
125 __m256 inputVal3 = _mm256_loadu_ps(inputVectorPtr);
127 __m256 inputVal4 = _mm256_loadu_ps(inputVectorPtr);
/* Fused multiply-add (x * vScale + vBias), then clamp: min against
 * vmax_val first, max against vmin_val second. */
130 inputVal1 = _mm256_max_ps(
131 _mm256_min_ps(_mm256_fmadd_ps(inputVal1, vScale, vBias), vmax_val), vmin_val);
132 inputVal2 = _mm256_max_ps(
133 _mm256_min_ps(_mm256_fmadd_ps(inputVal2, vScale, vBias), vmax_val), vmin_val);
134 inputVal3 = _mm256_max_ps(
135 _mm256_min_ps(_mm256_fmadd_ps(inputVal3, vScale, vBias), vmax_val), vmin_val);
136 inputVal4 = _mm256_max_ps(
137 _mm256_min_ps(_mm256_fmadd_ps(inputVal4, vScale, vBias), vmax_val), vmin_val);
/* Convert the clamped floats to packed 32-bit integers. */
139 __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
140 __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
141 __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
142 __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);
/* Narrow 32 -> 16 bit. The AVX2 pack works per 128-bit lane and thus
 * interleaves its two sources; the permute (64-bit quarter order 0,2,1,3)
 * restores sequential sample order. */
144 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
145 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
146 intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
147 intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
/* Narrow 16 -> 8 bit with unsigned saturation, then fix lane order again. */
149 intInputVal1 = _mm256_packus_epi16(intInputVal1, intInputVal3);
150 const __m256i intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
/* Unaligned store of the 32 output bytes. */
152 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
153 outputVectorPtr += 32;
/* Scalar tail: samples left over after the last full group of 32. */
156 for (
unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
157 const float r = inputVector[number] * scale + bias;
166#include <immintrin.h>
/* Unaligned AVX2 kernel (no FMA): same processing as the FMA variant but
 * with a separate multiply and add; 32 samples per vector iteration and a
 * scalar loop for the remaining num_points % 32 samples.
 * NOTE(review): this listing omits some original lines (scale/bias
 * parameters, braces, input-pointer advances and the tails of the clamp
 * expressions); confirm against the original file. */
168static inline void volk_32f_s32f_x2_convert_8u_u_avx2(uint8_t* outputVector,
169 const float* inputVector,
172 unsigned int num_points)
/* Number of full 32-sample groups handled by the vector loop. */
174 const unsigned int thirtysecondPoints = num_points / 32;
176 const float* inputVectorPtr = (
const float*)inputVector;
177 uint8_t* outputVectorPtr = outputVector;
/* Saturation bounds of the uint8_t output, broadcast to all lanes. */
179 const float min_val = 0.0f;
180 const float max_val = UINT8_MAX;
181 const __m256 vmin_val = _mm256_set1_ps(min_val);
182 const __m256 vmax_val = _mm256_set1_ps(max_val);
/* Broadcast scale and bias so they can be applied lane-wise. */
184 const __m256 vScale = _mm256_set1_ps(scale);
185 const __m256 vBias = _mm256_set1_ps(bias);
187 for (
unsigned int number = 0; number < thirtysecondPoints; number++) {
/* Four unaligned loads of 8 floats each.
 * NOTE(review): the pointer advance after each load is not visible in
 * this listing — presumably inputVectorPtr += 8; verify. */
188 __m256 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
190 __m256 inputVal2 = _mm256_loadu_ps(inputVectorPtr);
192 __m256 inputVal3 = _mm256_loadu_ps(inputVectorPtr);
194 __m256 inputVal4 = _mm256_loadu_ps(inputVectorPtr);
/* x * vScale + vBias followed by a min/max clamp.
 * NOTE(review): the closing operands of each expression are cut off in
 * this listing — presumably vmax_val for the min and vmin_val for the
 * max, as in the sibling kernels. */
197 inputVal1 = _mm256_max_ps(
198 _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal1, vScale), vBias),
201 inputVal2 = _mm256_max_ps(
202 _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal2, vScale), vBias),
205 inputVal3 = _mm256_max_ps(
206 _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal3, vScale), vBias),
209 inputVal4 = _mm256_max_ps(
210 _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal4, vScale), vBias),
/* Convert the floats to packed 32-bit integers. */
214 __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
215 __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
216 __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
217 __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);
/* Narrow 32 -> 16 bit. The AVX2 pack works per 128-bit lane and thus
 * interleaves its two sources; the permute (64-bit quarter order 0,2,1,3)
 * restores sequential sample order. */
219 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
220 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
221 intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
222 intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
/* Narrow 16 -> 8 bit with unsigned saturation, then fix lane order again. */
224 intInputVal1 = _mm256_packus_epi16(intInputVal1, intInputVal3);
225 const __m256i intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
/* Unaligned store of the 32 output bytes. */
227 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
228 outputVectorPtr += 32;
/* Scalar tail: samples left over after the last full group of 32. */
231 for (
unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
232 float r = inputVector[number] * scale + bias;
241#include <emmintrin.h>
/* Unaligned SSE2 kernel (presumably volk_32f_s32f_x2_convert_8u_u_sse2,
 * going by the cross-reference list at the end of this listing): scales,
 * offsets, clamps and narrows 16 samples per vector iteration, with a
 * scalar loop for the remaining num_points % 16 samples.
 * NOTE(review): this listing omits some original lines (first signature
 * line, scale/bias parameters, braces, input-pointer advances and the last
 * operand of each clamp); confirm against the original file. */
244 const float* inputVector,
247 unsigned int num_points)
/* Number of full 16-sample groups handled by the vector loop. */
249 const unsigned int sixteenthPoints = num_points / 16;
251 const float* inputVectorPtr = (
const float*)inputVector;
252 uint8_t* outputVectorPtr = outputVector;
/* Saturation bounds of the uint8_t output, broadcast to all lanes. */
254 const float min_val = 0.0f;
255 const float max_val = UINT8_MAX;
256 const __m128 vmin_val = _mm_set_ps1(min_val);
257 const __m128 vmax_val = _mm_set_ps1(max_val);
/* Broadcast scale and bias so they can be applied lane-wise. */
259 const __m128 vScale = _mm_set_ps1(scale);
260 const __m128 vBias = _mm_set_ps1(bias);
262 for (
unsigned int number = 0; number < sixteenthPoints; number++) {
/* Four unaligned loads of 4 floats each.
 * NOTE(review): the pointer advance after each load is not visible in
 * this listing — presumably inputVectorPtr += 4; verify. */
263 __m128 inputVal1 = _mm_loadu_ps(inputVectorPtr);
265 __m128 inputVal2 = _mm_loadu_ps(inputVectorPtr);
267 __m128 inputVal3 = _mm_loadu_ps(inputVectorPtr);
269 __m128 inputVal4 = _mm_loadu_ps(inputVectorPtr);
/* x * vScale + vBias, min against vmax_val, then an outer max whose
 * second operand is cut off here (presumably vmin_val). */
272 inputVal1 = _mm_max_ps(
273 _mm_min_ps(_mm_add_ps(_mm_mul_ps(inputVal1, vScale), vBias), vmax_val),
275 inputVal2 = _mm_max_ps(
276 _mm_min_ps(_mm_add_ps(_mm_mul_ps(inputVal2, vScale), vBias), vmax_val),
278 inputVal3 = _mm_max_ps(
279 _mm_min_ps(_mm_add_ps(_mm_mul_ps(inputVal3, vScale), vBias), vmax_val),
281 inputVal4 = _mm_max_ps(
282 _mm_min_ps(_mm_add_ps(_mm_mul_ps(inputVal4, vScale), vBias), vmax_val),
/* Convert the floats to packed 32-bit integers. */
285 __m128i intInputVal1 = _mm_cvtps_epi32(inputVal1);
286 __m128i intInputVal2 = _mm_cvtps_epi32(inputVal2);
287 __m128i intInputVal3 = _mm_cvtps_epi32(inputVal3);
288 __m128i intInputVal4 = _mm_cvtps_epi32(inputVal4);
/* Narrow 32 -> 16 bit (signed saturation). */
290 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
291 intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
/* Narrow 16 -> 8 bit (unsigned saturation). */
293 intInputVal1 = _mm_packus_epi16(intInputVal1, intInputVal3);
/* Unaligned store of the 16 output bytes. */
295 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
296 outputVectorPtr += 16;
/* Scalar tail: samples left over after the last full group of 16. */
299 for (
unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
300 const float r = inputVector[number] * scale + bias;
309#include <xmmintrin.h>
/* Unaligned SSE kernel (presumably volk_32f_s32f_x2_convert_8u_u_sse,
 * going by the cross-reference list at the end of this listing): the
 * scale/offset/clamp is vectorised 4 samples at a time, the float -> uint8_t
 * conversion is done per element in scalar code.
 * NOTE(review): this listing omits some original lines (first signature
 * line, scale/bias parameters, the declaration of outputFloatBuffer,
 * braces, the input-pointer advance and the last clamp operand). */
312 const float* inputVector,
315 unsigned int num_points)
/* Number of full 4-sample groups handled by the vector loop. */
317 const unsigned int quarterPoints = num_points / 4;
319 const float* inputVectorPtr = (
const float*)inputVector;
320 uint8_t* outputVectorPtr = outputVector;
/* Saturation bounds of the uint8_t output, broadcast to all lanes. */
322 const float min_val = 0.0f;
323 const float max_val = UINT8_MAX;
324 const __m128 vmin_val = _mm_set_ps1(min_val);
325 const __m128 vmax_val = _mm_set_ps1(max_val);
/* Broadcast scale and bias so they can be applied lane-wise. */
327 const __m128 vScale = _mm_set_ps1(scale);
328 const __m128 vBias = _mm_set_ps1(bias);
332 for (
unsigned int number = 0; number < quarterPoints; number++) {
/* Unaligned load of 4 floats. */
333 __m128 ret = _mm_loadu_ps(inputVectorPtr);
/* ret * vScale + vBias, min against vmax_val, then an outer max whose
 * second operand is cut off here (presumably vmin_val). */
336 ret = _mm_max_ps(_mm_min_ps(_mm_add_ps(_mm_mul_ps(ret, vScale), vBias), vmax_val),
/* Spill the 4 results with an aligned store; outputFloatBuffer is
 * declared on a line not shown here and must be 16-byte aligned. */
339 _mm_store_ps(outputFloatBuffer, ret);
/* Round each spilled value and write it out as one byte. */
340 for (
size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
341 *outputVectorPtr++ = (uint8_t)(
rintf(outputFloatBuffer[inner_loop]));
/* Scalar tail: samples left over after the last full group of 4. */
345 for (
unsigned int number = quarterPoints * 4; number < num_points; number++) {
346 const float r = inputVector[number] * scale + bias;
355#ifndef INCLUDED_volk_32f_s32f_x2_convert_8u_a_H
356#define INCLUDED_volk_32f_s32f_x2_convert_8u_a_H
361#if LV_HAVE_AVX2 && LV_HAVE_FMA
362#include <immintrin.h>
/* Aligned AVX2+FMA kernel: computes in * scale + bias, clamps to
 * [0, UINT8_MAX] and converts to uint8_t, 32 samples per vector iteration,
 * with a scalar loop for the remaining num_points % 32 samples.
 * Uses aligned loads/stores, so inputVector and outputVector must be
 * 32-byte aligned.
 * NOTE(review): this listing omits some original lines (scale/bias
 * parameters, braces, and the statements that advance inputVectorPtr
 * between loads); confirm against the original file. */
364static inline void volk_32f_s32f_x2_convert_8u_a_avx2_fma(uint8_t* outputVector,
365 const float* inputVector,
368 unsigned int num_points)
/* Number of full 32-sample groups handled by the vector loop. */
370 const unsigned int thirtysecondPoints = num_points / 32;
372 const float* inputVectorPtr = (
const float*)inputVector;
373 uint8_t* outputVectorPtr = outputVector;
/* Saturation bounds of the uint8_t output, broadcast to all lanes. */
375 const float min_val = 0.0f;
376 const float max_val = UINT8_MAX;
377 const __m256 vmin_val = _mm256_set1_ps(min_val);
378 const __m256 vmax_val = _mm256_set1_ps(max_val);
/* Broadcast scale and bias so they can be applied lane-wise. */
380 const __m256 vScale = _mm256_set1_ps(scale);
381 const __m256 vBias = _mm256_set1_ps(bias);
383 for (
unsigned int number = 0; number < thirtysecondPoints; number++) {
/* Four aligned loads of 8 floats each.
 * NOTE(review): the pointer advance after each load is not visible in
 * this listing — presumably inputVectorPtr += 8; verify. */
384 __m256 inputVal1 = _mm256_load_ps(inputVectorPtr);
386 __m256 inputVal2 = _mm256_load_ps(inputVectorPtr);
388 __m256 inputVal3 = _mm256_load_ps(inputVectorPtr);
390 __m256 inputVal4 = _mm256_load_ps(inputVectorPtr);
/* Fused multiply-add (x * vScale + vBias), then clamp: min against
 * vmax_val first, max against vmin_val second. */
393 inputVal1 = _mm256_max_ps(
394 _mm256_min_ps(_mm256_fmadd_ps(inputVal1, vScale, vBias), vmax_val), vmin_val);
395 inputVal2 = _mm256_max_ps(
396 _mm256_min_ps(_mm256_fmadd_ps(inputVal2, vScale, vBias), vmax_val), vmin_val);
397 inputVal3 = _mm256_max_ps(
398 _mm256_min_ps(_mm256_fmadd_ps(inputVal3, vScale, vBias), vmax_val), vmin_val);
399 inputVal4 = _mm256_max_ps(
400 _mm256_min_ps(_mm256_fmadd_ps(inputVal4, vScale, vBias), vmax_val), vmin_val);
/* Convert the clamped floats to packed 32-bit integers. */
402 __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
403 __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
404 __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
405 __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);
/* Narrow 32 -> 16 bit. The AVX2 pack works per 128-bit lane and thus
 * interleaves its two sources; the permute (64-bit quarter order 0,2,1,3)
 * restores sequential sample order. */
407 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
408 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
409 intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
410 intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
/* Narrow 16 -> 8 bit with unsigned saturation, then fix lane order again. */
412 intInputVal1 = _mm256_packus_epi16(intInputVal1, intInputVal3);
413 const __m256i intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
/* Aligned store of the 32 output bytes. */
415 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
416 outputVectorPtr += 32;
/* Scalar tail: samples left over after the last full group of 32. */
419 for (
unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
420 const float r = inputVector[number] * scale + bias;
429#include <immintrin.h>
/* Aligned AVX2 kernel (no FMA): separate multiply and add, then clamp and
 * narrow to uint8_t; 32 samples per vector iteration and a scalar loop for
 * the remaining num_points % 32 samples. Uses aligned loads/stores, so
 * inputVector and outputVector must be 32-byte aligned.
 * NOTE(review): this listing omits some original lines (scale/bias
 * parameters, braces, input-pointer advances and the tails of the clamp
 * expressions); confirm against the original file. */
431static inline void volk_32f_s32f_x2_convert_8u_a_avx2(uint8_t* outputVector,
432 const float* inputVector,
435 unsigned int num_points)
/* Number of full 32-sample groups handled by the vector loop. */
437 const unsigned int thirtysecondPoints = num_points / 32;
439 const float* inputVectorPtr = (
const float*)inputVector;
440 uint8_t* outputVectorPtr = outputVector;
/* Saturation bounds of the uint8_t output, broadcast to all lanes. */
442 const float min_val = 0.0f;
443 const float max_val = UINT8_MAX;
444 const __m256 vmin_val = _mm256_set1_ps(min_val);
445 const __m256 vmax_val = _mm256_set1_ps(max_val);
/* Broadcast scale and bias so they can be applied lane-wise. */
447 const __m256 vScale = _mm256_set1_ps(scale);
448 const __m256 vBias = _mm256_set1_ps(bias);
450 for (
unsigned int number = 0; number < thirtysecondPoints; number++) {
/* Four aligned loads of 8 floats each.
 * NOTE(review): the pointer advance after each load is not visible in
 * this listing — presumably inputVectorPtr += 8; verify. */
451 __m256 inputVal1 = _mm256_load_ps(inputVectorPtr);
453 __m256 inputVal2 = _mm256_load_ps(inputVectorPtr);
455 __m256 inputVal3 = _mm256_load_ps(inputVectorPtr);
457 __m256 inputVal4 = _mm256_load_ps(inputVectorPtr);
/* x * vScale + vBias followed by a min/max clamp.
 * NOTE(review): the closing operands of each expression are cut off in
 * this listing — presumably vmax_val for the min and vmin_val for the
 * max, as in the sibling kernels. */
460 inputVal1 = _mm256_max_ps(
461 _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal1, vScale), vBias),
464 inputVal2 = _mm256_max_ps(
465 _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal2, vScale), vBias),
468 inputVal3 = _mm256_max_ps(
469 _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal3, vScale), vBias),
472 inputVal4 = _mm256_max_ps(
473 _mm256_min_ps(_mm256_add_ps(_mm256_mul_ps(inputVal4, vScale), vBias),
/* Convert the floats to packed 32-bit integers. */
477 __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
478 __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
479 __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
480 __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);
/* Narrow 32 -> 16 bit. The AVX2 pack works per 128-bit lane and thus
 * interleaves its two sources; the permute (64-bit quarter order 0,2,1,3)
 * restores sequential sample order. */
482 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
483 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
484 intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
485 intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
/* Narrow 16 -> 8 bit with unsigned saturation, then fix lane order again. */
487 intInputVal1 = _mm256_packus_epi16(intInputVal1, intInputVal3);
488 const __m256i intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
/* Aligned store of the 32 output bytes. */
490 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
491 outputVectorPtr += 32;
/* Scalar tail: samples left over after the last full group of 32. */
494 for (
unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
495 const float r = inputVector[number] * scale + bias;
504#include <emmintrin.h>
/* Aligned SSE2 kernel (presumably volk_32f_s32f_x2_convert_8u_a_sse2,
 * going by the cross-reference list at the end of this listing): scales,
 * offsets, clamps and narrows 16 samples per vector iteration, with a
 * scalar loop for the remaining num_points % 16 samples. Uses aligned
 * loads/stores, so inputVector and outputVector must be 16-byte aligned.
 * NOTE(review): this listing omits some original lines (first signature
 * line, scale/bias parameters, braces, input-pointer advances and the last
 * operand of each clamp); confirm against the original file. */
507 const float* inputVector,
510 unsigned int num_points)
/* Number of full 16-sample groups handled by the vector loop. */
512 const unsigned int sixteenthPoints = num_points / 16;
514 const float* inputVectorPtr = (
const float*)inputVector;
515 uint8_t* outputVectorPtr = outputVector;
/* Saturation bounds of the uint8_t output, broadcast to all lanes. */
517 const float min_val = 0.0f;
518 const float max_val = UINT8_MAX;
519 const __m128 vmin_val = _mm_set_ps1(min_val);
520 const __m128 vmax_val = _mm_set_ps1(max_val);
/* Broadcast scale and bias so they can be applied lane-wise. */
522 const __m128 vScale = _mm_set_ps1(scale);
523 const __m128 vBias = _mm_set_ps1(bias);
525 for (
unsigned int number = 0; number < sixteenthPoints; number++) {
/* Four aligned loads of 4 floats each.
 * NOTE(review): the pointer advance after each load is not visible in
 * this listing — presumably inputVectorPtr += 4; verify. */
526 __m128 inputVal1 = _mm_load_ps(inputVectorPtr);
528 __m128 inputVal2 = _mm_load_ps(inputVectorPtr);
530 __m128 inputVal3 = _mm_load_ps(inputVectorPtr);
532 __m128 inputVal4 = _mm_load_ps(inputVectorPtr);
/* x * vScale + vBias, min against vmax_val, then an outer max whose
 * second operand is cut off here (presumably vmin_val). */
535 inputVal1 = _mm_max_ps(
536 _mm_min_ps(_mm_add_ps(_mm_mul_ps(inputVal1, vScale), vBias), vmax_val),
538 inputVal2 = _mm_max_ps(
539 _mm_min_ps(_mm_add_ps(_mm_mul_ps(inputVal2, vScale), vBias), vmax_val),
541 inputVal3 = _mm_max_ps(
542 _mm_min_ps(_mm_add_ps(_mm_mul_ps(inputVal3, vScale), vBias), vmax_val),
544 inputVal4 = _mm_max_ps(
545 _mm_min_ps(_mm_add_ps(_mm_mul_ps(inputVal4, vScale), vBias), vmax_val),
/* Convert the floats to packed 32-bit integers. */
548 __m128i intInputVal1 = _mm_cvtps_epi32(inputVal1);
549 __m128i intInputVal2 = _mm_cvtps_epi32(inputVal2);
550 __m128i intInputVal3 = _mm_cvtps_epi32(inputVal3);
551 __m128i intInputVal4 = _mm_cvtps_epi32(inputVal4);
/* Narrow 32 -> 16 bit (signed saturation). */
553 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
554 intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
/* Narrow 16 -> 8 bit (unsigned saturation). */
556 intInputVal1 = _mm_packus_epi16(intInputVal1, intInputVal3);
/* Aligned store of the 16 output bytes. */
558 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
559 outputVectorPtr += 16;
/* Scalar tail: samples left over after the last full group of 16. */
562 for (
unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
563 const float r = inputVector[number] * scale + bias;
571#include <xmmintrin.h>
/* Aligned SSE kernel (presumably volk_32f_s32f_x2_convert_8u_a_sse, going
 * by the cross-reference list at the end of this listing): the
 * scale/offset/clamp is vectorised 4 samples at a time, the float -> uint8_t
 * conversion is done per element in scalar code. Uses an aligned load, so
 * inputVector must be 16-byte aligned.
 * NOTE(review): this listing omits some original lines (first signature
 * line, scale/bias parameters, the declaration of outputFloatBuffer,
 * braces, the input-pointer advance and the start of the clamp
 * statement). */
574 const float* inputVector,
577 unsigned int num_points)
/* Number of full 4-sample groups handled by the vector loop. */
579 const unsigned int quarterPoints = num_points / 4;
581 const float* inputVectorPtr = (
const float*)inputVector;
582 uint8_t* outputVectorPtr = outputVector;
/* Saturation bounds of the uint8_t output, broadcast to all lanes. */
584 const float min_val = 0.0f;
585 const float max_val = UINT8_MAX;
586 const __m128 vmin_val = _mm_set_ps1(min_val);
587 const __m128 vmax_val = _mm_set_ps1(max_val);
/* Broadcast scale (named vScalar in this kernel) and bias. */
589 const __m128 vScalar = _mm_set_ps1(scale);
590 const __m128 vBias = _mm_set_ps1(bias);
594 for (
unsigned int number = 0; number < quarterPoints; number++) {
/* Aligned load of 4 floats. */
595 __m128 ret = _mm_load_ps(inputVectorPtr);
/* Tail of the clamp statement: ret * vScalar + vBias, min against
 * vmax_val, with vmin_val as the operand of an enclosing call whose
 * opening line is not shown (presumably ret = _mm_max_ps(...)). */
599 _mm_min_ps(_mm_add_ps(_mm_mul_ps(ret, vScalar), vBias), vmax_val), vmin_val);
/* Spill the 4 results with an aligned store; outputFloatBuffer is
 * declared on a line not shown here and must be 16-byte aligned. */
601 _mm_store_ps(outputFloatBuffer, ret);
/* Round each spilled value and write it out as one byte. */
602 for (
size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
603 *outputVectorPtr++ = (uint8_t)(
rintf(outputFloatBuffer[inner_loop]));
/* Scalar tail: samples left over after the last full group of 4. */
607 for (
unsigned int number = quarterPoints * 4; number < num_points; number++) {
608 const float r = inputVector[number] * scale + bias;
616#include <riscv_vector.h>
/* RISC-V Vector kernel: strip-mined loop that computes in * scale + bias,
 * converts to unsigned integers and narrows to uint8_t.
 * NOTE(review): this listing omits some original lines (scale/bias
 * parameters and braces); confirm against the original file. */
618static inline void volk_32f_s32f_x2_convert_8u_rvv(uint8_t* outputVector,
619 const float* inputVector,
622 unsigned int num_points)
/* Broadcast bias across a full-length e32/m8 vector register group. */
624 vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(bias, __riscv_vsetvlmax_e32m8());
625 size_t n = num_points;
/* Each pass handles vl elements and advances both pointers by vl. */
626 for (
size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
/* Ask the hardware how many of the remaining n elements fit this pass. */
627 vl = __riscv_vsetvl_e32m8(n);
628 vfloat32m8_t v = __riscv_vle32_v_f32m8(inputVector, vl);
/* v * scale + vb, then narrowing float -> unsigned 16-bit conversion. */
629 vuint16m4_t vi = __riscv_vfncvt_xu(__riscv_vfmadd_vf_f32m8(v, scale, vb, vl), vl);
/* Narrowing unsigned clip 16 -> 8 bit, then store vl bytes. The two zero
 * arguments are presumably the shift amount and the fixed-point rounding
 * mode — confirm against the RVV intrinsics specification. */
630 __riscv_vse8(outputVector, __riscv_vnclipu(vi, 0, 0, vl), vl);
static float rintf(float x)
Definition config.h:45
static void volk_32f_s32f_x2_convert_8u_generic(uint8_t *outputVector, const float *inputVector, const float scale, const float bias, unsigned int num_points)
Definition volk_32f_s32f_x2_convert_8u.h:81
static void volk_32f_s32f_x2_convert_8u_a_sse2(uint8_t *outputVector, const float *inputVector, const float scale, const float bias, unsigned int num_points)
Definition volk_32f_s32f_x2_convert_8u.h:506
static void volk_32f_s32f_x2_convert_8u_a_sse(uint8_t *outputVector, const float *inputVector, const float scale, const float bias, unsigned int num_points)
Definition volk_32f_s32f_x2_convert_8u.h:573
static void volk_32f_s32f_x2_convert_8u_u_sse(uint8_t *outputVector, const float *inputVector, const float scale, const float bias, unsigned int num_points)
Definition volk_32f_s32f_x2_convert_8u.h:311
static void volk_32f_s32f_x2_convert_8u_single(uint8_t *out, const float in)
Definition volk_32f_s32f_x2_convert_8u.h:65
static void volk_32f_s32f_x2_convert_8u_u_sse2(uint8_t *outputVector, const float *inputVector, const float scale, const float bias, unsigned int num_points)
Definition volk_32f_s32f_x2_convert_8u.h:243
#define __VOLK_ATTR_ALIGNED(x)
Definition volk_common.h:62