Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_s32f_convert_8i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
59
60#ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
61#define INCLUDED_volk_32f_s32f_convert_8i_u_H
62
63#include <inttypes.h>
64
65static inline void volk_32f_s32f_convert_8i_single(int8_t* out, const float in)
66{
67 const float min_val = INT8_MIN;
68 const float max_val = INT8_MAX;
69 if (in > max_val) {
70 *out = (int8_t)(max_val);
71 } else if (in < min_val) {
72 *out = (int8_t)(min_val);
73 } else {
74 *out = (int8_t)(rintf(in));
75 }
76}
77
78#ifdef LV_HAVE_GENERIC
79
/*
 * Portable kernel: scale each float by `scalar` and convert it to int8_t.
 *
 * outputVector: receives num_points converted samples.
 * inputVector:  num_points float samples to convert.
 * scalar:       factor applied to every sample before conversion.
 * num_points:   number of samples to process.
 *
 * Rounding and saturation are delegated to
 * volk_32f_s32f_convert_8i_single().
 */
static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector,
                                                    const float* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    unsigned int idx = 0;

    while (idx < num_points) {
        const float scaled = inputVector[idx] * scalar;
        volk_32f_s32f_convert_8i_single(outputVector + idx, scaled);
        ++idx;
    }
}
92
93#endif /* LV_HAVE_GENERIC */
94
95
96#ifdef LV_HAVE_AVX2
97#include <immintrin.h>
98
/*
 * AVX2 kernel for unaligned buffers: scale floats by `scalar`, clamp them to
 * the int8_t range and convert them to int8_t, 32 samples per iteration.
 *
 * outputVector: receives num_points converted samples (no alignment needed).
 * inputVector:  num_points float samples (no alignment needed).
 * scalar:       factor applied to every sample before conversion.
 * num_points:   number of samples to process.
 *
 * Samples left over after the last full group of 32 go through
 * volk_32f_s32f_convert_8i_single().
 */
static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int blocks = num_points / 32;

    const __m256 lo = _mm256_set1_ps((float)INT8_MIN);
    const __m256 hi = _mm256_set1_ps((float)INT8_MAX);
    const __m256 gain = _mm256_set1_ps(scalar);

    const float* src = inputVector;
    int8_t* dst = outputVector;

    for (unsigned int blk = 0; blk < blocks; blk++, src += 32, dst += 32) {
        /* Load four groups of 8 floats and apply the scale factor. */
        __m256 f0 = _mm256_mul_ps(_mm256_loadu_ps(src), gain);
        __m256 f1 = _mm256_mul_ps(_mm256_loadu_ps(src + 8), gain);
        __m256 f2 = _mm256_mul_ps(_mm256_loadu_ps(src + 16), gain);
        __m256 f3 = _mm256_mul_ps(_mm256_loadu_ps(src + 24), gain);

        /* Clamp to [INT8_MIN, INT8_MAX] while still in floating point. */
        f0 = _mm256_max_ps(_mm256_min_ps(f0, hi), lo);
        f1 = _mm256_max_ps(_mm256_min_ps(f1, hi), lo);
        f2 = _mm256_max_ps(_mm256_min_ps(f2, hi), lo);
        f3 = _mm256_max_ps(_mm256_min_ps(f3, hi), lo);

        /* Float -> int32 conversion (rounds per the current MXCSR mode). */
        const __m256i i0 = _mm256_cvtps_epi32(f0);
        const __m256i i1 = _mm256_cvtps_epi32(f1);
        const __m256i i2 = _mm256_cvtps_epi32(f2);
        const __m256i i3 = _mm256_cvtps_epi32(f3);

        /*
         * The 256-bit pack instructions work on each 128-bit half
         * independently, which interleaves the two sources at 64-bit
         * granularity. Permuting the 64-bit quads with 0xD8 (order 0,2,1,3)
         * after every pack puts the samples back in memory order.
         */
        __m256i w01 = _mm256_packs_epi32(i0, i1);
        __m256i w23 = _mm256_packs_epi32(i2, i3);
        w01 = _mm256_permute4x64_epi64(w01, 0xD8);
        w23 = _mm256_permute4x64_epi64(w23, 0xD8);

        __m256i bytes = _mm256_packs_epi16(w01, w23);
        bytes = _mm256_permute4x64_epi64(bytes, 0xD8);

        _mm256_storeu_si256((__m256i*)dst, bytes);
    }

    /* Scalar tail for the samples that do not fill a group of 32. */
    for (unsigned int idx = blocks * 32; idx < num_points; idx++) {
        const float scaled = inputVector[idx] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[idx], scaled);
    }
}
157
158#endif /* LV_HAVE_AVX2 */
159
160
161#ifdef LV_HAVE_SSE2
162#include <emmintrin.h>
163
/*
 * SSE2 kernel for unaligned buffers: scale floats by `scalar`, clamp them to
 * the int8_t range and convert them to int8_t, 16 samples per iteration.
 *
 * outputVector: receives num_points converted samples (no alignment needed).
 * inputVector:  num_points float samples (no alignment needed).
 * scalar:       factor applied to every sample before conversion.
 * num_points:   number of samples to process.
 *
 * Samples left over after the last full group of 16 go through
 * volk_32f_s32f_convert_8i_single().
 */
static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int blocks = num_points / 16;

    const __m128 lo = _mm_set_ps1((float)INT8_MIN);
    const __m128 hi = _mm_set_ps1((float)INT8_MAX);
    const __m128 gain = _mm_set_ps1(scalar);

    const float* src = inputVector;
    int8_t* dst = outputVector;

    for (unsigned int blk = 0; blk < blocks; blk++, src += 16, dst += 16) {
        /* Load four groups of 4 floats and apply the scale factor. */
        __m128 f0 = _mm_mul_ps(_mm_loadu_ps(src), gain);
        __m128 f1 = _mm_mul_ps(_mm_loadu_ps(src + 4), gain);
        __m128 f2 = _mm_mul_ps(_mm_loadu_ps(src + 8), gain);
        __m128 f3 = _mm_mul_ps(_mm_loadu_ps(src + 12), gain);

        /* Clamp to [INT8_MIN, INT8_MAX] while still in floating point. */
        f0 = _mm_max_ps(_mm_min_ps(f0, hi), lo);
        f1 = _mm_max_ps(_mm_min_ps(f1, hi), lo);
        f2 = _mm_max_ps(_mm_min_ps(f2, hi), lo);
        f3 = _mm_max_ps(_mm_min_ps(f3, hi), lo);

        /* Float -> int32 conversion (rounds per the current MXCSR mode). */
        const __m128i i0 = _mm_cvtps_epi32(f0);
        const __m128i i1 = _mm_cvtps_epi32(f1);
        const __m128i i2 = _mm_cvtps_epi32(f2);
        const __m128i i3 = _mm_cvtps_epi32(f3);

        /* Narrow int32 -> int16 -> int8; sample order is preserved. */
        const __m128i w01 = _mm_packs_epi32(i0, i1);
        const __m128i w23 = _mm_packs_epi32(i2, i3);
        const __m128i bytes = _mm_packs_epi16(w01, w23);

        _mm_storeu_si128((__m128i*)dst, bytes);
    }

    /* Scalar tail for the samples that do not fill a group of 16. */
    for (unsigned int idx = blocks * 16; idx < num_points; idx++) {
        const float scaled = inputVector[idx] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[idx], scaled);
    }
}
219
220#endif /* LV_HAVE_SSE2 */
221
222
223#ifdef LV_HAVE_SSE
224#include <xmmintrin.h>
225
226static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector,
227 const float* inputVector,
228 const float scalar,
229 unsigned int num_points)
230{
231 const unsigned int quarterPoints = num_points / 4;
232
233 const float* inputVectorPtr = (const float*)inputVector;
234 int8_t* outputVectorPtr = outputVector;
235
236 const float min_val = INT8_MIN;
237 const float max_val = INT8_MAX;
238 const __m128 vmin_val = _mm_set_ps1(min_val);
239 const __m128 vmax_val = _mm_set_ps1(max_val);
240
241 const __m128 vScalar = _mm_set_ps1(scalar);
242
243 __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
244
245 for (unsigned int number = 0; number < quarterPoints; number++) {
246 __m128 ret = _mm_loadu_ps(inputVectorPtr);
247 inputVectorPtr += 4;
248
249 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
250
251 _mm_store_ps(outputFloatBuffer, ret);
252 for (size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
253 *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
254 }
255 }
256
257 for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
258 const float r = inputVector[number] * scalar;
259 volk_32f_s32f_convert_8i_single(&outputVector[number], r);
260 }
261}
262
263#endif /* LV_HAVE_SSE */
264
265
266#endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */
267#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
268#define INCLUDED_volk_32f_s32f_convert_8i_a_H
269
270#include <inttypes.h>
271
272#ifdef LV_HAVE_AVX2
273#include <immintrin.h>
274
/*
 * AVX2 kernel for aligned buffers: scale floats by `scalar`, clamp them to
 * the int8_t range and convert them to int8_t, 32 samples per iteration.
 *
 * outputVector: receives num_points converted samples; written with aligned
 *               256-bit stores, so it must be 32-byte aligned.
 * inputVector:  num_points float samples; read with aligned 256-bit loads,
 *               so it must be 32-byte aligned.
 * scalar:       factor applied to every sample before conversion.
 * num_points:   number of samples to process.
 *
 * Samples left over after the last full group of 32 go through
 * volk_32f_s32f_convert_8i_single().
 */
static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int blocks = num_points / 32;

    const __m256 lo = _mm256_set1_ps((float)INT8_MIN);
    const __m256 hi = _mm256_set1_ps((float)INT8_MAX);
    const __m256 gain = _mm256_set1_ps(scalar);

    const float* src = inputVector;
    int8_t* dst = outputVector;

    for (unsigned int blk = 0; blk < blocks; blk++, src += 32, dst += 32) {
        /* Load four groups of 8 floats and apply the scale factor. */
        __m256 f0 = _mm256_mul_ps(_mm256_load_ps(src), gain);
        __m256 f1 = _mm256_mul_ps(_mm256_load_ps(src + 8), gain);
        __m256 f2 = _mm256_mul_ps(_mm256_load_ps(src + 16), gain);
        __m256 f3 = _mm256_mul_ps(_mm256_load_ps(src + 24), gain);

        /* Clamp to [INT8_MIN, INT8_MAX] while still in floating point. */
        f0 = _mm256_max_ps(_mm256_min_ps(f0, hi), lo);
        f1 = _mm256_max_ps(_mm256_min_ps(f1, hi), lo);
        f2 = _mm256_max_ps(_mm256_min_ps(f2, hi), lo);
        f3 = _mm256_max_ps(_mm256_min_ps(f3, hi), lo);

        /* Float -> int32 conversion (rounds per the current MXCSR mode). */
        const __m256i i0 = _mm256_cvtps_epi32(f0);
        const __m256i i1 = _mm256_cvtps_epi32(f1);
        const __m256i i2 = _mm256_cvtps_epi32(f2);
        const __m256i i3 = _mm256_cvtps_epi32(f3);

        /*
         * The 256-bit pack instructions work on each 128-bit half
         * independently, which interleaves the two sources at 64-bit
         * granularity. Permuting the 64-bit quads with 0xD8 (order 0,2,1,3)
         * after every pack puts the samples back in memory order.
         */
        __m256i w01 = _mm256_packs_epi32(i0, i1);
        __m256i w23 = _mm256_packs_epi32(i2, i3);
        w01 = _mm256_permute4x64_epi64(w01, 0xD8);
        w23 = _mm256_permute4x64_epi64(w23, 0xD8);

        __m256i bytes = _mm256_packs_epi16(w01, w23);
        bytes = _mm256_permute4x64_epi64(bytes, 0xD8);

        _mm256_store_si256((__m256i*)dst, bytes);
    }

    /* Scalar tail for the samples that do not fill a group of 32. */
    for (unsigned int idx = blocks * 32; idx < num_points; idx++) {
        const float scaled = inputVector[idx] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[idx], scaled);
    }
}
333
334#endif /* LV_HAVE_AVX2 */
335
336
337#ifdef LV_HAVE_SSE2
338#include <emmintrin.h>
339
/*
 * SSE2 kernel for aligned buffers: scale floats by `scalar`, clamp them to
 * the int8_t range and convert them to int8_t, 16 samples per iteration.
 *
 * outputVector: receives num_points converted samples; written with aligned
 *               128-bit stores, so it must be 16-byte aligned.
 * inputVector:  num_points float samples; read with aligned 128-bit loads,
 *               so it must be 16-byte aligned.
 * scalar:       factor applied to every sample before conversion.
 * num_points:   number of samples to process.
 *
 * Samples left over after the last full group of 16 go through
 * volk_32f_s32f_convert_8i_single().
 */
static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int blocks = num_points / 16;

    const __m128 lo = _mm_set_ps1((float)INT8_MIN);
    const __m128 hi = _mm_set_ps1((float)INT8_MAX);
    const __m128 gain = _mm_set_ps1(scalar);

    const float* src = inputVector;
    int8_t* dst = outputVector;

    for (unsigned int blk = 0; blk < blocks; blk++, src += 16, dst += 16) {
        /* Load four groups of 4 floats and apply the scale factor. */
        __m128 f0 = _mm_mul_ps(_mm_load_ps(src), gain);
        __m128 f1 = _mm_mul_ps(_mm_load_ps(src + 4), gain);
        __m128 f2 = _mm_mul_ps(_mm_load_ps(src + 8), gain);
        __m128 f3 = _mm_mul_ps(_mm_load_ps(src + 12), gain);

        /* Clamp to [INT8_MIN, INT8_MAX] while still in floating point. */
        f0 = _mm_max_ps(_mm_min_ps(f0, hi), lo);
        f1 = _mm_max_ps(_mm_min_ps(f1, hi), lo);
        f2 = _mm_max_ps(_mm_min_ps(f2, hi), lo);
        f3 = _mm_max_ps(_mm_min_ps(f3, hi), lo);

        /* Float -> int32 conversion (rounds per the current MXCSR mode). */
        const __m128i i0 = _mm_cvtps_epi32(f0);
        const __m128i i1 = _mm_cvtps_epi32(f1);
        const __m128i i2 = _mm_cvtps_epi32(f2);
        const __m128i i3 = _mm_cvtps_epi32(f3);

        /* Narrow int32 -> int16 -> int8; sample order is preserved. */
        const __m128i w01 = _mm_packs_epi32(i0, i1);
        const __m128i w23 = _mm_packs_epi32(i2, i3);
        const __m128i bytes = _mm_packs_epi16(w01, w23);

        _mm_store_si128((__m128i*)dst, bytes);
    }

    /* Scalar tail for the samples that do not fill a group of 16. */
    for (unsigned int idx = blocks * 16; idx < num_points; idx++) {
        const float scaled = inputVector[idx] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[idx], scaled);
    }
}
395#endif /* LV_HAVE_SSE2 */
396
397
398#ifdef LV_HAVE_SSE
399#include <xmmintrin.h>
400
401static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector,
402 const float* inputVector,
403 const float scalar,
404 unsigned int num_points)
405{
406 const unsigned int quarterPoints = num_points / 4;
407
408 const float* inputVectorPtr = (const float*)inputVector;
409 int8_t* outputVectorPtr = outputVector;
410
411 const float min_val = INT8_MIN;
412 const float max_val = INT8_MAX;
413 const __m128 vmin_val = _mm_set_ps1(min_val);
414 const __m128 vmax_val = _mm_set_ps1(max_val);
415
416 const __m128 vScalar = _mm_set_ps1(scalar);
417
418 __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
419
420 for (unsigned int number = 0; number < quarterPoints; number++) {
421 __m128 ret = _mm_load_ps(inputVectorPtr);
422 inputVectorPtr += 4;
423
424 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
425
426 _mm_store_ps(outputFloatBuffer, ret);
427 for (size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
428 *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
429 }
430 }
431
432 for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
433 const float r = inputVector[number] * scalar;
434 volk_32f_s32f_convert_8i_single(&outputVector[number], r);
435 }
436}
437
438#endif /* LV_HAVE_SSE */
439
440#ifdef LV_HAVE_RVV
441#include <riscv_vector.h>
442
443static inline void volk_32f_s32f_convert_8i_rvv(int8_t* outputVector,
444 const float* inputVector,
445 const float scalar,
446 unsigned int num_points)
447{
448 size_t n = num_points;
449 for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
450 vl = __riscv_vsetvl_e32m8(n);
451 vfloat32m8_t v = __riscv_vle32_v_f32m8(inputVector, vl);
452 vint16m4_t vi = __riscv_vfncvt_x(__riscv_vfmul(v, scalar, vl), vl);
453 __riscv_vse8(outputVector, __riscv_vnclip(vi, 0, 0, vl), vl);
454 }
455}
456#endif /*LV_HAVE_RVV*/
457
458#endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */
static float rintf(float x)
Definition config.h:45
static void volk_32f_s32f_convert_8i_a_sse2(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition volk_32f_s32f_convert_8i.h:340
static void volk_32f_s32f_convert_8i_single(int8_t *out, const float in)
Definition volk_32f_s32f_convert_8i.h:65
static void volk_32f_s32f_convert_8i_u_sse(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition volk_32f_s32f_convert_8i.h:226
static void volk_32f_s32f_convert_8i_a_sse(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition volk_32f_s32f_convert_8i.h:401
static void volk_32f_s32f_convert_8i_generic(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition volk_32f_s32f_convert_8i.h:80
static void volk_32f_s32f_convert_8i_u_sse2(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition volk_32f_s32f_convert_8i.h:164
#define __VOLK_ATTR_ALIGNED(x)
Definition volk_common.h:62