Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_s32f_convert_32i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
56
57#ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H
58#define INCLUDED_volk_32f_s32f_convert_32i_u_H
59
60#include <inttypes.h>
61#include <limits.h>
62#include <stdio.h>
63
64#ifdef LV_HAVE_AVX
65#include <immintrin.h>
66
67static inline void volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector,
68 const float* inputVector,
69 const float scalar,
70 unsigned int num_points)
71{
72 unsigned int number = 0;
73
74 const unsigned int eighthPoints = num_points / 8;
75
76 const float* inputVectorPtr = (const float*)inputVector;
77 int32_t* outputVectorPtr = outputVector;
78
79 float min_val = INT_MIN;
80 float max_val = (uint32_t)INT_MAX + 1;
81 float r;
82
83 __m256 vScalar = _mm256_set1_ps(scalar);
84 __m256 inputVal1;
85 __m256i intInputVal1;
86 __m256 vmin_val = _mm256_set1_ps(min_val);
87 __m256 vmax_val = _mm256_set1_ps(max_val);
88
89 for (; number < eighthPoints; number++) {
90 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
91 inputVectorPtr += 8;
92
93 inputVal1 = _mm256_max_ps(
94 _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
95 intInputVal1 = _mm256_cvtps_epi32(inputVal1);
96
97 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
98 outputVectorPtr += 8;
99 }
100
101 number = eighthPoints * 8;
102 for (; number < num_points; number++) {
103 r = inputVector[number] * scalar;
104 if (r > max_val)
105 r = max_val;
106 else if (r < min_val)
107 r = min_val;
108 outputVector[number] = (int32_t)rintf(r);
109 }
110}
111
112#endif /* LV_HAVE_AVX */
113
114#ifdef LV_HAVE_SSE2
115#include <emmintrin.h>
116
117static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector,
118 const float* inputVector,
119 const float scalar,
120 unsigned int num_points)
121{
122 unsigned int number = 0;
123
124 const unsigned int quarterPoints = num_points / 4;
125
126 const float* inputVectorPtr = (const float*)inputVector;
127 int32_t* outputVectorPtr = outputVector;
128
129 float min_val = INT_MIN;
130 float max_val = (uint32_t)INT_MAX + 1;
131 float r;
132
133 __m128 vScalar = _mm_set_ps1(scalar);
134 __m128 inputVal1;
135 __m128i intInputVal1;
136 __m128 vmin_val = _mm_set_ps1(min_val);
137 __m128 vmax_val = _mm_set_ps1(max_val);
138
139 for (; number < quarterPoints; number++) {
140 inputVal1 = _mm_loadu_ps(inputVectorPtr);
141 inputVectorPtr += 4;
142
143 inputVal1 =
144 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
145 intInputVal1 = _mm_cvtps_epi32(inputVal1);
146
147 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
148 outputVectorPtr += 4;
149 }
150
151 number = quarterPoints * 4;
152 for (; number < num_points; number++) {
153 r = inputVector[number] * scalar;
154 if (r > max_val)
155 r = max_val;
156 else if (r < min_val)
157 r = min_val;
158 outputVector[number] = (int32_t)rintf(r);
159 }
160}
161
162#endif /* LV_HAVE_SSE2 */
163
164
165#ifdef LV_HAVE_SSE
166#include <xmmintrin.h>
167
168static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector,
169 const float* inputVector,
170 const float scalar,
171 unsigned int num_points)
172{
173 unsigned int number = 0;
174
175 const unsigned int quarterPoints = num_points / 4;
176
177 const float* inputVectorPtr = (const float*)inputVector;
178 int32_t* outputVectorPtr = outputVector;
179
180 float min_val = INT_MIN;
181 float max_val = (uint32_t)INT_MAX + 1;
182 float r;
183
184 __m128 vScalar = _mm_set_ps1(scalar);
185 __m128 ret;
186 __m128 vmin_val = _mm_set_ps1(min_val);
187 __m128 vmax_val = _mm_set_ps1(max_val);
188
189 __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
190
191 for (; number < quarterPoints; number++) {
192 ret = _mm_loadu_ps(inputVectorPtr);
193 inputVectorPtr += 4;
194
195 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
196
197 _mm_store_ps(outputFloatBuffer, ret);
198 *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
199 *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
200 *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
201 *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
202 }
203
204 number = quarterPoints * 4;
205 for (; number < num_points; number++) {
206 r = inputVector[number] * scalar;
207 if (r > max_val)
208 r = max_val;
209 else if (r < min_val)
210 r = min_val;
211 outputVector[number] = (int32_t)rintf(r);
212 }
213}
214
215#endif /* LV_HAVE_SSE */
216
217
218#ifdef LV_HAVE_GENERIC
219
220static inline void volk_32f_s32f_convert_32i_generic(int32_t* outputVector,
221 const float* inputVector,
222 const float scalar,
223 unsigned int num_points)
224{
225 int32_t* outputVectorPtr = outputVector;
226 const float* inputVectorPtr = inputVector;
227 const float min_val = (float)INT_MIN;
228 const float max_val = (float)((uint32_t)INT_MAX + 1);
229
230 for (unsigned int number = 0; number < num_points; number++) {
231 const float r = *inputVectorPtr++ * scalar;
232 int s;
233 if (r >= max_val)
234 s = INT_MAX;
235 else if (r < min_val)
236 s = INT_MIN;
237 else
238 s = (int32_t)rintf(r);
239 *outputVectorPtr++ = s;
240 }
241}
242
243#endif /* LV_HAVE_GENERIC */
244
245
246#endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */
247#ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H
248#define INCLUDED_volk_32f_s32f_convert_32i_a_H
249
250#include <inttypes.h>
251#include <stdio.h>
252#include <volk/volk_common.h>
253
254#ifdef LV_HAVE_AVX
255#include <immintrin.h>
256
257static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector,
258 const float* inputVector,
259 const float scalar,
260 unsigned int num_points)
261{
262 unsigned int number = 0;
263
264 const unsigned int eighthPoints = num_points / 8;
265
266 const float* inputVectorPtr = (const float*)inputVector;
267 int32_t* outputVectorPtr = outputVector;
268
269 float min_val = INT_MIN;
270 float max_val = (uint32_t)INT_MAX + 1;
271 float r;
272
273 __m256 vScalar = _mm256_set1_ps(scalar);
274 __m256 inputVal1;
275 __m256i intInputVal1;
276 __m256 vmin_val = _mm256_set1_ps(min_val);
277 __m256 vmax_val = _mm256_set1_ps(max_val);
278
279 for (; number < eighthPoints; number++) {
280 inputVal1 = _mm256_load_ps(inputVectorPtr);
281 inputVectorPtr += 8;
282
283 inputVal1 = _mm256_max_ps(
284 _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
285 intInputVal1 = _mm256_cvtps_epi32(inputVal1);
286
287 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
288 outputVectorPtr += 8;
289 }
290
291 number = eighthPoints * 8;
292 for (; number < num_points; number++) {
293 r = inputVector[number] * scalar;
294 if (r > max_val)
295 r = max_val;
296 else if (r < min_val)
297 r = min_val;
298 outputVector[number] = (int32_t)rintf(r);
299 }
300}
301
302#endif /* LV_HAVE_AVX */
303
304
305#ifdef LV_HAVE_SSE2
306#include <emmintrin.h>
307
308static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector,
309 const float* inputVector,
310 const float scalar,
311 unsigned int num_points)
312{
313 unsigned int number = 0;
314
315 const unsigned int quarterPoints = num_points / 4;
316
317 const float* inputVectorPtr = (const float*)inputVector;
318 int32_t* outputVectorPtr = outputVector;
319
320 float min_val = INT_MIN;
321 float max_val = (uint32_t)INT_MAX + 1;
322 float r;
323
324 __m128 vScalar = _mm_set_ps1(scalar);
325 __m128 inputVal1;
326 __m128i intInputVal1;
327 __m128 vmin_val = _mm_set_ps1(min_val);
328 __m128 vmax_val = _mm_set_ps1(max_val);
329
330 for (; number < quarterPoints; number++) {
331 inputVal1 = _mm_load_ps(inputVectorPtr);
332 inputVectorPtr += 4;
333
334 inputVal1 =
335 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
336 intInputVal1 = _mm_cvtps_epi32(inputVal1);
337
338 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
339 outputVectorPtr += 4;
340 }
341
342 number = quarterPoints * 4;
343 for (; number < num_points; number++) {
344 r = inputVector[number] * scalar;
345 if (r > max_val)
346 r = max_val;
347 else if (r < min_val)
348 r = min_val;
349 outputVector[number] = (int32_t)rintf(r);
350 }
351}
352
353#endif /* LV_HAVE_SSE2 */
354
355
356#ifdef LV_HAVE_SSE
357#include <xmmintrin.h>
358
359static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector,
360 const float* inputVector,
361 const float scalar,
362 unsigned int num_points)
363{
364 unsigned int number = 0;
365
366 const unsigned int quarterPoints = num_points / 4;
367
368 const float* inputVectorPtr = (const float*)inputVector;
369 int32_t* outputVectorPtr = outputVector;
370
371 float min_val = INT_MIN;
372 float max_val = (uint32_t)INT_MAX + 1;
373 float r;
374
375 __m128 vScalar = _mm_set_ps1(scalar);
376 __m128 ret;
377 __m128 vmin_val = _mm_set_ps1(min_val);
378 __m128 vmax_val = _mm_set_ps1(max_val);
379
380 __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
381
382 for (; number < quarterPoints; number++) {
383 ret = _mm_load_ps(inputVectorPtr);
384 inputVectorPtr += 4;
385
386 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
387
388 _mm_store_ps(outputFloatBuffer, ret);
389 *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
390 *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
391 *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
392 *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
393 }
394
395 number = quarterPoints * 4;
396 for (; number < num_points; number++) {
397 r = inputVector[number] * scalar;
398 if (r > max_val)
399 r = max_val;
400 else if (r < min_val)
401 r = min_val;
402 outputVector[number] = (int32_t)rintf(r);
403 }
404}
405
406#endif /* LV_HAVE_SSE */
407
408#ifdef LV_HAVE_RVV
409#include <riscv_vector.h>
410
/*
 * RISC-V Vector kernel: multiplies every input float by `scalar`, converts
 * the product to a signed 32-bit integer with __riscv_vfcvt_x and stores it.
 * The data is processed in strips whose length is chosen by
 * __riscv_vsetvl_e32m8 for the elements that are still left.
 *
 * NOTE(review): rounding and out-of-range behaviour are whatever
 * __riscv_vfcvt_x provides; no explicit clamping is done here - confirm it
 * matches the generic kernel.
 *
 * outputVector: destination for num_points int32_t values
 * inputVector:  num_points source floats
 * scalar:       factor applied to each input before conversion
 * num_points:   number of elements to convert
 */
static inline void volk_32f_s32f_convert_32i_rvv(int32_t* outputVector,
                                                 const float* inputVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e32m8(remaining);

        const vfloat32m8_t loaded = __riscv_vle32_v_f32m8(inputVector, vl);
        const vfloat32m8_t scaled = __riscv_vfmul(loaded, scalar, vl);
        __riscv_vse32(outputVector, __riscv_vfcvt_x(scaled, vl), vl);

        inputVector += vl;
        outputVector += vl;
        remaining -= vl;
    }
}
424#endif /*LV_HAVE_RVV*/
425
426#endif /* INCLUDED_volk_32f_s32f_convert_32i_a_H */
static float rintf(float x)
Definition config.h:45
static void volk_32f_s32f_convert_32i_a_sse(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition volk_32f_s32f_convert_32i.h:359
static void volk_32f_s32f_convert_32i_a_avx(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition volk_32f_s32f_convert_32i.h:257
static void volk_32f_s32f_convert_32i_a_sse2(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition volk_32f_s32f_convert_32i.h:308
static void volk_32f_s32f_convert_32i_u_sse(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition volk_32f_s32f_convert_32i.h:168
static void volk_32f_s32f_convert_32i_generic(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition volk_32f_s32f_convert_32i.h:220
static void volk_32f_s32f_convert_32i_u_avx(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition volk_32f_s32f_convert_32i.h:67
static void volk_32f_s32f_convert_32i_u_sse2(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition volk_32f_s32f_convert_32i.h:117
#define __VOLK_ATTR_ALIGNED(x)
Definition volk_common.h:62