Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32fc_convert_16ic.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2016 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
32
33#ifndef INCLUDED_volk_32fc_convert_16ic_a_H
34#define INCLUDED_volk_32fc_convert_16ic_a_H
35
36#include "volk/volk_complex.h"
37#include <limits.h>
38#include <math.h>
39
40#ifdef LV_HAVE_AVX2
41#include <immintrin.h>
42
43static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector,
44 const lv_32fc_t* inputVector,
45 unsigned int num_points)
46{
47 const unsigned int avx_iters = num_points / 8;
48
49 float* inputVectorPtr = (float*)inputVector;
50 int16_t* outputVectorPtr = (int16_t*)outputVector;
51 float aux;
52
53 const float min_val = (float)SHRT_MIN;
54 const float max_val = (float)SHRT_MAX;
55
56 __m256 inputVal1, inputVal2;
57 __m256i intInputVal1, intInputVal2;
58 __m256 ret1, ret2;
59 const __m256 vmin_val = _mm256_set1_ps(min_val);
60 const __m256 vmax_val = _mm256_set1_ps(max_val);
61 unsigned int i;
62
63 for (i = 0; i < avx_iters; i++) {
64 inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
65 inputVectorPtr += 8;
66 inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
67 inputVectorPtr += 8;
68 __VOLK_PREFETCH(inputVectorPtr + 16);
69
70 // Clip
71 ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
72 ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
73
74 intInputVal1 = _mm256_cvtps_epi32(ret1);
75 intInputVal2 = _mm256_cvtps_epi32(ret2);
76
77 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
78 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
79
80 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
81 outputVectorPtr += 16;
82 }
83
84 for (i = avx_iters * 16; i < num_points * 2; i++) {
85 aux = *inputVectorPtr++;
86 if (aux > max_val)
87 aux = max_val;
88 else if (aux < min_val)
89 aux = min_val;
90 *outputVectorPtr++ = (int16_t)rintf(aux);
91 }
92}
93#endif /* LV_HAVE_AVX2 */
94
95#ifdef LV_HAVE_SSE2
96#include <emmintrin.h>
97
98static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector,
99 const lv_32fc_t* inputVector,
100 unsigned int num_points)
101{
102 const unsigned int sse_iters = num_points / 4;
103
104 float* inputVectorPtr = (float*)inputVector;
105 int16_t* outputVectorPtr = (int16_t*)outputVector;
106 float aux;
107
108 const float min_val = (float)SHRT_MIN;
109 const float max_val = (float)SHRT_MAX;
110
111 __m128 inputVal1, inputVal2;
112 __m128i intInputVal1, intInputVal2;
113 __m128 ret1, ret2;
114 const __m128 vmin_val = _mm_set_ps1(min_val);
115 const __m128 vmax_val = _mm_set_ps1(max_val);
116 unsigned int i;
117
118 for (i = 0; i < sse_iters; i++) {
119 inputVal1 = _mm_load_ps((float*)inputVectorPtr);
120 inputVectorPtr += 4;
121 inputVal2 = _mm_load_ps((float*)inputVectorPtr);
122 inputVectorPtr += 4;
123 __VOLK_PREFETCH(inputVectorPtr + 8);
124
125 // Clip
126 ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
127 ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
128
129 intInputVal1 = _mm_cvtps_epi32(ret1);
130 intInputVal2 = _mm_cvtps_epi32(ret2);
131
132 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
133
134 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
135 outputVectorPtr += 8;
136 }
137
138 for (i = sse_iters * 8; i < num_points * 2; i++) {
139 aux = *inputVectorPtr++;
140 if (aux > max_val)
141 aux = max_val;
142 else if (aux < min_val)
143 aux = min_val;
144 *outputVectorPtr++ = (int16_t)rintf(aux);
145 }
146}
147#endif /* LV_HAVE_SSE2 */
148
149
150#if LV_HAVE_NEONV7
151#include <arm_neon.h>
152
/*
 * Convert complex floats to complex 16-bit integers (NEON, ARMv7 variant).
 *
 * Both buffers are walked as flat arrays of scalar components, so
 * 2 * num_points values are converted. Each component is clamped to
 * [SHRT_MIN, SHRT_MAX] before it is converted to an integer.
 *
 * outputVector: destination buffer.
 * inputVector:  source buffer.
 * num_points:   number of complex points to convert.
 *
 * NOTE(review): the vector loop rounds by adding 0.5, subtracting the sign
 * bit and converting with vcvtq_s32_f32, while the scalar tail uses rintf.
 * The two may disagree on exact ties - confirm whether that matters.
 */
static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
                                               const lv_32fc_t* inputVector,
                                               unsigned int num_points)
{
    const float32_t* src = (const float32_t*)inputVector;
    int16_t* dst = (int16_t*)outputVector;

    const float lo = (float)SHRT_MIN;
    const float hi = (float)SHRT_MAX;
    const float32x4_t lo_bound = vmovq_n_f32(lo);
    const float32x4_t hi_bound = vmovq_n_f32(hi);
    const float32x4_t one_half = vdupq_n_f32(0.5f);

    /* One vector pass handles 8 floats, i.e. 4 complex points. */
    const unsigned int vector_blocks = num_points / 4;
    const unsigned int scalar_total = num_points * 2;
    unsigned int idx;

    for (idx = 0; idx < vector_blocks; idx++) {
        const float32x4_t first = vld1q_f32(src);
        const float32x4_t second = vld1q_f32(src + 4);
        src += 8;
        __VOLK_PREFETCH(src + 8);

        /* Clamp while the data is still floating point. */
        const float32x4_t first_clipped =
            vmaxq_f32(vminq_f32(first, hi_bound), lo_bound);
        const float32x4_t second_clipped =
            vmaxq_f32(vminq_f32(second, hi_bound), lo_bound);

        /* Sign bit of each lane as 0.0f or 1.0f. */
        const float32x4_t first_sign =
            vcvtq_f32_u32(vshrq_n_u32(vreinterpretq_u32_f32(first_clipped), 31));
        const float32x4_t second_sign =
            vcvtq_f32_u32(vshrq_n_u32(vreinterpretq_u32_f32(second_clipped), 31));

        /* (x + 0.5) - sign, then convert to 32-bit integers. */
        const int32x4_t first_int =
            vcvtq_s32_f32(vsubq_f32(vaddq_f32(first_clipped, one_half), first_sign));
        const int32x4_t second_int =
            vcvtq_s32_f32(vsubq_f32(vaddq_f32(second_clipped, one_half), second_sign));

        /* Narrow each half to 16 bits and store the 8 results together. */
        vst1q_s16(dst, vcombine_s16(vqmovn_s32(first_int), vqmovn_s32(second_int)));
        dst += 8;
    }

    /* Scalar tail for the components the vector loop did not cover. */
    for (idx = vector_blocks * 8; idx < scalar_total; idx++) {
        float32_t sample = *src++;
        if (sample > hi) {
            sample = hi;
        } else if (sample < lo) {
            sample = lo;
        }
        *dst++ = (int16_t)rintf(sample);
    }
}
215
216#endif /* LV_HAVE_NEONV7 */
217
218#if LV_HAVE_NEONV8
219#include <arm_neon.h>
220
/*
 * Convert complex floats to complex 16-bit integers (NEON, ARMv8 variant).
 *
 * Both buffers are walked as flat arrays of scalar components, so
 * 2 * num_points values are converted. Each component is clamped to
 * [SHRT_MIN, SHRT_MAX] before it is converted to an integer.
 *
 * outputVector: destination buffer.
 * inputVector:  source buffer.
 * num_points:   number of complex points to convert.
 */
static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector,
                                                 const lv_32fc_t* inputVector,
                                                 unsigned int num_points)
{
    const float32_t* src = (const float32_t*)inputVector;
    int16_t* dst = (int16_t*)outputVector;

    const float lo = (float)SHRT_MIN;
    const float hi = (float)SHRT_MAX;
    const float32x4_t lo_bound = vmovq_n_f32(lo);
    const float32x4_t hi_bound = vmovq_n_f32(hi);

    /* One vector pass handles 8 floats, i.e. 4 complex points. */
    const unsigned int vector_blocks = num_points / 4;
    const unsigned int scalar_total = num_points * 2;
    unsigned int idx;

    for (idx = 0; idx < vector_blocks; idx++) {
        const float32x4_t first = vld1q_f32(src);
        const float32x4_t second = vld1q_f32(src + 4);
        src += 8;
        __VOLK_PREFETCH(src + 8);

        /* Clamp while the data is still floating point. */
        const float32x4_t first_clipped =
            vmaxq_f32(vminq_f32(first, hi_bound), lo_bound);
        const float32x4_t second_clipped =
            vmaxq_f32(vminq_f32(second, hi_bound), lo_bound);

        // vrndiq takes into account the current rounding mode (as does rintf)
        const int32x4_t first_int = vcvtq_s32_f32(vrndiq_f32(first_clipped));
        const int32x4_t second_int = vcvtq_s32_f32(vrndiq_f32(second_clipped));

        /* Narrow each half to 16 bits and store the 8 results together. */
        vst1q_s16(dst, vcombine_s16(vqmovn_s32(first_int), vqmovn_s32(second_int)));
        dst += 8;
    }

    /* Scalar tail for the components the vector loop did not cover. */
    for (idx = vector_blocks * 8; idx < scalar_total; idx++) {
        float32_t sample = *src++;
        if (sample > hi) {
            sample = hi;
        } else if (sample < lo) {
            sample = lo;
        }
        *dst++ = (int16_t)rintf(sample);
    }
}
274#endif /* LV_HAVE_NEONV8 */
275
276
277#ifdef LV_HAVE_GENERIC
278
279static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector,
280 const lv_32fc_t* inputVector,
281 unsigned int num_points)
282{
283 float* inputVectorPtr = (float*)inputVector;
284 int16_t* outputVectorPtr = (int16_t*)outputVector;
285 const float min_val = (float)SHRT_MIN;
286 const float max_val = (float)SHRT_MAX;
287 float aux;
288 unsigned int i;
289 for (i = 0; i < num_points * 2; i++) {
290 aux = *inputVectorPtr++;
291 if (aux > max_val)
292 aux = max_val;
293 else if (aux < min_val)
294 aux = min_val;
295 *outputVectorPtr++ = (int16_t)rintf(aux);
296 }
297}
298#endif /* LV_HAVE_GENERIC */
299
300#endif /* INCLUDED_volk_32fc_convert_16ic_a_H */
301
302#ifndef INCLUDED_volk_32fc_convert_16ic_u_H
303#define INCLUDED_volk_32fc_convert_16ic_u_H
304
305#include "volk/volk_complex.h"
306#include <limits.h>
307#include <math.h>
308
309
310#ifdef LV_HAVE_AVX2
311#include <immintrin.h>
312
313static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector,
314 const lv_32fc_t* inputVector,
315 unsigned int num_points)
316{
317 const unsigned int avx_iters = num_points / 8;
318
319 float* inputVectorPtr = (float*)inputVector;
320 int16_t* outputVectorPtr = (int16_t*)outputVector;
321 float aux;
322
323 const float min_val = (float)SHRT_MIN;
324 const float max_val = (float)SHRT_MAX;
325
326 __m256 inputVal1, inputVal2;
327 __m256i intInputVal1, intInputVal2;
328 __m256 ret1, ret2;
329 const __m256 vmin_val = _mm256_set1_ps(min_val);
330 const __m256 vmax_val = _mm256_set1_ps(max_val);
331 unsigned int i;
332
333 for (i = 0; i < avx_iters; i++) {
334 inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
335 inputVectorPtr += 8;
336 inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
337 inputVectorPtr += 8;
338 __VOLK_PREFETCH(inputVectorPtr + 16);
339
340 // Clip
341 ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
342 ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
343
344 intInputVal1 = _mm256_cvtps_epi32(ret1);
345 intInputVal2 = _mm256_cvtps_epi32(ret2);
346
347 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
348 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
349
350 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
351 outputVectorPtr += 16;
352 }
353
354 for (i = avx_iters * 16; i < num_points * 2; i++) {
355 aux = *inputVectorPtr++;
356 if (aux > max_val)
357 aux = max_val;
358 else if (aux < min_val)
359 aux = min_val;
360 *outputVectorPtr++ = (int16_t)rintf(aux);
361 }
362}
363#endif /* LV_HAVE_AVX2 */
364
365
366#ifdef LV_HAVE_SSE2
367#include <emmintrin.h>
368
369static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector,
370 const lv_32fc_t* inputVector,
371 unsigned int num_points)
372{
373 const unsigned int sse_iters = num_points / 4;
374
375 float* inputVectorPtr = (float*)inputVector;
376 int16_t* outputVectorPtr = (int16_t*)outputVector;
377 float aux;
378
379 const float min_val = (float)SHRT_MIN;
380 const float max_val = (float)SHRT_MAX;
381
382 __m128 inputVal1, inputVal2;
383 __m128i intInputVal1, intInputVal2;
384 __m128 ret1, ret2;
385 const __m128 vmin_val = _mm_set_ps1(min_val);
386 const __m128 vmax_val = _mm_set_ps1(max_val);
387
388 unsigned int i;
389 for (i = 0; i < sse_iters; i++) {
390 inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
391 inputVectorPtr += 4;
392 inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
393 inputVectorPtr += 4;
394 __VOLK_PREFETCH(inputVectorPtr + 8);
395
396 // Clip
397 ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
398 ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
399
400 intInputVal1 = _mm_cvtps_epi32(ret1);
401 intInputVal2 = _mm_cvtps_epi32(ret2);
402
403 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
404
405 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
406 outputVectorPtr += 8;
407 }
408
409 for (i = sse_iters * 8; i < num_points * 2; i++) {
410 aux = *inputVectorPtr++;
411 if (aux > max_val)
412 aux = max_val;
413 else if (aux < min_val)
414 aux = min_val;
415 *outputVectorPtr++ = (int16_t)rintf(aux);
416 }
417}
418#endif /* LV_HAVE_SSE2 */
419
420#ifdef LV_HAVE_RVV
421#include <riscv_vector.h>
422
423static inline void volk_32fc_convert_16ic_rvv(lv_16sc_t* outputVector,
424 const lv_32fc_t* inputVector,
425 unsigned int num_points)
426{
427 int16_t* out = (int16_t*)outputVector;
428 float* in = (float*)inputVector;
429 size_t n = num_points * 2;
430 for (size_t vl; n > 0; n -= vl, in += vl, out += vl) {
431 vl = __riscv_vsetvl_e32m8(n);
432 vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl);
433 __riscv_vse16(out, __riscv_vfncvt_x(v, vl), vl);
434 }
435}
436#endif /*LV_HAVE_RVV*/
437
438#endif /* INCLUDED_volk_32fc_convert_16ic_u_H */
static float rintf(float x)
Definition config.h:45
static void volk_32fc_convert_16ic_a_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition volk_32fc_convert_16ic.h:98
static void volk_32fc_convert_16ic_u_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition volk_32fc_convert_16ic.h:369
static void volk_32fc_convert_16ic_generic(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition volk_32fc_convert_16ic.h:279
#define __VOLK_PREFETCH(addr)
Definition volk_common.h:68
float complex lv_32fc_t
Definition volk_complex.h:74
short complex lv_16sc_t
Definition volk_complex.h:71
for i
Definition volk_config_fixed.tmpl.h:13