33#ifndef INCLUDED_volk_32fc_convert_16ic_a_H
34#define INCLUDED_volk_32fc_convert_16ic_a_H
/*
 * AVX2 kernel using the aligned load/store intrinsics: views the input as a
 * flat array of floats and the output as a flat array of int16_t, clamps each
 * float to the int16_t range, converts it to an integer and stores it.
 * The vector loop handles 16 floats (8 points) per iteration; a scalar loop
 * handles whatever is left, up to num_points * 2 floats in total.
 *
 * NOTE(review): this is an extracted listing with gaps in its embedded line
 * numbering (the inputVector parameter, the braces and the declarations of
 * i, aux, ret1 and ret2 are not shown). Comments describe visible lines only.
 */
43static inline void volk_32fc_convert_16ic_a_avx2(
lv_16sc_t* outputVector,
45 unsigned int num_points)
// Number of full vector iterations; each one covers 8 points.
47 const unsigned int avx_iters = num_points / 8;
49 float* inputVectorPtr = (
float*)inputVector;
50 int16_t* outputVectorPtr = (int16_t*)outputVector;
// Clamp bounds: the int16_t limits expressed as floats.
53 const float min_val = (float)SHRT_MIN;
54 const float max_val = (float)SHRT_MAX;
56 __m256 inputVal1, inputVal2;
57 __m256i intInputVal1, intInputVal2;
// The same bounds broadcast to every vector element.
59 const __m256 vmin_val = _mm256_set1_ps(min_val);
60 const __m256 vmax_val = _mm256_set1_ps(max_val);
// Main vector loop.
63 for (
i = 0;
i < avx_iters;
i++) {
// Two 8-float loads.
// NOTE(review): listing lines 65 and 67-70 are omitted; presumably the input
// pointer is advanced between and after these loads -- confirm in the full file.
64 inputVal1 = _mm256_load_ps((
float*)inputVectorPtr);
66 inputVal2 = _mm256_load_ps((
float*)inputVectorPtr);
// Clamp to [min_val, max_val] before converting.
71 ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
72 ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
// Convert the clamped floats to 32-bit integers.
74 intInputVal1 = _mm256_cvtps_epi32(ret1);
75 intInputVal2 = _mm256_cvtps_epi32(ret2);
// Narrow the two 32-bit vectors into one vector of 16-bit values, then
// reorder the four 64-bit lanes as 0,2,1,3 (imm 0xd8). Presumably this undoes
// the per-128-bit-lane ordering produced by the pack -- confirm.
77 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
78 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
// Store 16 int16_t results and advance the output by the same amount.
80 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
81 outputVectorPtr += 16;
// Scalar tail: the index counts floats (two per point), starting where the
// vector loop stopped.
84 for (
i = avx_iters * 16;
i < num_points * 2;
i++) {
85 aux = *inputVectorPtr++;
// NOTE(review): only the lower-bound branch of the clamp is visible; the
// preceding test and both assignments sit on omitted listing lines.
88 else if (aux < min_val)
// Round with rintf() and store as int16_t.
90 *outputVectorPtr++ = (int16_t)
rintf(aux);
/*
 * Tail of the parameter list and body of the SSE2 kernel that uses the
 * aligned load/store intrinsics. The signature line itself is not shown here;
 * the cross-reference index at the end of this listing names the definition
 * at listing line 98 as volk_32fc_convert_16ic_a_sse2.
 * The vector loop handles 8 floats (4 points) per iteration, clamping to the
 * int16_t range and converting; a scalar loop handles the remaining floats.
 *
 * NOTE(review): the embedded line numbering has gaps (braces and the
 * declarations of i, aux, ret1 and ret2 are not shown). Comments describe
 * visible lines only.
 */
100 unsigned int num_points)
// Number of full vector iterations; each one covers 4 points.
102 const unsigned int sse_iters = num_points / 4;
104 float* inputVectorPtr = (
float*)inputVector;
105 int16_t* outputVectorPtr = (int16_t*)outputVector;
// Clamp bounds: the int16_t limits expressed as floats.
108 const float min_val = (float)SHRT_MIN;
109 const float max_val = (float)SHRT_MAX;
111 __m128 inputVal1, inputVal2;
112 __m128i intInputVal1, intInputVal2;
// The same bounds broadcast to every vector element.
114 const __m128 vmin_val = _mm_set_ps1(min_val);
115 const __m128 vmax_val = _mm_set_ps1(max_val);
// Main vector loop.
118 for (
i = 0;
i < sse_iters;
i++) {
// Two 4-float loads.
// NOTE(review): listing lines 120 and 122-125 are omitted; presumably the
// input pointer is advanced there -- confirm in the full file.
119 inputVal1 = _mm_load_ps((
float*)inputVectorPtr);
121 inputVal2 = _mm_load_ps((
float*)inputVectorPtr);
// Clamp to [min_val, max_val] before converting.
126 ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
127 ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
// Convert the clamped floats to 32-bit integers.
129 intInputVal1 = _mm_cvtps_epi32(ret1);
130 intInputVal2 = _mm_cvtps_epi32(ret2);
// Narrow the two 32-bit vectors into one vector of eight 16-bit values.
132 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
// Store 8 int16_t results and advance the output by the same amount.
134 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
135 outputVectorPtr += 8;
// Scalar tail: the index counts floats (two per point), starting where the
// vector loop stopped.
138 for (
i = sse_iters * 8;
i < num_points * 2;
i++) {
139 aux = *inputVectorPtr++;
// NOTE(review): only the lower-bound branch of the clamp is visible; the
// preceding test and both assignments sit on omitted listing lines.
142 else if (aux < min_val)
// Round with rintf() and store as int16_t.
144 *outputVectorPtr++ = (int16_t)
rintf(aux);
/*
 * NEON kernel: views the input as a flat array of float32_t and the output as
 * a flat array of int16_t. The vector loop handles 8 floats (4 points) per
 * iteration: clamp to the int16_t range, add or subtract 0.5 depending on the
 * sign, convert to 32-bit integers, narrow to 16 bits and store. A scalar loop
 * handles the remaining floats.
 *
 * NOTE(review): this is an extracted listing with gaps in its embedded line
 * numbering (the inputVector parameter, the braces and the declarations of
 * i, aux and res are not shown). Comments describe visible lines only.
 */
153static inline void volk_32fc_convert_16ic_neon(
lv_16sc_t* outputVector,
155 unsigned int num_points)
// Number of full vector iterations; each one covers 4 points.
158 const unsigned int neon_iters = num_points / 4;
160 float32_t* inputVectorPtr = (float32_t*)inputVector;
161 int16_t* outputVectorPtr = (int16_t*)outputVector;
// Clamp bounds: the int16_t limits expressed as floats (scalar copies are
// used by the tail loop).
163 const float min_val_f = (float)SHRT_MIN;
164 const float max_val_f = (float)SHRT_MAX;
// The same bounds broadcast to every vector element.
168 const float32x4_t min_val = vmovq_n_f32(min_val_f);
169 const float32x4_t max_val = vmovq_n_f32(max_val_f);
// Rounding offset used below.
170 float32x4_t half = vdupq_n_f32(0.5f);
171 float32x4_t ret1, ret2, a, b, sign, PlusHalf, Round;
173 int32x4_t toint_a = { 0, 0, 0, 0 };
174 int32x4_t toint_b = { 0, 0, 0, 0 };
175 int16x4_t intInputVal1, intInputVal2;
// Main vector loop.
178 for (
i = 0;
i < neon_iters;
i++) {
// Two 4-float loads.
// NOTE(review): listing lines 180 and 182-184 are omitted; presumably the
// input pointer is advanced there -- confirm in the full file.
179 a = vld1q_f32((
const float32_t*)(inputVectorPtr));
181 b = vld1q_f32((
const float32_t*)(inputVectorPtr));
// Clamp to [min_val, max_val] before converting.
185 ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
186 ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
// sign is the float's top (sign) bit converted to 0.0f or 1.0f, so
// Round = ret1 + 0.5 - sign, i.e. +0.5 for non-negative and -0.5 for
// negative inputs, followed by a float-to-int32 conversion.
// NOTE(review): add-half-then-convert may round exact .5 ties differently
// from the rintf() used in the scalar tail -- confirm whether that matters.
188 sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
189 PlusHalf = vaddq_f32(ret1, half);
190 Round = vsubq_f32(PlusHalf, sign);
191 toint_a = vcvtq_s32_f32(Round);
// Same rounding sequence for the second vector.
193 sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret2), 31)));
194 PlusHalf = vaddq_f32(ret2, half);
195 Round = vsubq_f32(PlusHalf, sign);
196 toint_b = vcvtq_s32_f32(Round);
// Narrow each int32x4 to int16x4.
198 intInputVal1 = vqmovn_s32(toint_a);
199 intInputVal2 = vqmovn_s32(toint_b);
// Join the two halves into one 8-element vector, store it and advance the
// output by 8.
201 res = vcombine_s16(intInputVal1, intInputVal2);
202 vst1q_s16((int16_t*)outputVectorPtr, res);
203 outputVectorPtr += 8;
// Scalar tail: the index counts floats (two per point), starting where the
// vector loop stopped.
206 for (
i = neon_iters * 8;
i < num_points * 2;
i++) {
207 aux = *inputVectorPtr++;
// NOTE(review): only the lower-bound branch of the clamp is visible; the
// preceding test and both assignments sit on omitted listing lines.
210 else if (aux < min_val_f)
// Round with rintf() and store as int16_t.
212 *outputVectorPtr++ = (int16_t)
rintf(aux);
/*
 * NEONv8 kernel: same overall shape as a clamp/convert/narrow loop over 8
 * floats (4 points) per iteration, but rounding is done with vrndiq_f32
 * before the float-to-int32 conversion. A scalar loop handles the remaining
 * floats, up to num_points * 2 in total.
 *
 * NOTE(review): this is an extracted listing with gaps in its embedded line
 * numbering (the inputVector parameter, the braces and the declarations of
 * i, aux and res are not shown). Comments describe visible lines only.
 */
221static inline void volk_32fc_convert_16ic_neonv8(
lv_16sc_t* outputVector,
223 unsigned int num_points)
// Number of full vector iterations; each one covers 4 points.
225 const unsigned int neon_iters = num_points / 4;
227 float32_t* inputVectorPtr = (float32_t*)inputVector;
228 int16_t* outputVectorPtr = (int16_t*)outputVector;
// Clamp bounds: the int16_t limits expressed as floats (scalar copies are
// used by the tail loop).
230 const float min_val_f = (float)SHRT_MIN;
231 const float max_val_f = (float)SHRT_MAX;
// The same bounds broadcast to every vector element.
235 const float32x4_t min_val = vmovq_n_f32(min_val_f);
236 const float32x4_t max_val = vmovq_n_f32(max_val_f);
237 float32x4_t ret1, ret2, a, b;
239 int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 };
240 int16x4_t intInputVal1, intInputVal2;
// Main vector loop.
243 for (
i = 0;
i < neon_iters;
i++) {
// Two 4-float loads.
// NOTE(review): listing lines 245 and 247-249 are omitted; presumably the
// input pointer is advanced there -- confirm in the full file.
244 a = vld1q_f32((
const float32_t*)(inputVectorPtr));
246 b = vld1q_f32((
const float32_t*)(inputVectorPtr));
// Clamp to [min_val, max_val] before converting.
250 ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
251 ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
// Round to an integral float, then convert to int32.
// NOTE(review): vrndiq_f32 presumably follows the current FP rounding mode,
// which would match rintf() in the scalar tail -- confirm against ARM docs.
254 toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
255 toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));
// Narrow each int32x4 to int16x4.
257 intInputVal1 = vqmovn_s32(toint_a);
258 intInputVal2 = vqmovn_s32(toint_b);
// Join the two halves into one 8-element vector, store it and advance the
// output by 8.
260 res = vcombine_s16(intInputVal1, intInputVal2);
261 vst1q_s16((int16_t*)outputVectorPtr, res);
262 outputVectorPtr += 8;
// Scalar tail: the index counts floats (two per point), starting where the
// vector loop stopped.
265 for (
i = neon_iters * 8;
i < num_points * 2;
i++) {
266 aux = *inputVectorPtr++;
// NOTE(review): only the lower-bound branch of the clamp is visible; the
// preceding test and both assignments sit on omitted listing lines.
269 else if (aux < min_val_f)
// Round with rintf() and store as int16_t.
271 *outputVectorPtr++ = (int16_t)
rintf(aux);
277#ifdef LV_HAVE_GENERIC
/*
 * Tail of the parameter list and body of the portable scalar kernel. The
 * signature line itself is not shown here; the cross-reference index at the
 * end of this listing names the definition at listing line 279 as
 * volk_32fc_convert_16ic_generic.
 * Walks num_points * 2 floats, one at a time, and writes one int16_t per
 * float, rounding with rintf().
 *
 * NOTE(review): the embedded line numbering has gaps (braces and the
 * declarations of i and aux are not shown). Comments describe visible lines
 * only.
 */
281 unsigned int num_points)
283 float* inputVectorPtr = (
float*)inputVector;
284 int16_t* outputVectorPtr = (int16_t*)outputVector;
// Clamp bounds: the int16_t limits expressed as floats.
285 const float min_val = (float)SHRT_MIN;
286 const float max_val = (float)SHRT_MAX;
// The index counts floats, two per point.
289 for (
i = 0;
i < num_points * 2;
i++) {
290 aux = *inputVectorPtr++;
// NOTE(review): only the lower-bound branch of the clamp is visible; the
// preceding test and both assignments sit on omitted listing lines.
293 else if (aux < min_val)
// Round with rintf() and store as int16_t.
295 *outputVectorPtr++ = (int16_t)
rintf(aux);
302#ifndef INCLUDED_volk_32fc_convert_16ic_u_H
303#define INCLUDED_volk_32fc_convert_16ic_u_H
311#include <immintrin.h>
/*
 * AVX2 kernel using the unaligned load/store intrinsics (loadu/storeu): views
 * the input as a flat array of floats and the output as a flat array of
 * int16_t, clamps each float to the int16_t range, converts it to an integer
 * and stores it. The vector loop handles 16 floats (8 points) per iteration;
 * a scalar loop handles whatever is left, up to num_points * 2 floats.
 *
 * NOTE(review): this is an extracted listing with gaps in its embedded line
 * numbering (the inputVector parameter, the braces and the declarations of
 * i, aux, ret1 and ret2 are not shown). Comments describe visible lines only.
 */
313static inline void volk_32fc_convert_16ic_u_avx2(
lv_16sc_t* outputVector,
315 unsigned int num_points)
// Number of full vector iterations; each one covers 8 points.
317 const unsigned int avx_iters = num_points / 8;
319 float* inputVectorPtr = (
float*)inputVector;
320 int16_t* outputVectorPtr = (int16_t*)outputVector;
// Clamp bounds: the int16_t limits expressed as floats.
323 const float min_val = (float)SHRT_MIN;
324 const float max_val = (float)SHRT_MAX;
326 __m256 inputVal1, inputVal2;
327 __m256i intInputVal1, intInputVal2;
// The same bounds broadcast to every vector element.
329 const __m256 vmin_val = _mm256_set1_ps(min_val);
330 const __m256 vmax_val = _mm256_set1_ps(max_val);
// Main vector loop.
333 for (
i = 0;
i < avx_iters;
i++) {
// Two unaligned 8-float loads.
// NOTE(review): listing lines 335 and 337-340 are omitted; presumably the
// input pointer is advanced there -- confirm in the full file.
334 inputVal1 = _mm256_loadu_ps((
float*)inputVectorPtr);
336 inputVal2 = _mm256_loadu_ps((
float*)inputVectorPtr);
// Clamp to [min_val, max_val] before converting.
341 ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
342 ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
// Convert the clamped floats to 32-bit integers.
344 intInputVal1 = _mm256_cvtps_epi32(ret1);
345 intInputVal2 = _mm256_cvtps_epi32(ret2);
// Narrow the two 32-bit vectors into one vector of 16-bit values, then
// reorder the four 64-bit lanes as 0,2,1,3 (imm 0xd8). Presumably this undoes
// the per-128-bit-lane ordering produced by the pack -- confirm.
347 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
348 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
// Store 16 int16_t results and advance the output by the same amount.
350 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
351 outputVectorPtr += 16;
// Scalar tail: the index counts floats (two per point), starting where the
// vector loop stopped.
354 for (
i = avx_iters * 16;
i < num_points * 2;
i++) {
355 aux = *inputVectorPtr++;
// NOTE(review): only the lower-bound branch of the clamp is visible; the
// preceding test and both assignments sit on omitted listing lines.
358 else if (aux < min_val)
// Round with rintf() and store as int16_t.
360 *outputVectorPtr++ = (int16_t)
rintf(aux);
367#include <emmintrin.h>
/*
 * Tail of the parameter list and body of the SSE2 kernel that uses the
 * unaligned load/store intrinsics (loadu/storeu). The signature line itself
 * is not shown here; the cross-reference index at the end of this listing
 * names the definition at listing line 369 as volk_32fc_convert_16ic_u_sse2.
 * The vector loop handles 8 floats (4 points) per iteration, clamping to the
 * int16_t range and converting; a scalar loop handles the remaining floats.
 *
 * NOTE(review): the embedded line numbering has gaps (braces and the
 * declarations of i, aux, ret1 and ret2 are not shown). Comments describe
 * visible lines only.
 */
371 unsigned int num_points)
// Number of full vector iterations; each one covers 4 points.
373 const unsigned int sse_iters = num_points / 4;
375 float* inputVectorPtr = (
float*)inputVector;
376 int16_t* outputVectorPtr = (int16_t*)outputVector;
// Clamp bounds: the int16_t limits expressed as floats.
379 const float min_val = (float)SHRT_MIN;
380 const float max_val = (float)SHRT_MAX;
382 __m128 inputVal1, inputVal2;
383 __m128i intInputVal1, intInputVal2;
// The same bounds broadcast to every vector element.
385 const __m128 vmin_val = _mm_set_ps1(min_val);
386 const __m128 vmax_val = _mm_set_ps1(max_val);
// Main vector loop.
389 for (
i = 0;
i < sse_iters;
i++) {
// Two unaligned 4-float loads.
// NOTE(review): listing lines 391 and 393-396 are omitted; presumably the
// input pointer is advanced there -- confirm in the full file.
390 inputVal1 = _mm_loadu_ps((
float*)inputVectorPtr);
392 inputVal2 = _mm_loadu_ps((
float*)inputVectorPtr);
// Clamp to [min_val, max_val] before converting.
397 ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
398 ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
// Convert the clamped floats to 32-bit integers.
400 intInputVal1 = _mm_cvtps_epi32(ret1);
401 intInputVal2 = _mm_cvtps_epi32(ret2);
// Narrow the two 32-bit vectors into one vector of eight 16-bit values.
403 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
// Store 8 int16_t results and advance the output by the same amount.
405 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
406 outputVectorPtr += 8;
// Scalar tail: the index counts floats (two per point), starting where the
// vector loop stopped.
409 for (
i = sse_iters * 8;
i < num_points * 2;
i++) {
410 aux = *inputVectorPtr++;
// NOTE(review): only the lower-bound branch of the clamp is visible; the
// preceding test and both assignments sit on omitted listing lines.
413 else if (aux < min_val)
// Round with rintf() and store as int16_t.
415 *outputVectorPtr++ = (int16_t)
rintf(aux);
421#include <riscv_vector.h>
/*
 * RISC-V Vector kernel: views the input as a flat array of floats and the
 * output as a flat array of int16_t, then processes num_points * 2 floats in
 * chunks whose length is chosen by the hardware on each pass.
 *
 * NOTE(review): this is an extracted listing with gaps in its embedded line
 * numbering (the inputVector parameter and the braces are not shown).
 * Comments describe visible lines only.
 */
423static inline void volk_32fc_convert_16ic_rvv(
lv_16sc_t* outputVector,
425 unsigned int num_points)
427 int16_t* out = (int16_t*)outputVector;
428 float* in = (
float*)inputVector;
// Total number of floats still to convert (two per point).
429 size_t n = num_points * 2;
// Each pass handles vl elements, then shrinks n and advances both pointers
// by vl; the loop ends when n reaches 0.
430 for (
size_t vl; n > 0; n -= vl, in += vl, out += vl) {
// Ask for a vector length for 32-bit elements at LMUL=8, capped by n.
431 vl = __riscv_vsetvl_e32m8(n);
432 vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl);
// Narrowing float-to-integer conversion stored as 16-bit elements.
// NOTE(review): no explicit clamp to the int16_t range is visible here,
// unlike the other kernels; presumably the narrowing convert saturates --
// confirm against the RVV intrinsics documentation.
433 __riscv_vse16(out, __riscv_vfncvt_x(v, vl), vl);
static float rintf(float x)
Definition config.h:45
static void volk_32fc_convert_16ic_a_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition volk_32fc_convert_16ic.h:98
static void volk_32fc_convert_16ic_u_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition volk_32fc_convert_16ic.h:369
static void volk_32fc_convert_16ic_generic(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition volk_32fc_convert_16ic.h:279
#define __VOLK_PREFETCH(addr)
Definition volk_common.h:68
float complex lv_32fc_t
Definition volk_complex.h:74
short complex lv_16sc_t
Definition volk_complex.h:71
for i
Definition volk_config_fixed.tmpl.h:13