41#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
42#define INCLUDED_volk_8i_s32f_convert_32f_u_H
/*
 * Converts signed 8-bit samples to float and divides each one by scalar.
 * AVX2 implementation using unaligned loads and stores.
 *
 * outputVector: destination buffer, receives num_points floats
 * inputVector:  source buffer holding num_points int8_t samples
 * scalar:       divisor; applied as a multiplication by (1.0 / scalar)
 * num_points:   number of samples to convert
 */
static inline void volk_8i_s32f_convert_32f_u_avx2(float* outputVector,
                                                   const int8_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* outputVectorPtr = outputVector;
    const float iScalar = 1.0 / scalar;
    __m256 invScalar = _mm256_set1_ps(iScalar);
    const int8_t* inputVectorPtr = inputVector;
    __m256 ret;
    __m128i inputVal128;
    __m256i interimVal;

    for (; number < sixteenthPoints; number++) {
        inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr);

        /* Low 8 bytes: sign-extend to 32 bit, convert, scale, store. */
        interimVal = _mm256_cvtepi8_epi32(inputVal128);
        ret = _mm256_cvtepi32_ps(interimVal);
        ret = _mm256_mul_ps(ret, invScalar);
        _mm256_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 8;

        /* Shift the high 8 bytes down and repeat. */
        inputVal128 = _mm_srli_si128(inputVal128, 8);
        interimVal = _mm256_cvtepi8_epi32(inputVal128);
        ret = _mm256_cvtepi32_ps(interimVal);
        ret = _mm256_mul_ps(ret, invScalar);
        _mm256_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 8;

        /* Advance to the next 16 input samples; without this every
         * iteration would reprocess the first group. */
        inputVectorPtr += 16;
    }

    /* Scalar tail for the samples that do not fill a group of 16. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) * iScalar;
    }
}
/*
 * Converts signed 8-bit samples to float and divides each one by scalar.
 * SSE4.1 implementation using unaligned loads and stores.
 *
 * outputVector: destination buffer, receives num_points floats
 * inputVector:  source buffer holding num_points int8_t samples
 * scalar:       divisor; applied as a multiplication by (1.0 / scalar)
 * num_points:   number of samples to convert
 */
static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector,
                                                     const int8_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* outputVectorPtr = outputVector;
    const float iScalar = 1.0 / scalar;
    __m128 invScalar = _mm_set_ps1(iScalar);
    const int8_t* inputVectorPtr = inputVector;
    __m128 ret;
    __m128i inputVal;
    __m128i interimVal;

    for (; number < sixteenthPoints; number++) {
        inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);

        /* Bytes 0..3: sign-extend to 32 bit, convert, scale, store. */
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        /* Bytes 4..7: shift the register right by four bytes first. */
        inputVal = _mm_srli_si128(inputVal, 4);
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        /* Bytes 8..11. */
        inputVal = _mm_srli_si128(inputVal, 4);
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        /* Bytes 12..15. */
        inputVal = _mm_srli_si128(inputVal, 4);
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputVectorPtr += 16;
    }

    /* Scalar tail for the samples that do not fill a group of 16. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) * iScalar;
    }
}
152#ifdef LV_HAVE_GENERIC
/*
 * Converts signed 8-bit samples to float and divides each one by scalar.
 * Portable reference implementation, one sample per iteration.
 *
 * outputVector: destination buffer, receives num_points floats
 * inputVector:  source buffer holding num_points int8_t samples
 * scalar:       divisor; applied as a multiplication by (1.0 / scalar)
 * num_points:   number of samples to convert; 0 leaves the output untouched
 */
static inline void volk_8i_s32f_convert_32f_generic(float* outputVector,
                                                    const int8_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    float* outputVectorPtr = outputVector;
    const int8_t* inputVectorPtr = inputVector;
    /* Compute the reciprocal once so the loop multiplies instead of divides. */
    const float iScalar = 1.0 / scalar;

    for (unsigned int number = 0; number < num_points; number++) {
        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
    }
}
173#ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
174#define INCLUDED_volk_8i_s32f_convert_32f_a_H
180#include <immintrin.h>
/*
 * Converts signed 8-bit samples to float and divides each one by scalar.
 * AVX2 implementation using aligned loads and stores: the vector loop reads
 * with _mm_load_si128 and writes with _mm256_store_ps, so inputVector must be
 * 16-byte aligned and outputVector 32-byte aligned.
 *
 * outputVector: destination buffer, receives num_points floats
 * inputVector:  source buffer holding num_points int8_t samples
 * scalar:       divisor; applied as a multiplication by (1.0 / scalar)
 * num_points:   number of samples to convert
 */
static inline void volk_8i_s32f_convert_32f_a_avx2(float* outputVector,
                                                   const int8_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* outputVectorPtr = outputVector;
    const float iScalar = 1.0 / scalar;
    __m256 invScalar = _mm256_set1_ps(iScalar);
    const int8_t* inputVectorPtr = inputVector;
    __m256 ret;
    __m128i inputVal128;
    __m256i interimVal;

    for (; number < sixteenthPoints; number++) {
        inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr);

        /* Low 8 bytes: sign-extend to 32 bit, convert, scale, store. */
        interimVal = _mm256_cvtepi8_epi32(inputVal128);
        ret = _mm256_cvtepi32_ps(interimVal);
        ret = _mm256_mul_ps(ret, invScalar);
        _mm256_store_ps(outputVectorPtr, ret);
        outputVectorPtr += 8;

        /* Shift the high 8 bytes down and repeat. */
        inputVal128 = _mm_srli_si128(inputVal128, 8);
        interimVal = _mm256_cvtepi8_epi32(inputVal128);
        ret = _mm256_cvtepi32_ps(interimVal);
        ret = _mm256_mul_ps(ret, invScalar);
        _mm256_store_ps(outputVectorPtr, ret);
        outputVectorPtr += 8;

        inputVectorPtr += 16;
    }

    /* Scalar tail for the samples that do not fill a group of 16. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) * iScalar;
    }
}
225#include <smmintrin.h>
/*
 * Converts signed 8-bit samples to float and divides each one by scalar.
 * SSE4.1 implementation using aligned loads and stores: the vector loop uses
 * _mm_load_si128 and _mm_store_ps, so both buffers must be 16-byte aligned.
 *
 * outputVector: destination buffer, receives num_points floats
 * inputVector:  source buffer holding num_points int8_t samples
 * scalar:       divisor; applied as a multiplication by (1.0 / scalar)
 * num_points:   number of samples to convert
 */
static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector,
                                                     const int8_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* outputVectorPtr = outputVector;
    const float iScalar = 1.0 / scalar;
    __m128 invScalar = _mm_set_ps1(iScalar);
    const int8_t* inputVectorPtr = inputVector;
    __m128 ret;
    __m128i inputVal;
    __m128i interimVal;

    for (; number < sixteenthPoints; number++) {
        inputVal = _mm_load_si128((__m128i*)inputVectorPtr);

        /* Bytes 0..3: sign-extend to 32 bit, convert, scale, store. */
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_store_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        /* Bytes 4..7: shift the register right by four bytes first. */
        inputVal = _mm_srli_si128(inputVal, 4);
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_store_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        /* Bytes 8..11. */
        inputVal = _mm_srli_si128(inputVal, 4);
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_store_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        /* Bytes 12..15. */
        inputVal = _mm_srli_si128(inputVal, 4);
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_store_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputVectorPtr += 16;
    }

    /* Scalar tail for the samples that do not fill a group of 16. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) * iScalar;
    }
}
/*
 * Converts signed 8-bit samples to float and divides each one by scalar.
 * NEON implementation processing 16 samples per iteration.
 *
 * outputVector: destination buffer, receives num_points floats
 * inputVector:  source buffer holding num_points int8_t samples
 * scalar:       divisor; applied as a multiplication by (1.0 / scalar)
 * num_points:   number of samples to convert
 */
static inline void volk_8i_s32f_convert_32f_neon(float* outputVector,
                                                 const int8_t* inputVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    float* outputVectorPtr = outputVector;
    const int8_t* inputVectorPtr = inputVector;

    const float iScalar = 1.0 / scalar;
    const float32x4_t qiScalar = vdupq_n_f32(iScalar);

    int8x16_t inputVal;

    int16x8_t lower;
    int16x8_t higher;

    float32x4_t outputFloat;

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;
    for (; number < sixteenthPoints; number++) {
        inputVal = vld1q_s8(inputVectorPtr);
        inputVectorPtr += 16;

        /* Widen the 16 bytes into two vectors of eight 16-bit values. */
        lower = vmovl_s8(vget_low_s8(inputVal));
        higher = vmovl_s8(vget_high_s8(inputVal));

        /* Each half is widened again to 32 bit, converted and scaled,
         * four samples at a time. */
        outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(lower))), qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat);
        outputVectorPtr += 4;

        outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(lower))), qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat);
        outputVectorPtr += 4;

        outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(higher))), qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat);
        outputVectorPtr += 4;

        /* The result must be assigned before the store; otherwise the
         * previous quarter would be written a second time. */
        outputFloat =
            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(higher))), qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat);
        outputVectorPtr += 4;
    }

    /* Scalar tail; both pointers already sit just past the vectorized part. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
    }
}
/*
 * ORC kernel defined outside this header.
 * NOTE(review): only the first two parameters of this prototype are visible
 * here; the trailing (scalar, num_points) pair is inferred from the call in
 * the wrapper below - confirm the exact types against the ORC-generated
 * definition.
 */
extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector,
                                                const int8_t* inputVector,
                                                const float scalar,
                                                int num_points);

/*
 * Converts signed 8-bit samples to float and divides each one by scalar by
 * delegating to the ORC kernel.
 *
 * outputVector: destination buffer, receives num_points floats
 * inputVector:  source buffer holding num_points int8_t samples
 * scalar:       divisor; its reciprocal is what gets passed to the kernel
 * num_points:   number of samples to convert
 */
static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector,
                                                  const int8_t* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    /* The kernel receives 1/scalar rather than the divisor itself. */
    float invscalar = 1.0 / scalar;
    volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);
}
354#include <riscv_vector.h>
/*
 * Converts signed 8-bit samples to float and divides each one by scalar.
 * RISC-V Vector implementation; the vector length is chosen per iteration by
 * __riscv_vsetvl_e8m2, so no separate scalar tail loop is needed.
 *
 * outputVector: destination buffer, receives num_points floats
 * inputVector:  source buffer holding num_points int8_t samples
 * scalar:       divisor; applied as a multiplication by (1.0f / scalar)
 * num_points:   number of samples to convert
 */
static inline void volk_8i_s32f_convert_32f_rvv(float* outputVector,
                                                const int8_t* inputVector,
                                                const float scalar,
                                                unsigned int num_points)
{
    const float invScalar = 1.0f / scalar;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
        vl = __riscv_vsetvl_e8m2(n);
        /* Sign-extend int8 -> int16, then the widening convert yields float32. */
        vint16m4_t v = __riscv_vsext_vf2(__riscv_vle8_v_i8m2(inputVector, vl), vl);
        /* Scale and write the vl converted samples to the output buffer. */
        __riscv_vse32(
            outputVector, __riscv_vfmul(__riscv_vfwcvt_f(v, vl), invScalar, vl), vl);
    }
}
static void volk_8i_s32f_convert_32f_generic(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition volk_8i_s32f_convert_32f.h:154
static void volk_8i_s32f_convert_32f_neon(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition volk_8i_s32f_convert_32f.h:286