Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_16ic_convert_32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2016 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
43
44
45#ifndef INCLUDED_volk_16ic_convert_32fc_a_H
46#define INCLUDED_volk_16ic_convert_32fc_a_H
47
48#include <volk/volk_complex.h>
49
50#ifdef LV_HAVE_AVX2
51#include <immintrin.h>
52
53static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector,
54 const lv_16sc_t* inputVector,
55 unsigned int num_points)
56{
57 const unsigned int avx_iters = num_points / 8;
58 unsigned int number = 0;
59 const int16_t* complexVectorPtr = (int16_t*)inputVector;
60 float* outputVectorPtr = (float*)outputVector;
61 __m256 outVal;
62 __m256i outValInt;
63 __m128i cplxValue;
64
65 for (number = 0; number < avx_iters; number++) {
66 cplxValue = _mm_load_si128((__m128i*)complexVectorPtr);
67 complexVectorPtr += 8;
68
69 outValInt = _mm256_cvtepi16_epi32(cplxValue);
70 outVal = _mm256_cvtepi32_ps(outValInt);
71 _mm256_store_ps((float*)outputVectorPtr, outVal);
72
73 outputVectorPtr += 8;
74 }
75
76 number = avx_iters * 8;
77 for (; number < num_points * 2; number++) {
78 *outputVectorPtr++ = (float)*complexVectorPtr++;
79 }
80}
81
82#endif /* LV_HAVE_AVX2 */
83
84#ifdef LV_HAVE_GENERIC
85
86static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector,
87 const lv_16sc_t* inputVector,
88 unsigned int num_points)
89{
90 unsigned int i;
91 for (i = 0; i < num_points; i++) {
92 outputVector[i] =
93 lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i]));
94 }
95}
96
97#endif /* LV_HAVE_GENERIC */
98
99
100#ifdef LV_HAVE_SSE2
101#include <emmintrin.h>
102
103static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector,
104 const lv_16sc_t* inputVector,
105 unsigned int num_points)
106{
107 const unsigned int sse_iters = num_points / 2;
108
109 const lv_16sc_t* _in = inputVector;
110 lv_32fc_t* _out = outputVector;
111 __m128 a;
112 unsigned int number;
113
114 for (number = 0; number < sse_iters; number++) {
115 a = _mm_set_ps(
116 (float)(lv_cimag(_in[1])),
117 (float)(lv_creal(_in[1])),
118 (float)(lv_cimag(_in[0])),
119 (float)(lv_creal(
120 _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
121 _mm_store_ps((float*)_out, a);
122 _in += 2;
123 _out += 2;
124 }
125 if (num_points & 1) {
126 *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
127 _in++;
128 }
129}
130
131#endif /* LV_HAVE_SSE2 */
132
133#ifdef LV_HAVE_AVX
134#include <immintrin.h>
135
136static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector,
137 const lv_16sc_t* inputVector,
138 unsigned int num_points)
139{
140 const unsigned int sse_iters = num_points / 4;
141
142 const lv_16sc_t* _in = inputVector;
143 lv_32fc_t* _out = outputVector;
144 __m256 a;
145 unsigned int i, number;
146
147 for (number = 0; number < sse_iters; number++) {
148 a = _mm256_set_ps(
149 (float)(lv_cimag(_in[3])),
150 (float)(lv_creal(_in[3])),
151 (float)(lv_cimag(_in[2])),
152 (float)(lv_creal(_in[2])),
153 (float)(lv_cimag(_in[1])),
154 (float)(lv_creal(_in[1])),
155 (float)(lv_cimag(_in[0])),
156 (float)(lv_creal(
157 _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
158 _mm256_store_ps((float*)_out, a);
159 _in += 4;
160 _out += 4;
161 }
162
163 for (i = 0; i < (num_points % 4); ++i) {
164 *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
165 _in++;
166 }
167}
168
169#endif /* LV_HAVE_AVX */
170
171
172#ifdef LV_HAVE_NEON
173#include <arm_neon.h>
174
/*
 * NEON implementation: converts complex 16-bit integer samples to complex
 * floats, two samples (four int16 values) per iteration.
 *
 * outputVector: destination buffer, receives num_points complex floats
 * inputVector:  source buffer holding num_points complex int16 samples
 * num_points:   number of complex samples to convert
 */
static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector,
                                               const lv_16sc_t* inputVector,
                                               unsigned int num_points)
{
    const unsigned int pair_count = num_points / 2;
    const lv_16sc_t* src = inputVector;
    lv_32fc_t* dst = outputVector;
    unsigned int k;

    for (k = 0; k < pair_count; ++k) {
        /* four int16 values = two interleaved (real, imag) samples */
        const int16x4_t narrow = vld1_s16((const int16_t*)src);
        __VOLK_PREFETCH(src + 4);
        /* widen to int32, convert to float, store all four lanes */
        vst1q_f32((float32_t*)dst, vcvtq_f32_s32(vmovl_s16(narrow)));
        src += 2;
        dst += 2;
    }

    /* An odd sample count leaves exactly one sample for the scalar path. */
    if ((num_points % 2) != 0) {
        *dst = lv_cmake((float)lv_creal(*src), (float)lv_cimag(*src));
    }
}
203#endif /* LV_HAVE_NEON */
204
205#endif /* INCLUDED_volk_16ic_convert_32fc_a_H */
206
207#ifndef INCLUDED_volk_16ic_convert_32fc_u_H
208#define INCLUDED_volk_16ic_convert_32fc_u_H
209
210#include <volk/volk_complex.h>
211
212
213#ifdef LV_HAVE_AVX2
214#include <immintrin.h>
215
216static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector,
217 const lv_16sc_t* inputVector,
218 unsigned int num_points)
219{
220 const unsigned int avx_iters = num_points / 8;
221 unsigned int number = 0;
222 const int16_t* complexVectorPtr = (int16_t*)inputVector;
223 float* outputVectorPtr = (float*)outputVector;
224 __m256 outVal;
225 __m256i outValInt;
226 __m128i cplxValue;
227
228 for (number = 0; number < avx_iters; number++) {
229 cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr);
230 complexVectorPtr += 8;
231
232 outValInt = _mm256_cvtepi16_epi32(cplxValue);
233 outVal = _mm256_cvtepi32_ps(outValInt);
234 _mm256_storeu_ps((float*)outputVectorPtr, outVal);
235
236 outputVectorPtr += 8;
237 }
238
239 number = avx_iters * 8;
240 for (; number < num_points * 2; number++) {
241 *outputVectorPtr++ = (float)*complexVectorPtr++;
242 }
243}
244
245#endif /* LV_HAVE_AVX2 */
246
247#ifdef LV_HAVE_SSE2
248#include <emmintrin.h>
249
250static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector,
251 const lv_16sc_t* inputVector,
252 unsigned int num_points)
253{
254 const unsigned int sse_iters = num_points / 2;
255
256 const lv_16sc_t* _in = inputVector;
257 lv_32fc_t* _out = outputVector;
258 __m128 a;
259 unsigned int number;
260
261 for (number = 0; number < sse_iters; number++) {
262 a = _mm_set_ps(
263 (float)(lv_cimag(_in[1])),
264 (float)(lv_creal(_in[1])),
265 (float)(lv_cimag(_in[0])),
266 (float)(lv_creal(
267 _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
268 _mm_storeu_ps((float*)_out, a);
269 _in += 2;
270 _out += 2;
271 }
272 if (num_points & 1) {
273 *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
274 _in++;
275 }
276}
277
278#endif /* LV_HAVE_SSE2 */
279
280
281#ifdef LV_HAVE_AVX
282#include <immintrin.h>
283
284static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector,
285 const lv_16sc_t* inputVector,
286 unsigned int num_points)
287{
288 const unsigned int sse_iters = num_points / 4;
289
290 const lv_16sc_t* _in = inputVector;
291 lv_32fc_t* _out = outputVector;
292 __m256 a;
293 unsigned int i, number;
294
295 for (number = 0; number < sse_iters; number++) {
296 a = _mm256_set_ps(
297 (float)(lv_cimag(_in[3])),
298 (float)(lv_creal(_in[3])),
299 (float)(lv_cimag(_in[2])),
300 (float)(lv_creal(_in[2])),
301 (float)(lv_cimag(_in[1])),
302 (float)(lv_creal(_in[1])),
303 (float)(lv_cimag(_in[0])),
304 (float)(lv_creal(
305 _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
306 _mm256_storeu_ps((float*)_out, a);
307 _in += 4;
308 _out += 4;
309 }
310
311 for (i = 0; i < (num_points % 4); ++i) {
312 *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
313 _in++;
314 }
315}
316
317#endif /* LV_HAVE_AVX */
318
319#ifdef LV_HAVE_RVV
320#include <riscv_vector.h>
321
/*
 * RISC-V Vector implementation: treats the complex buffers as flat arrays of
 * interleaved scalars and converts int16 values to float in strips whose
 * length is chosen by vsetvl on each pass.
 *
 * outputVector: destination buffer, receives num_points complex floats
 * inputVector:  source buffer holding num_points complex int16 samples
 * num_points:   number of complex samples to convert
 */
static inline void volk_16ic_convert_32fc_rvv(lv_32fc_t* outputVector,
                                              const lv_16sc_t* inputVector,
                                              unsigned int num_points)
{
    const int16_t* in = (const int16_t*)inputVector;
    float* out = (float*)outputVector;
    /* Two scalars per complex sample. Widen before multiplying so the product
     * is computed in size_t rather than wrapping in unsigned int. */
    size_t n = (size_t)num_points * 2;
    for (size_t vl; n > 0; n -= vl, in += vl, out += vl) {
        /* vl = number of int16 elements handled in this strip */
        vl = __riscv_vsetvl_e16m4(n);
        vint16m4_t v = __riscv_vle16_v_i16m4(in, vl);
        /* widening int16 -> float32 conversion, then store */
        __riscv_vse32(out, __riscv_vfwcvt_f(v, vl), vl);
    }
}
335#endif /*LV_HAVE_RVV*/
336
337#endif /* INCLUDED_volk_16ic_convert_32fc_u_H */
static void volk_16ic_convert_32fc_generic(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition volk_16ic_convert_32fc.h:86
static void volk_16ic_convert_32fc_u_avx(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition volk_16ic_convert_32fc.h:284
static void volk_16ic_convert_32fc_a_avx(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition volk_16ic_convert_32fc.h:136
static void volk_16ic_convert_32fc_u_sse2(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition volk_16ic_convert_32fc.h:250
static void volk_16ic_convert_32fc_neon(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition volk_16ic_convert_32fc.h:175
static void volk_16ic_convert_32fc_a_sse2(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition volk_16ic_convert_32fc.h:103
#define __VOLK_PREFETCH(addr)
Definition volk_common.h:68
#define lv_cimag(x)
Definition volk_complex.h:98
#define lv_cmake(r, i)
Definition volk_complex.h:77
#define lv_creal(x)
Definition volk_complex.h:96
float complex lv_32fc_t
Definition volk_complex.h:74
short complex lv_16sc_t
Definition volk_complex.h:71
for i
Definition volk_config_fixed.tmpl.h:13