Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_x2_s32f_interleave_16ic.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
61
62#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
63#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
64
65#include <inttypes.h>
66#include <stdio.h>
67#include <volk/volk_common.h>
68
69#ifdef LV_HAVE_AVX2
70#include <immintrin.h>
71
72static inline void volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector,
73 const float* iBuffer,
74 const float* qBuffer,
75 const float scalar,
76 unsigned int num_points)
77{
78 unsigned int number = 0;
79 const float* iBufferPtr = iBuffer;
80 const float* qBufferPtr = qBuffer;
81
82 __m256 vScalar = _mm256_set1_ps(scalar);
83
84 const unsigned int eighthPoints = num_points / 8;
85
86 __m256 iValue, qValue, cplxValue1, cplxValue2;
87 __m256i intValue1, intValue2;
88
89 int16_t* complexVectorPtr = (int16_t*)complexVector;
90
91 for (; number < eighthPoints; number++) {
92 iValue = _mm256_load_ps(iBufferPtr);
93 qValue = _mm256_load_ps(qBufferPtr);
94
95 // Interleaves the lower two values in the i and q variables into one buffer
96 cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
97 cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
98
99 // Interleaves the upper two values in the i and q variables into one buffer
100 cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
101 cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
102
103 intValue1 = _mm256_cvtps_epi32(cplxValue1);
104 intValue2 = _mm256_cvtps_epi32(cplxValue2);
105
106 intValue1 = _mm256_packs_epi32(intValue1, intValue2);
107
108 _mm256_store_si256((__m256i*)complexVectorPtr, intValue1);
109 complexVectorPtr += 16;
110
111 iBufferPtr += 8;
112 qBufferPtr += 8;
113 }
114
115 number = eighthPoints * 8;
116 complexVectorPtr = (int16_t*)(&complexVector[number]);
117 for (; number < num_points; number++) {
118 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
119 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
120 }
121}
122#endif /* LV_HAVE_AVX2 */
123
124
125#ifdef LV_HAVE_SSE2
126#include <emmintrin.h>
127
128static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector,
129 const float* iBuffer,
130 const float* qBuffer,
131 const float scalar,
132 unsigned int num_points)
133{
134 unsigned int number = 0;
135 const float* iBufferPtr = iBuffer;
136 const float* qBufferPtr = qBuffer;
137
138 __m128 vScalar = _mm_set_ps1(scalar);
139
140 const unsigned int quarterPoints = num_points / 4;
141
142 __m128 iValue, qValue, cplxValue1, cplxValue2;
143 __m128i intValue1, intValue2;
144
145 int16_t* complexVectorPtr = (int16_t*)complexVector;
146
147 for (; number < quarterPoints; number++) {
148 iValue = _mm_load_ps(iBufferPtr);
149 qValue = _mm_load_ps(qBufferPtr);
150
151 // Interleaves the lower two values in the i and q variables into one buffer
152 cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
153 cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
154
155 // Interleaves the upper two values in the i and q variables into one buffer
156 cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
157 cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
158
159 intValue1 = _mm_cvtps_epi32(cplxValue1);
160 intValue2 = _mm_cvtps_epi32(cplxValue2);
161
162 intValue1 = _mm_packs_epi32(intValue1, intValue2);
163
164 _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
165 complexVectorPtr += 8;
166
167 iBufferPtr += 4;
168 qBufferPtr += 4;
169 }
170
171 number = quarterPoints * 4;
172 complexVectorPtr = (int16_t*)(&complexVector[number]);
173 for (; number < num_points; number++) {
174 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
175 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
176 }
177}
178#endif /* LV_HAVE_SSE2 */
179
180
181#ifdef LV_HAVE_SSE
182#include <xmmintrin.h>
183
184static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector,
185 const float* iBuffer,
186 const float* qBuffer,
187 const float scalar,
188 unsigned int num_points)
189{
190 unsigned int number = 0;
191 const float* iBufferPtr = iBuffer;
192 const float* qBufferPtr = qBuffer;
193
194 __m128 vScalar = _mm_set_ps1(scalar);
195
196 const unsigned int quarterPoints = num_points / 4;
197
198 __m128 iValue, qValue, cplxValue;
199
200 int16_t* complexVectorPtr = (int16_t*)complexVector;
201
202 __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
203
204 for (; number < quarterPoints; number++) {
205 iValue = _mm_load_ps(iBufferPtr);
206 qValue = _mm_load_ps(qBufferPtr);
207
208 // Interleaves the lower two values in the i and q variables into one buffer
209 cplxValue = _mm_unpacklo_ps(iValue, qValue);
210 cplxValue = _mm_mul_ps(cplxValue, vScalar);
211
212 _mm_store_ps(floatBuffer, cplxValue);
213
214 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
215 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
216 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
217 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
218
219 // Interleaves the upper two values in the i and q variables into one buffer
220 cplxValue = _mm_unpackhi_ps(iValue, qValue);
221 cplxValue = _mm_mul_ps(cplxValue, vScalar);
222
223 _mm_store_ps(floatBuffer, cplxValue);
224
225 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
226 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
227 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
228 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
229
230 iBufferPtr += 4;
231 qBufferPtr += 4;
232 }
233
234 number = quarterPoints * 4;
235 complexVectorPtr = (int16_t*)(&complexVector[number]);
236 for (; number < num_points; number++) {
237 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
238 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
239 }
240}
241#endif /* LV_HAVE_SSE */
242
243
244#ifdef LV_HAVE_GENERIC
245
246static inline void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector,
247 const float* iBuffer,
248 const float* qBuffer,
249 const float scalar,
250 unsigned int num_points)
251{
252 int16_t* complexVectorPtr = (int16_t*)complexVector;
253 const float* iBufferPtr = iBuffer;
254 const float* qBufferPtr = qBuffer;
255 unsigned int number = 0;
256
257 for (number = 0; number < num_points; number++) {
258 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
259 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
260 }
261}
262#endif /* LV_HAVE_GENERIC */
263
264
265#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H */
266
267#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
268#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
269
270#include <inttypes.h>
271#include <stdio.h>
272#include <volk/volk_common.h>
273
274#ifdef LV_HAVE_AVX2
275#include <immintrin.h>
276
277static inline void volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector,
278 const float* iBuffer,
279 const float* qBuffer,
280 const float scalar,
281 unsigned int num_points)
282{
283 unsigned int number = 0;
284 const float* iBufferPtr = iBuffer;
285 const float* qBufferPtr = qBuffer;
286
287 __m256 vScalar = _mm256_set1_ps(scalar);
288
289 const unsigned int eighthPoints = num_points / 8;
290
291 __m256 iValue, qValue, cplxValue1, cplxValue2;
292 __m256i intValue1, intValue2;
293
294 int16_t* complexVectorPtr = (int16_t*)complexVector;
295
296 for (; number < eighthPoints; number++) {
297 iValue = _mm256_loadu_ps(iBufferPtr);
298 qValue = _mm256_loadu_ps(qBufferPtr);
299
300 // Interleaves the lower two values in the i and q variables into one buffer
301 cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
302 cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
303
304 // Interleaves the upper two values in the i and q variables into one buffer
305 cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
306 cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
307
308 intValue1 = _mm256_cvtps_epi32(cplxValue1);
309 intValue2 = _mm256_cvtps_epi32(cplxValue2);
310
311 intValue1 = _mm256_packs_epi32(intValue1, intValue2);
312
313 _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1);
314 complexVectorPtr += 16;
315
316 iBufferPtr += 8;
317 qBufferPtr += 8;
318 }
319
320 number = eighthPoints * 8;
321 complexVectorPtr = (int16_t*)(&complexVector[number]);
322 for (; number < num_points; number++) {
323 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
324 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
325 }
326}
327#endif /* LV_HAVE_AVX2 */
328
329#ifdef LV_HAVE_RVV
330#include <riscv_vector.h>
331
332static inline void volk_32f_x2_s32f_interleave_16ic_rvv(lv_16sc_t* complexVector,
333 const float* iBuffer,
334 const float* qBuffer,
335 const float scalar,
336 unsigned int num_points)
337{
338 uint32_t* out = (uint32_t*)complexVector;
339 size_t n = num_points;
340 for (size_t vl; n > 0; n -= vl, out += vl, iBuffer += vl, qBuffer += vl) {
341 vl = __riscv_vsetvl_e32m8(n);
342 vfloat32m8_t vrf = __riscv_vle32_v_f32m8(iBuffer, vl);
343 vfloat32m8_t vif = __riscv_vle32_v_f32m8(qBuffer, vl);
344 vint16m4_t vri = __riscv_vfncvt_x(__riscv_vfmul(vrf, scalar, vl), vl);
345 vint16m4_t vii = __riscv_vfncvt_x(__riscv_vfmul(vif, scalar, vl), vl);
346 vuint16m4_t vr = __riscv_vreinterpret_u16m4(vri);
347 vuint16m4_t vi = __riscv_vreinterpret_u16m4(vii);
348 vuint32m8_t vc = __riscv_vwmaccu(__riscv_vwaddu_vv(vr, vi, vl), 0xFFFF, vi, vl);
349 __riscv_vse32(out, vc, vl);
350 }
351}
352#endif /*LV_HAVE_RVV*/
353
354#ifdef LV_HAVE_RVVSEG
355#include <riscv_vector.h>
356
357static inline void volk_32f_x2_s32f_interleave_16ic_rvvseg(lv_16sc_t* complexVector,
358 const float* iBuffer,
359 const float* qBuffer,
360 const float scalar,
361 unsigned int num_points)
362{
363 size_t n = num_points;
364 for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
365 vl = __riscv_vsetvl_e32m8(n);
366 vfloat32m8_t vrf = __riscv_vle32_v_f32m8(iBuffer, vl);
367 vfloat32m8_t vif = __riscv_vle32_v_f32m8(qBuffer, vl);
368 vint16m4_t vri = __riscv_vfncvt_x(__riscv_vfmul(vrf, scalar, vl), vl);
369 vint16m4_t vii = __riscv_vfncvt_x(__riscv_vfmul(vif, scalar, vl), vl);
370 __riscv_vsseg2e16(
371 (int16_t*)complexVector, __riscv_vcreate_v_i16m4x2(vri, vii), vl);
372 }
373}
374#endif /*LV_HAVE_RVVSEG*/
375
376#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H */
static float rintf(float x)
Definition config.h:45
static void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t *complexVector, const float *iBuffer, const float *qBuffer, const float scalar, unsigned int num_points)
Definition volk_32f_x2_s32f_interleave_16ic.h:128
static void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t *complexVector, const float *iBuffer, const float *qBuffer, const float scalar, unsigned int num_points)
Definition volk_32f_x2_s32f_interleave_16ic.h:184
static void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t *complexVector, const float *iBuffer, const float *qBuffer, const float scalar, unsigned int num_points)
Definition volk_32f_x2_s32f_interleave_16ic.h:246
#define __VOLK_ATTR_ALIGNED(x)
Definition volk_common.h:62
short complex lv_16sc_t
Definition volk_complex.h:71