Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
volk_16ic_deinterleave_16i_x2.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

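/*
 * Usage sketch (illustrative only; the buffer names and sizes below are made
 * up for the example).  The kernel splits an interleaved complex 16-bit
 * vector into separate I and Q sample buffers, here called through the
 * volk_16ic_deinterleave_16i_x2 dispatcher from <volk/volk.h>:
 *
 * \code
 *   unsigned int num_points = 1024;
 *   size_t alignment = volk_get_alignment();
 *   lv_16sc_t* in = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * num_points, alignment);
 *   int16_t* i_out = (int16_t*)volk_malloc(sizeof(int16_t) * num_points, alignment);
 *   int16_t* q_out = (int16_t*)volk_malloc(sizeof(int16_t) * num_points, alignment);
 *
 *   // ... fill `in` with interleaved I/Q samples ...
 *
 *   volk_16ic_deinterleave_16i_x2(i_out, q_out, in, num_points);
 *
 *   volk_free(in);
 *   volk_free(i_out);
 *   volk_free(q_out);
 * \endcode
 */
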
#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
#define INCLUDED_volk_16ic_deinterleave_16i_x2_a_H

#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

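/*
 * Each 256-bit load below covers eight interleaved complex samples. Within
 * each 128-bit lane, the byte shuffle packs the I (real) words into the low
 * 64 bits and the Q (imaginary) words into the high 64 bits; the 64-bit and
 * 128-bit permutes in the loop then gather the I halves of both registers
 * into iOutputVal and the Q halves into qOutputVal.
 */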
static inline void volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
                                                         int16_t* qBuffer,
                                                         const lv_16sc_t* complexVector,
                                                         unsigned int num_points)
{
    unsigned int number = 0;
    const int8_t* complexVectorPtr = (int8_t*)complexVector;
    int16_t* iBufferPtr = iBuffer;
    int16_t* qBufferPtr = qBuffer;

    // Per-lane byte indices: Q bytes (15,14 ... 3,2) go to the upper 64 bits,
    // I bytes (13,12 ... 1,0) to the lower 64 bits of each 128-bit lane.
    __m256i MoveMask = _mm256_set_epi8(
        15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0,
        15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0);

    __m256i iMove2, iMove1;
    __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;

    unsigned int sixteenthPoints = num_points / 16;

    for (number = 0; number < sixteenthPoints; number++) {
        complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
        complexVectorPtr += 32;
        complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
        complexVectorPtr += 32;

        iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
        iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);

        iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
                                               _mm256_permute4x64_epi64(iMove2, 0x80),
                                               0x30);
        qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
                                               _mm256_permute4x64_epi64(iMove2, 0xd0),
                                               0x30);

        _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
        _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);

        iBufferPtr += 16;
        qBufferPtr += 16;
    }

    // Deinterleave any remaining samples one complex value at a time.
    number = sixteenthPoints * 16;
    int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
    for (; number < num_points; number++) {
        *iBufferPtr++ = *int16ComplexVectorPtr++;
        *qBufferPtr++ = *int16ComplexVectorPtr++;
    }
}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

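/*
 * Each 128-bit load covers four interleaved complex samples. The shuffle
 * masks pick out the I (or Q) words of one register and zero the rest
 * (index 0x80 writes a zero byte), placing the first register's words in the
 * low 64 bits and the second register's in the high 64 bits, so OR-ing the
 * two shuffles yields eight consecutive I (or Q) samples.
 */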
static inline void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer,
                                                          int16_t* qBuffer,
                                                          const lv_16sc_t* complexVector,
                                                          unsigned int num_points)
{
    unsigned int number = 0;
    const int8_t* complexVectorPtr = (int8_t*)complexVector;
    int16_t* iBufferPtr = iBuffer;
    int16_t* qBufferPtr = qBuffer;

    __m128i iMoveMask1 = _mm_set_epi8(
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
    __m128i iMoveMask2 = _mm_set_epi8(
        13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);

    __m128i qMoveMask1 = _mm_set_epi8(
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
    __m128i qMoveMask2 = _mm_set_epi8(
        15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);

    __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;

    unsigned int eighthPoints = num_points / 8;

    for (number = 0; number < eighthPoints; number++) {
        complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
        complexVectorPtr += 16;
        complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
        complexVectorPtr += 16;

        iOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, iMoveMask1),
                                  _mm_shuffle_epi8(complexVal2, iMoveMask2));
        qOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, qMoveMask1),
                                  _mm_shuffle_epi8(complexVal2, qMoveMask2));

        _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
        _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);

        iBufferPtr += 8;
        qBufferPtr += 8;
    }

    number = eighthPoints * 8;
    int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
    for (; number < num_points; number++) {
        *iBufferPtr++ = *int16ComplexVectorPtr++;
        *qBufferPtr++ = *int16ComplexVectorPtr++;
    }
}
#endif /* LV_HAVE_SSSE3 */

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

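/*
 * Without SSSE3's byte shuffle, this version sorts each register with 16-bit
 * and 32-bit shuffles so that the first register holds its I (or Q) samples
 * in the low 64 bits and the second register holds its samples in the high
 * 64 bits; the two halves are then masked and OR-ed together.
 */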
static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer,
                                                         int16_t* qBuffer,
                                                         const lv_16sc_t* complexVector,
                                                         unsigned int num_points)
{
    unsigned int number = 0;
    const int16_t* complexVectorPtr = (int16_t*)complexVector;
    int16_t* iBufferPtr = iBuffer;
    int16_t* qBufferPtr = qBuffer;
    __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1,
        qComplexVal2, iOutputVal, qOutputVal;
    __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
    __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);

    unsigned int eighthPoints = num_points / 8;

    for (number = 0; number < eighthPoints; number++) {
        complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
        complexVectorPtr += 8;
        complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
        complexVectorPtr += 8;

        iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
        iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
        iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));

        iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
        iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3, 1, 2, 0));
        iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));

        iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask),
                                  _mm_and_si128(iComplexVal2, highMask));

        _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);

        qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2, 0, 3, 1));
        qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2, 0, 3, 1));
        qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));

        qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
        qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
        qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));

        qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask),
                                  _mm_and_si128(qComplexVal2, highMask));

        _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);

        iBufferPtr += 8;
        qBufferPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *iBufferPtr++ = *complexVectorPtr++;
        *qBufferPtr++ = *complexVectorPtr++;
    }
}
#endif /* LV_HAVE_SSE2 */

#ifdef LV_HAVE_GENERIC

static inline void volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
                                                          int16_t* qBuffer,
                                                          const lv_16sc_t* complexVector,
                                                          unsigned int num_points)
{
    const int16_t* complexVectorPtr = (const int16_t*)complexVector;
    int16_t* iBufferPtr = iBuffer;
    int16_t* qBufferPtr = qBuffer;
    unsigned int number;
    for (number = 0; number < num_points; number++) {
        *iBufferPtr++ = *complexVectorPtr++;
        *qBufferPtr++ = *complexVectorPtr++;
    }
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_ORC

extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer,
                                                      int16_t* qBuffer,
                                                      const lv_16sc_t* complexVector,
                                                      int num_points);
static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer,
                                                        int16_t* qBuffer,
                                                        const lv_16sc_t* complexVector,
                                                        unsigned int num_points)
{
    volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
}
#endif /* LV_HAVE_ORC */

#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a_H */

#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_u_H
#define INCLUDED_volk_16ic_deinterleave_16i_x2_u_H

#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

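/*
 * Identical to the aligned AVX2 kernel above, except that it uses unaligned
 * loads and stores.
 */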
static inline void volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
                                                         int16_t* qBuffer,
                                                         const lv_16sc_t* complexVector,
                                                         unsigned int num_points)
{
    unsigned int number = 0;
    const int8_t* complexVectorPtr = (int8_t*)complexVector;
    int16_t* iBufferPtr = iBuffer;
    int16_t* qBufferPtr = qBuffer;

    __m256i MoveMask = _mm256_set_epi8(
        15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0,
        15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0);

    __m256i iMove2, iMove1;
    __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;

    unsigned int sixteenthPoints = num_points / 16;

    for (number = 0; number < sixteenthPoints; number++) {
        complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
        complexVectorPtr += 32;
        complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
        complexVectorPtr += 32;

        iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
        iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);

        iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
                                               _mm256_permute4x64_epi64(iMove2, 0x80),
                                               0x30);
        qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
                                               _mm256_permute4x64_epi64(iMove2, 0xd0),
                                               0x30);

        _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
        _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);

        iBufferPtr += 16;
        qBufferPtr += 16;
    }

    number = sixteenthPoints * 16;
    int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
    for (; number < num_points; number++) {
        *iBufferPtr++ = *int16ComplexVectorPtr++;
        *qBufferPtr++ = *int16ComplexVectorPtr++;
    }
}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

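/*
 * Each complex sample is loaded as a single 32-bit element; on little-endian
 * RISC-V the low 16 bits hold the I (real) part and the high 16 bits the
 * Q (imaginary) part, so narrowing shifts by 0 and 16 split the two streams.
 */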
static inline void volk_16ic_deinterleave_16i_x2_rvv(int16_t* iBuffer,
                                                      int16_t* qBuffer,
                                                      const lv_16sc_t* complexVector,
                                                      unsigned int num_points)
{
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
        vl = __riscv_vsetvl_e16m4(n);
        vuint32m8_t vc = __riscv_vle32_v_u32m8((const uint32_t*)complexVector, vl);
        vuint16m4_t vr = __riscv_vnsrl(vc, 0, vl);
        vuint16m4_t vi = __riscv_vnsrl(vc, 16, vl);
        __riscv_vse16((uint16_t*)iBuffer, vr, vl);
        __riscv_vse16((uint16_t*)qBuffer, vi, vl);
    }
}
#endif /*LV_HAVE_RVV*/

#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>

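/*
 * The two-field segment load deinterleaves in hardware: field 0 of each
 * segment is the I (real) sample and field 1 the Q (imaginary) sample.
 */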
static inline void volk_16ic_deinterleave_16i_x2_rvvseg(int16_t* iBuffer,
                                                         int16_t* qBuffer,
                                                         const lv_16sc_t* complexVector,
                                                         unsigned int num_points)
{
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
        vl = __riscv_vsetvl_e16m4(n);
        vuint16m4x2_t vc =
            __riscv_vlseg2e16_v_u16m4x2((const uint16_t*)complexVector, vl);
        vuint16m4_t vr = __riscv_vget_u16m4(vc, 0);
        vuint16m4_t vi = __riscv_vget_u16m4(vc, 1);
        __riscv_vse16((uint16_t*)iBuffer, vr, vl);
        __riscv_vse16((uint16_t*)qBuffer, vi, vl);
    }
}
#endif /*LV_HAVE_RVVSEG*/

#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_u_H */