41#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
42#define INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
/*
 * Aligned AVX2 kernel: splits interleaved 16-bit values into two separate
 * int16_t output streams. For every input pair the first value goes to
 * iBuffer and the second to qBuffer (see the scalar tail loop).
 *
 * iBuffer    - destination for the first value of each pair
 * num_points - number of input pairs to process
 * The body also uses qBuffer (second output) and complexVector (input);
 * their declarations are not visible in this listing.
 *
 * Input and both outputs are accessed with _mm256_load_si256 and
 * _mm256_store_si256, which require 32-byte-aligned addresses.
 */
49static inline void volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
52 unsigned int num_points)
54 unsigned int number = 0;
/* Byte-typed cursor so the input can be advanced 32 bytes (one 256-bit
 * register) per load. */
55 const int8_t* complexVectorPtr = (int8_t*)complexVector;
56 int16_t* iBufferPtr = iBuffer;
57 int16_t* qBufferPtr = qBuffer;
/* Per-byte shuffle control used with _mm256_shuffle_epi8 below.
 * NOTE(review): only the first argument is visible in this listing;
 * presumably the mask groups the first-of-pair and second-of-pair words
 * inside each 128-bit lane - confirm against the full file. */
59 __m256i MoveMask = _mm256_set_epi8(15,
92 __m256i iMove2, iMove1;
93 __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
/* Each vector iteration consumes two 256-bit loads = 16 input pairs. */
95 unsigned int sixteenthPoints = num_points / 16;
97 for (number = 0; number < sixteenthPoints; number++) {
98 complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
99 complexVectorPtr += 32;
100 complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
101 complexVectorPtr += 32;
/* Reorder bytes within each 128-bit lane of both loaded registers. */
103 iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
104 iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
/* 0x08 puts 64-bit lanes 0 and 2 of iMove1 into the low 128 bits; 0x80 puts
 * lanes 0 and 2 of iMove2 into the high 128 bits.
 * NOTE(review): the permute2x128 selector is not visible here; presumably it
 * keeps the low half of the first operand and the high half of the second -
 * confirm. */
106 iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
107 _mm256_permute4x64_epi64(iMove2, 0x80),
/* Same scheme with 64-bit lanes 1 and 3 (0x0d for the low half, 0xd0 for the
 * high half). */
109 qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
110 _mm256_permute4x64_epi64(iMove2, 0xd0),
113 _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
114 _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
/* NOTE(review): no advance of iBufferPtr/qBufferPtr is visible inside the
 * loop in this listing; each store writes 16 int16_t values, so the full file
 * should advance both by 16 per iteration - confirm. */
/* Scalar tail: handles the num_points % 16 pairs left after the vector loop,
 * reading the input as consecutive int16_t values. */
120 number = sixteenthPoints * 16;
121 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
122 for (; number < num_points; number++) {
123 *iBufferPtr++ = *int16ComplexVectorPtr++;
124 *qBufferPtr++ = *int16ComplexVectorPtr++;
130#include <tmmintrin.h>
/*
 * Aligned SSSE3 kernel: splits interleaved 16-bit values into iBuffer (first
 * value of each pair) and qBuffer (second value of each pair).
 * NOTE(review): the opening line of this definition is not visible in this
 * listing; only the trailing num_points parameter is shown.
 *
 * num_points - number of input pairs to process
 *
 * Uses _mm_load_si128 / _mm_store_si128, which require 16-byte-aligned
 * addresses for the input and both outputs.
 */
135 unsigned int num_points)
137 unsigned int number = 0;
/* Byte-typed cursor so the input can be advanced 16 bytes per load. */
138 const int8_t* complexVectorPtr = (int8_t*)complexVector;
139 int16_t* iBufferPtr = iBuffer;
140 int16_t* qBufferPtr = qBuffer;
/* _mm_set_epi8 lists bytes from position 15 down to 0; a control byte of 0x80
 * makes _mm_shuffle_epi8 write zero to that position.
 * iMoveMask1: source bytes 0,1,4,5,8,9,12,13 (16-bit words 0,2,4,6) go to the
 * low 8 bytes, and the high 8 bytes are zeroed. */
142 __m128i iMoveMask1 = _mm_set_epi8(
143 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
/* iMoveMask2: the same source words go to the high 8 bytes, low 8 zeroed. */
144 __m128i iMoveMask2 = _mm_set_epi8(
145 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
/* qMoveMask1: source bytes 2,3,6,7,10,11,14,15 (16-bit words 1,3,5,7) go to
 * the low 8 bytes, and the high 8 bytes are zeroed. */
147 __m128i qMoveMask1 = _mm_set_epi8(
148 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
/* qMoveMask2: the same source words go to the high 8 bytes, low 8 zeroed. */
149 __m128i qMoveMask2 = _mm_set_epi8(
150 15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
152 __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
/* Each vector iteration consumes two 128-bit loads = 8 input pairs. */
154 unsigned int eighthPoints = num_points / 8;
156 for (number = 0; number < eighthPoints; number++) {
157 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
158 complexVectorPtr += 16;
159 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
160 complexVectorPtr += 16;
/* The two shuffles fill disjoint halves (the other half is zero), so OR
 * merges them into one register of 8 output values. */
162 iOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, iMoveMask1),
163 _mm_shuffle_epi8(complexVal2, iMoveMask2));
164 qOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, qMoveMask1),
165 _mm_shuffle_epi8(complexVal2, qMoveMask2));
167 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
168 _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
/* NOTE(review): no advance of iBufferPtr/qBufferPtr is visible inside the
 * loop in this listing; each store writes 8 int16_t values, so the full file
 * should advance both by 8 per iteration - confirm. */
/* Scalar tail: handles the num_points % 8 pairs left after the vector loop. */
174 number = eighthPoints * 8;
175 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
176 for (; number < num_points; number++) {
177 *iBufferPtr++ = *int16ComplexVectorPtr++;
178 *qBufferPtr++ = *int16ComplexVectorPtr++;
184#include <emmintrin.h>
/*
 * Aligned SSE2 kernel: splits interleaved 16-bit values into iBuffer (first
 * value of each pair) and qBuffer (second value of each pair) using only
 * 16-bit/32-bit shuffles and bit masks.
 * NOTE(review): the opening line of this definition is not visible in this
 * listing; only the trailing num_points parameter is shown.
 *
 * num_points - number of input pairs to process
 *
 * Uses _mm_load_si128 / _mm_store_si128, which require 16-byte-aligned
 * addresses for the input and both outputs.
 */
189 unsigned int num_points)
191 unsigned int number = 0;
/* int16_t cursor: advancing by 8 moves one full 128-bit register. */
192 const int16_t* complexVectorPtr = (int16_t*)complexVector;
193 int16_t* iBufferPtr = iBuffer;
194 int16_t* qBufferPtr = qBuffer;
195 __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1,
196 qComplexVal2, iOutputVal, qOutputVal;
/* lowMask keeps the low 64 bits of a register, highMask the high 64 bits. */
197 __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
198 __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
/* Each vector iteration consumes two 128-bit loads = 8 input pairs. */
200 unsigned int eighthPoints = num_points / 8;
202 for (number = 0; number < eighthPoints; number++) {
203 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
204 complexVectorPtr += 8;
205 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
206 complexVectorPtr += 8;
/* First register, even-indexed words: reorder each 64-bit half to word order
 * 0,2,1,3, then reorder the 32-bit elements to 0,2,1,3 so that words 0,2,4,6
 * end up in the low 64 bits. */
208 iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
210 iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
212 iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
/* Second register: same word reorder, but the final 32-bit shuffle (order
 * 1,3,0,2) places words 0,2,4,6 in the HIGH 64 bits instead. */
214 iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
216 iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3, 1, 2, 0));
218 iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
/* Keep the low half from the first register and the high half from the
 * second, giving 8 consecutive first-of-pair values. */
220 iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask),
221 _mm_and_si128(iComplexVal2, highMask));
223 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
/* Odd-indexed words: word order 1,3,0,2 within each 64-bit half, then the
 * same 32-bit shuffles as above gather words 1,3,5,7 into the low half (first
 * register) and the high half (second register). */
225 qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2, 0, 3, 1));
227 qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2, 0, 3, 1));
229 qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
231 qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
233 qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
235 qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
237 qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask),
238 _mm_and_si128(qComplexVal2, highMask));
240 _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
/* NOTE(review): no advance of iBufferPtr/qBufferPtr is visible inside the
 * loop in this listing; each store writes 8 int16_t values, so the full file
 * should advance both by 8 per iteration - confirm. */
/* Scalar tail: handles the num_points % 8 pairs left after the vector loop. */
246 number = eighthPoints * 8;
247 for (; number < num_points; number++) {
248 *iBufferPtr++ = *complexVectorPtr++;
249 *qBufferPtr++ = *complexVectorPtr++;
254#ifdef LV_HAVE_GENERIC
/*
 * Portable scalar kernel: walks the input as consecutive int16_t values and
 * writes them alternately to iBuffer (first of each pair) and qBuffer (second
 * of each pair).
 * NOTE(review): the opening line of this definition and the declaration of
 * the loop counter `number` are not visible in this listing.
 *
 * num_points - number of input pairs to process
 */
259 unsigned int num_points)
/* View the input as a flat int16_t stream, two values per point. */
261 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
262 int16_t* iBufferPtr = iBuffer;
263 int16_t* qBufferPtr = qBuffer;
/* One pair in, one value out to each buffer per iteration. */
265 for (number = 0; number < num_points; number++) {
266 *iBufferPtr++ = *complexVectorPtr++;
267 *qBufferPtr++ = *complexVectorPtr++;
/* Externally defined implementation that the wrapper below forwards to.
 * NOTE(review): the rest of this prototype is not visible in this listing. */
274extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer,
/*
 * Thin wrapper: passes iBuffer, qBuffer, complexVector and num_points through
 * unchanged to volk_16ic_deinterleave_16i_x2_a_orc_impl and does no work of
 * its own.
 * NOTE(review): this wrapper carries a "_u_" name while the callee carries
 * "_a_"; whether the callee tolerates unaligned buffers cannot be determined
 * from this listing - confirm.
 */
278static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer,
281 unsigned int num_points)
283 volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
290#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_u_H
291#define INCLUDED_volk_16ic_deinterleave_16i_x2_u_H
296#include <immintrin.h>
/*
 * Unaligned AVX2 kernel: splits interleaved 16-bit values into two separate
 * int16_t output streams. For every input pair the first value goes to
 * iBuffer and the second to qBuffer (see the scalar tail loop).
 *
 * iBuffer    - destination for the first value of each pair
 * num_points - number of input pairs to process
 * The body also uses qBuffer (second output) and complexVector (input);
 * their declarations are not visible in this listing.
 *
 * All vector memory accesses use _mm256_loadu_si256 / _mm256_storeu_si256,
 * which place no alignment requirement on the addresses.
 */
298static inline void volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
301 unsigned int num_points)
303 unsigned int number = 0;
/* Byte-typed cursor so the input can be advanced 32 bytes (one 256-bit
 * register) per load. */
304 const int8_t* complexVectorPtr = (int8_t*)complexVector;
305 int16_t* iBufferPtr = iBuffer;
306 int16_t* qBufferPtr = qBuffer;
/* Per-byte shuffle control used with _mm256_shuffle_epi8 below.
 * NOTE(review): only the first argument is visible in this listing;
 * presumably the mask groups the first-of-pair and second-of-pair words
 * inside each 128-bit lane - confirm against the full file. */
308 __m256i MoveMask = _mm256_set_epi8(15,
341 __m256i iMove2, iMove1;
342 __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
/* Each vector iteration consumes two 256-bit loads = 16 input pairs. */
344 unsigned int sixteenthPoints = num_points / 16;
346 for (number = 0; number < sixteenthPoints; number++) {
347 complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
348 complexVectorPtr += 32;
349 complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
350 complexVectorPtr += 32;
/* Reorder bytes within each 128-bit lane of both loaded registers. */
352 iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
353 iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
/* 0x08 puts 64-bit lanes 0 and 2 of iMove1 into the low 128 bits; 0x80 puts
 * lanes 0 and 2 of iMove2 into the high 128 bits.
 * NOTE(review): the permute2x128 selector is not visible here; presumably it
 * keeps the low half of the first operand and the high half of the second -
 * confirm. */
355 iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
356 _mm256_permute4x64_epi64(iMove2, 0x80),
/* Same scheme with 64-bit lanes 1 and 3 (0x0d for the low half, 0xd0 for the
 * high half). */
358 qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
359 _mm256_permute4x64_epi64(iMove2, 0xd0),
362 _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
363 _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
/* NOTE(review): no advance of iBufferPtr/qBufferPtr is visible inside the
 * loop in this listing; each store writes 16 int16_t values, so the full file
 * should advance both by 16 per iteration - confirm. */
/* Scalar tail: handles the num_points % 16 pairs left after the vector loop,
 * reading the input as consecutive int16_t values. */
369 number = sixteenthPoints * 16;
370 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
371 for (; number < num_points; number++) {
372 *iBufferPtr++ = *int16ComplexVectorPtr++;
373 *qBufferPtr++ = *int16ComplexVectorPtr++;
379#include <riscv_vector.h>
/*
 * RISC-V Vector kernel: loads each input element as one 32-bit value and
 * splits it into its low and high 16-bit halves with narrowing shifts.
 *
 * iBuffer    - receives the low 16 bits of every 32-bit input element
 * num_points - number of input elements to process
 * The body also uses qBuffer (receives the high 16 bits) and complexVector
 * (input); their declarations are not visible in this listing.
 *
 * NOTE(review): treating the low half as the first value of each pair assumes
 * little-endian memory order - confirm for the target.
 */
381static inline void volk_16ic_deinterleave_16i_x2_rvv(int16_t* iBuffer,
384 unsigned int num_points)
386 size_t n = num_points;
/* Strip-mined loop: each pass handles vl elements, where vl is chosen by
 * vsetvl from the remaining count; all three pointers then advance by vl. */
387 for (
size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
388 vl = __riscv_vsetvl_e16m4(n);
/* Load vl input elements, each as a single 32-bit lane. */
389 vuint32m8_t vc = __riscv_vle32_v_u32m8((
const uint32_t*)complexVector, vl);
/* Narrowing shift right by 0 keeps bits 15..0; by 16 keeps bits 31..16. */
390 vuint16m4_t vr = __riscv_vnsrl(vc, 0, vl);
391 vuint16m4_t vi = __riscv_vnsrl(vc, 16, vl);
392 __riscv_vse16((uint16_t*)iBuffer, vr, vl);
393 __riscv_vse16((uint16_t*)qBuffer, vi, vl);
399#include <riscv_vector.h>
/*
 * RISC-V Vector kernel using a two-field segment load: the load itself
 * separates the interleaved 16-bit values into two vector registers groups,
 * which are then stored to the two outputs.
 *
 * iBuffer    - receives field 0 of every two-value segment
 * num_points - number of two-value segments to process
 * The body also uses qBuffer (receives field 1) and complexVector (input);
 * their declarations are not visible in this listing.
 */
401static inline void volk_16ic_deinterleave_16i_x2_rvvseg(int16_t* iBuffer,
404 unsigned int num_points)
406 size_t n = num_points;
/* Strip-mined loop: each pass handles vl segments, where vl is chosen by
 * vsetvl from the remaining count; all three pointers then advance by vl. */
407 for (
size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
408 vl = __riscv_vsetvl_e16m4(n);
/* Segment load of vl two-field 16-bit segments.
 * NOTE(review): the declaration that receives this result (used as `vc`
 * below) is not visible in this listing. */
410 __riscv_vlseg2e16_v_u16m4x2((
const uint16_t*)complexVector, vl);
/* Extract field 0 and field 1 of the loaded tuple. */
411 vuint16m4_t vr = __riscv_vget_u16m4(vc, 0);
412 vuint16m4_t vi = __riscv_vget_u16m4(vc, 1);
413 __riscv_vse16((uint16_t*)iBuffer, vr, vl);
414 __riscv_vse16((uint16_t*)qBuffer, vi, vl);
static void volk_16ic_deinterleave_16i_x2_generic(int16_t *iBuffer, int16_t *qBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition volk_16ic_deinterleave_16i_x2.h:256
static void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t *iBuffer, int16_t *qBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition volk_16ic_deinterleave_16i_x2.h:186
static void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t *iBuffer, int16_t *qBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition volk_16ic_deinterleave_16i_x2.h:132
short complex lv_16sc_t
Definition volk_complex.h:71