Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_16ic_deinterleave_real_8i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
40
41#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a_H
42#define INCLUDED_volk_16ic_deinterleave_real_8i_a_H
43
44#include <inttypes.h>
45#include <stdio.h>
46
47
48#ifdef LV_HAVE_AVX2
49#include <immintrin.h>
50
51static inline void volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
52 const lv_16sc_t* complexVector,
53 unsigned int num_points)
54{
55 unsigned int number = 0;
56 const int8_t* complexVectorPtr = (int8_t*)complexVector;
57 int8_t* iBufferPtr = iBuffer;
58 __m256i iMoveMask1 = _mm256_set_epi8(0x80,
59 0x80,
60 0x80,
61 0x80,
62 0x80,
63 0x80,
64 0x80,
65 0x80,
66 13,
67 12,
68 9,
69 8,
70 5,
71 4,
72 1,
73 0,
74 0x80,
75 0x80,
76 0x80,
77 0x80,
78 0x80,
79 0x80,
80 0x80,
81 0x80,
82 13,
83 12,
84 9,
85 8,
86 5,
87 4,
88 1,
89 0);
90 __m256i iMoveMask2 = _mm256_set_epi8(13,
91 12,
92 9,
93 8,
94 5,
95 4,
96 1,
97 0,
98 0x80,
99 0x80,
100 0x80,
101 0x80,
102 0x80,
103 0x80,
104 0x80,
105 0x80,
106 13,
107 12,
108 9,
109 8,
110 5,
111 4,
112 1,
113 0,
114 0x80,
115 0x80,
116 0x80,
117 0x80,
118 0x80,
119 0x80,
120 0x80,
121 0x80);
122 __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
123
124 unsigned int thirtysecondPoints = num_points / 32;
125
126 for (number = 0; number < thirtysecondPoints; number++) {
127 complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
128 complexVectorPtr += 32;
129 complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
130 complexVectorPtr += 32;
131
132 complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr);
133 complexVectorPtr += 32;
134 complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr);
135 complexVectorPtr += 32;
136
137 complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
138 complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
139
140 complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
141 complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
142
143 complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
144 complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
145
146 complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
147 complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
148
149 complexVal1 = _mm256_srai_epi16(complexVal1, 8);
150 complexVal3 = _mm256_srai_epi16(complexVal3, 8);
151
152 iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
153 iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
154
155 _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
156
157 iBufferPtr += 32;
158 }
159
160 number = thirtysecondPoints * 32;
161 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
162 for (; number < num_points; number++) {
163 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
164 int16ComplexVectorPtr++;
165 }
166}
167#endif /* LV_HAVE_AVX2 */
168
169
170#ifdef LV_HAVE_SSSE3
171#include <tmmintrin.h>
172
173static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
174 const lv_16sc_t* complexVector,
175 unsigned int num_points)
176{
177 unsigned int number = 0;
178 const int8_t* complexVectorPtr = (int8_t*)complexVector;
179 int8_t* iBufferPtr = iBuffer;
180 __m128i iMoveMask1 = _mm_set_epi8(
181 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
182 __m128i iMoveMask2 = _mm_set_epi8(
183 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
184 __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
185
186 unsigned int sixteenthPoints = num_points / 16;
187
188 for (number = 0; number < sixteenthPoints; number++) {
189 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
190 complexVectorPtr += 16;
191 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
192 complexVectorPtr += 16;
193
194 complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr);
195 complexVectorPtr += 16;
196 complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr);
197 complexVectorPtr += 16;
198
199 complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
200 complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
201
202 complexVal1 = _mm_or_si128(complexVal1, complexVal2);
203
204 complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
205 complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
206
207 complexVal3 = _mm_or_si128(complexVal3, complexVal4);
208
209
210 complexVal1 = _mm_srai_epi16(complexVal1, 8);
211 complexVal3 = _mm_srai_epi16(complexVal3, 8);
212
213 iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
214
215 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
216
217 iBufferPtr += 16;
218 }
219
220 number = sixteenthPoints * 16;
221 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
222 for (; number < num_points; number++) {
223 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
224 int16ComplexVectorPtr++;
225 }
226}
227#endif /* LV_HAVE_SSSE3 */
228
229#ifdef LV_HAVE_GENERIC
230
231static inline void volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer,
232 const lv_16sc_t* complexVector,
233 unsigned int num_points)
234{
235 unsigned int number = 0;
236 int16_t* complexVectorPtr = (int16_t*)complexVector;
237 int8_t* iBufferPtr = iBuffer;
238 for (number = 0; number < num_points; number++) {
239 *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
240 complexVectorPtr++;
241 }
242}
243#endif /* LV_HAVE_GENERIC */
244
245#ifdef LV_HAVE_NEON
246#include <arm_neon.h>
247
/*!
 * \brief NEON kernel: stores the high byte of the first 16-bit component of
 *        every complex sample.
 *
 * \param iBuffer       Output buffer; receives num_points int8_t values.
 * \param complexVector Interleaved 16-bit input.
 * \param num_points    Number of complex samples to process.
 */
static inline void volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer,
                                                       const lv_16sc_t* complexVector,
                                                       unsigned int num_points)
{
    const int16_t* src = (const int16_t*)complexVector;
    int8_t* dst = iBuffer;
    const unsigned int vecIterations = num_points / 8;
    unsigned int i;

    /* Vector part: 8 complex samples (16 int16_t values) per pass. */
    for (i = 0; i < vecIterations; i++) {
        /* De-interleaving load: val[0] holds the first component of each pair. */
        const int16x8x2_t planes = vld2q_s16(src);
        /* Shift right by 8 and narrow to 8 bits in one step. */
        const int8x8_t highBytes = vshrn_n_s16(planes.val[0], 8);
        vst1_s8(dst, highBytes);
        src += 16;
        dst += 8;
    }

    /* Scalar tail for the remaining (num_points % 8) samples. */
    for (i = vecIterations * 8; i < num_points; i++) {
        const int16_t first = src[0];
        *dst++ = (int8_t)(first >> 8);
        src += 2;
    }
}
272#endif
273
274#ifdef LV_HAVE_ORC
275
276extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer,
277 const lv_16sc_t* complexVector,
278 int num_points);
279
280static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer,
281 const lv_16sc_t* complexVector,
282 unsigned int num_points)
283{
284 volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points);
285}
286#endif /* LV_HAVE_ORC */
287
288
289#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a_H */
290
291#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_u_H
292#define INCLUDED_volk_16ic_deinterleave_real_8i_u_H
293
294#include <inttypes.h>
295#include <stdio.h>
296
297
298#ifdef LV_HAVE_AVX2
299#include <immintrin.h>
300
301static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
302 const lv_16sc_t* complexVector,
303 unsigned int num_points)
304{
305 unsigned int number = 0;
306 const int8_t* complexVectorPtr = (int8_t*)complexVector;
307 int8_t* iBufferPtr = iBuffer;
308 __m256i iMoveMask1 = _mm256_set_epi8(0x80,
309 0x80,
310 0x80,
311 0x80,
312 0x80,
313 0x80,
314 0x80,
315 0x80,
316 13,
317 12,
318 9,
319 8,
320 5,
321 4,
322 1,
323 0,
324 0x80,
325 0x80,
326 0x80,
327 0x80,
328 0x80,
329 0x80,
330 0x80,
331 0x80,
332 13,
333 12,
334 9,
335 8,
336 5,
337 4,
338 1,
339 0);
340 __m256i iMoveMask2 = _mm256_set_epi8(13,
341 12,
342 9,
343 8,
344 5,
345 4,
346 1,
347 0,
348 0x80,
349 0x80,
350 0x80,
351 0x80,
352 0x80,
353 0x80,
354 0x80,
355 0x80,
356 13,
357 12,
358 9,
359 8,
360 5,
361 4,
362 1,
363 0,
364 0x80,
365 0x80,
366 0x80,
367 0x80,
368 0x80,
369 0x80,
370 0x80,
371 0x80);
372 __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
373
374 unsigned int thirtysecondPoints = num_points / 32;
375
376 for (number = 0; number < thirtysecondPoints; number++) {
377 complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
378 complexVectorPtr += 32;
379 complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
380 complexVectorPtr += 32;
381
382 complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
383 complexVectorPtr += 32;
384 complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
385 complexVectorPtr += 32;
386
387 complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
388 complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
389
390 complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
391 complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
392
393 complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
394 complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
395
396 complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
397 complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
398
399 complexVal1 = _mm256_srai_epi16(complexVal1, 8);
400 complexVal3 = _mm256_srai_epi16(complexVal3, 8);
401
402 iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
403 iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
404
405 _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
406
407 iBufferPtr += 32;
408 }
409
410 number = thirtysecondPoints * 32;
411 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
412 for (; number < num_points; number++) {
413 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
414 int16ComplexVectorPtr++;
415 }
416}
417#endif /* LV_HAVE_AVX2 */
418
419
420#ifdef LV_HAVE_RVV
421#include <riscv_vector.h>
422
/*!
 * \brief RISC-V Vector kernel: treats each complex sample as one 32-bit word
 *        and stores bits 8..15 of that word as the output byte.
 *
 * \param iBuffer       Output buffer; receives num_points bytes.
 * \param complexVector Input buffer, read as num_points 32-bit words.
 * \param num_points    Number of complex samples to process.
 */
static inline void volk_16ic_deinterleave_real_8i_rvv(int8_t* iBuffer,
                                                      const lv_16sc_t* complexVector,
                                                      unsigned int num_points)
{
    const uint32_t* src = (const uint32_t*)complexVector;
    uint8_t* dst = (uint8_t*)iBuffer;
    size_t remaining = num_points;

    while (remaining > 0) {
        /* Let the hardware choose how many elements this pass handles. */
        const size_t vl = __riscv_vsetvl_e32m8(remaining);
        vuint32m8_t words = __riscv_vle32_v_u32m8(src, vl);

        /* Narrowing shift by 0 keeps the low 16 bits of each word
         * (presumably the real part on a little-endian target). */
        vuint16m4_t lowHalves = __riscv_vnsrl(words, 0, vl);
        /* Narrowing shift by 8 keeps the upper byte of that 16-bit value. */
        vuint8m2_t highBytes = __riscv_vnsrl(lowHalves, 8, vl);

        __riscv_vse8(dst, highBytes, vl);

        src += vl;
        dst += vl;
        remaining -= vl;
    }
}
436#endif /*LV_HAVE_RVV*/
437
438#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_u_H */
static void volk_16ic_deinterleave_real_8i_generic(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition volk_16ic_deinterleave_real_8i.h:231
static void volk_16ic_deinterleave_real_8i_neon(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition volk_16ic_deinterleave_real_8i.h:248
static void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition volk_16ic_deinterleave_real_8i.h:173
short complex lv_16sc_t
Definition volk_complex.h:71