Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_8ic_deinterleave_real_8i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
39
40#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
41#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
42
43#include <inttypes.h>
44#include <stdio.h>
45
46#ifdef LV_HAVE_AVX2
47#include <immintrin.h>
48
49static inline void volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
50 const lv_8sc_t* complexVector,
51 unsigned int num_points)
52{
53 unsigned int number = 0;
54 const int8_t* complexVectorPtr = (int8_t*)complexVector;
55 int8_t* iBufferPtr = iBuffer;
56 __m256i moveMask1 = _mm256_set_epi8(0x80,
57 0x80,
58 0x80,
59 0x80,
60 0x80,
61 0x80,
62 0x80,
63 0x80,
64 14,
65 12,
66 10,
67 8,
68 6,
69 4,
70 2,
71 0,
72 0x80,
73 0x80,
74 0x80,
75 0x80,
76 0x80,
77 0x80,
78 0x80,
79 0x80,
80 14,
81 12,
82 10,
83 8,
84 6,
85 4,
86 2,
87 0);
88 __m256i moveMask2 = _mm256_set_epi8(14,
89 12,
90 10,
91 8,
92 6,
93 4,
94 2,
95 0,
96 0x80,
97 0x80,
98 0x80,
99 0x80,
100 0x80,
101 0x80,
102 0x80,
103 0x80,
104 14,
105 12,
106 10,
107 8,
108 6,
109 4,
110 2,
111 0,
112 0x80,
113 0x80,
114 0x80,
115 0x80,
116 0x80,
117 0x80,
118 0x80,
119 0x80);
120 __m256i complexVal1, complexVal2, outputVal;
121
122 unsigned int thirtysecondPoints = num_points / 32;
123
124 for (number = 0; number < thirtysecondPoints; number++) {
125
126 complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
127 complexVectorPtr += 32;
128 complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
129 complexVectorPtr += 32;
130
131 complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
132 complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
133 outputVal = _mm256_or_si256(complexVal1, complexVal2);
134 outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
135
136 _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
137 iBufferPtr += 32;
138 }
139
140 number = thirtysecondPoints * 32;
141 for (; number < num_points; number++) {
142 *iBufferPtr++ = *complexVectorPtr++;
143 complexVectorPtr++;
144 }
145}
146#endif /* LV_HAVE_AVX2 */
147
148
149#ifdef LV_HAVE_SSSE3
150#include <tmmintrin.h>
151
152static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
153 const lv_8sc_t* complexVector,
154 unsigned int num_points)
155{
156 unsigned int number = 0;
157 const int8_t* complexVectorPtr = (int8_t*)complexVector;
158 int8_t* iBufferPtr = iBuffer;
159 __m128i moveMask1 = _mm_set_epi8(
160 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
161 __m128i moveMask2 = _mm_set_epi8(
162 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
163 __m128i complexVal1, complexVal2, outputVal;
164
165 unsigned int sixteenthPoints = num_points / 16;
166
167 for (number = 0; number < sixteenthPoints; number++) {
168 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
169 complexVectorPtr += 16;
170 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
171 complexVectorPtr += 16;
172
173 complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
174 complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
175
176 outputVal = _mm_or_si128(complexVal1, complexVal2);
177
178 _mm_store_si128((__m128i*)iBufferPtr, outputVal);
179 iBufferPtr += 16;
180 }
181
182 number = sixteenthPoints * 16;
183 for (; number < num_points; number++) {
184 *iBufferPtr++ = *complexVectorPtr++;
185 complexVectorPtr++;
186 }
187}
188#endif /* LV_HAVE_SSSE3 */
189
190
191#ifdef LV_HAVE_AVX
192#include <immintrin.h>
193
194static inline void volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer,
195 const lv_8sc_t* complexVector,
196 unsigned int num_points)
197{
198 unsigned int number = 0;
199 const int8_t* complexVectorPtr = (int8_t*)complexVector;
200 int8_t* iBufferPtr = iBuffer;
201 __m128i moveMaskL = _mm_set_epi8(
202 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
203 __m128i moveMaskH = _mm_set_epi8(
204 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
205 __m256i complexVal1, complexVal2, outputVal;
206 __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1,
207 outputVal2;
208
209 unsigned int thirtysecondPoints = num_points / 32;
210
211 for (number = 0; number < thirtysecondPoints; number++) {
212
213 complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
214 complexVectorPtr += 32;
215 complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
216 complexVectorPtr += 32;
217
218 complexVal1H = _mm256_extractf128_si256(complexVal1, 1);
219 complexVal1L = _mm256_extractf128_si256(complexVal1, 0);
220 complexVal2H = _mm256_extractf128_si256(complexVal2, 1);
221 complexVal2L = _mm256_extractf128_si256(complexVal2, 0);
222
223 complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH);
224 complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL);
225 outputVal1 = _mm_or_si128(complexVal1H, complexVal1L);
226
227
228 complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH);
229 complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL);
230 outputVal2 = _mm_or_si128(complexVal2H, complexVal2L);
231
232 __m256i dummy = _mm256_setzero_si256();
233 outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0);
234 outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1);
235
236
237 _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
238 iBufferPtr += 32;
239 }
240
241 number = thirtysecondPoints * 32;
242 for (; number < num_points; number++) {
243 *iBufferPtr++ = *complexVectorPtr++;
244 complexVectorPtr++;
245 }
246}
247#endif /* LV_HAVE_AVX */
248
249
250#ifdef LV_HAVE_GENERIC
251
252static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer,
253 const lv_8sc_t* complexVector,
254 unsigned int num_points)
255{
256 unsigned int number = 0;
257 const int8_t* complexVectorPtr = (int8_t*)complexVector;
258 int8_t* iBufferPtr = iBuffer;
259 for (number = 0; number < num_points; number++) {
260 *iBufferPtr++ = *complexVectorPtr++;
261 complexVectorPtr++;
262 }
263}
264#endif /* LV_HAVE_GENERIC */
265
266
267#ifdef LV_HAVE_NEON
268#include <arm_neon.h>
269
/*
 * NEON kernel. Copies the first byte of each two-byte complex sample in
 * complexVector to iBuffer, num_points samples in total. The vector loop
 * handles 16 samples per pass with a de-interleaving load (vld2q_s8) and
 * stores only the first of the two resulting vectors; the remainder is
 * copied one sample at a time.
 */
static inline void volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer,
                                                      const lv_8sc_t* complexVector,
                                                      unsigned int num_points)
{
    const unsigned int fullBlocks = num_points / 16;
    const lv_8sc_t* in = complexVector;
    int8_t* out = iBuffer;
    unsigned int i;

    for (i = 0; i < fullBlocks; ++i) {
        /* val[0] receives the even-indexed bytes, val[1] the odd-indexed. */
        const int8x16x2_t split = vld2q_s8((const int8_t*)in);
        vst1q_s8(out, split.val[0]);
        in += 16; /* 16 complex samples */
        out += 16;
    }

    /* Scalar tail for the final num_points % 16 samples. */
    const int8_t* tailSrc = (const int8_t*)in;
    for (i = fullBlocks * 16; i < num_points; i++) {
        *out++ = *tailSrc;
        tailSrc += 2;
    }
}
292#endif /* LV_HAVE_NEON */
293
294
295#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */
296
297#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H
298#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H
299
300#include <inttypes.h>
301#include <stdio.h>
302
303#ifdef LV_HAVE_AVX2
304#include <immintrin.h>
305
306static inline void volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
307 const lv_8sc_t* complexVector,
308 unsigned int num_points)
309{
310 unsigned int number = 0;
311 const int8_t* complexVectorPtr = (int8_t*)complexVector;
312 int8_t* iBufferPtr = iBuffer;
313 __m256i moveMask1 = _mm256_set_epi8(0x80,
314 0x80,
315 0x80,
316 0x80,
317 0x80,
318 0x80,
319 0x80,
320 0x80,
321 14,
322 12,
323 10,
324 8,
325 6,
326 4,
327 2,
328 0,
329 0x80,
330 0x80,
331 0x80,
332 0x80,
333 0x80,
334 0x80,
335 0x80,
336 0x80,
337 14,
338 12,
339 10,
340 8,
341 6,
342 4,
343 2,
344 0);
345 __m256i moveMask2 = _mm256_set_epi8(14,
346 12,
347 10,
348 8,
349 6,
350 4,
351 2,
352 0,
353 0x80,
354 0x80,
355 0x80,
356 0x80,
357 0x80,
358 0x80,
359 0x80,
360 0x80,
361 14,
362 12,
363 10,
364 8,
365 6,
366 4,
367 2,
368 0,
369 0x80,
370 0x80,
371 0x80,
372 0x80,
373 0x80,
374 0x80,
375 0x80,
376 0x80);
377 __m256i complexVal1, complexVal2, outputVal;
378
379 unsigned int thirtysecondPoints = num_points / 32;
380
381 for (number = 0; number < thirtysecondPoints; number++) {
382
383 complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
384 complexVectorPtr += 32;
385 complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
386 complexVectorPtr += 32;
387
388 complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
389 complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
390 outputVal = _mm256_or_si256(complexVal1, complexVal2);
391 outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
392
393 _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
394 iBufferPtr += 32;
395 }
396
397 number = thirtysecondPoints * 32;
398 for (; number < num_points; number++) {
399 *iBufferPtr++ = *complexVectorPtr++;
400 complexVectorPtr++;
401 }
402}
403#endif /* LV_HAVE_AVX2 */
404
405#ifdef LV_HAVE_RVV
406#include <riscv_vector.h>
407
/*
 * RISC-V Vector kernel. Reads each complex sample as one 16-bit element and
 * writes one byte per sample to iBuffer, num_points samples in total. The
 * narrowing right shift by 0 keeps the low 8 bits of every 16-bit element;
 * NOTE(review): that is the first byte in memory only on a little-endian
 * target, which this code appears to assume - confirm.
 */
static inline void volk_8ic_deinterleave_real_8i_rvv(int8_t* iBuffer,
                                                     const lv_8sc_t* complexVector,
                                                     unsigned int num_points)
{
    const uint16_t* src = (const uint16_t*)complexVector;
    uint8_t* dst = (uint8_t*)iBuffer;
    size_t remaining = num_points;

    while (remaining > 0) {
        /* Number of 16-bit elements the hardware grants for this pass. */
        const size_t vl = __riscv_vsetvl_e16m8(remaining);
        const vuint16m8_t pairs = __riscv_vle16_v_u16m8(src, vl);

        __riscv_vse8(dst, __riscv_vnsrl(pairs, 0, vl), vl);

        src += vl;
        dst += vl;
        remaining -= vl;
    }
}
420#endif /*LV_HAVE_RVV*/
421
422#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H */
static void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition volk_8ic_deinterleave_real_8i.h:152
static void volk_8ic_deinterleave_real_8i_a_avx(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition volk_8ic_deinterleave_real_8i.h:194
static void volk_8ic_deinterleave_real_8i_neon(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition volk_8ic_deinterleave_real_8i.h:270
static void volk_8ic_deinterleave_real_8i_generic(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition volk_8ic_deinterleave_real_8i.h:252
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition volk_complex.h:70