Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
volk_64u_byteswap.h
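volk_64u_byteswap swaps the byte order of every 64-bit word of a buffer in place (for example to convert between big- and little-endian sample streams). A minimal usage sketch follows; it assumes the generated dispatcher volk_64u_byteswap() and the helpers volk_malloc()/volk_get_alignment()/volk_free() from <volk/volk.h>, and is illustrative rather than part of this header.

#include <stdint.h>
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    /* Allocate a buffer aligned for the fastest (aligned) kernel variants. */
    unsigned int num_points = 4;
    uint64_t* data = (uint64_t*)volk_malloc(num_points * sizeof(uint64_t),
                                            volk_get_alignment());
    for (unsigned int i = 0; i < num_points; i++)
        data[i] = 0x0102030405060708ull;

    /* In-place byteswap; the dispatcher picks the best kernel for this CPU. */
    volk_64u_byteswap(data, num_points);

    printf("%016llx\n", (unsigned long long)data[0]); /* prints 0807060504030201 */

    volk_free(data);
    return 0;
}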
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

/* ... file-level Doxygen documentation block omitted from this listing ... */

#ifndef INCLUDED_volk_64u_byteswap_u_H
#define INCLUDED_volk_64u_byteswap_u_H

#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points)
{
    uint32_t* inputPtr = (uint32_t*)intsToSwap;
    __m128i input, byte1, byte2, byte3, byte4, output;
    __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
    uint64_t number = 0;
    const unsigned int halfPoints = num_points / 2;
    for (; number < halfPoints; number++) {
        // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
        input = _mm_loadu_si128((__m128i*)inputPtr);

        // Do the four shifts
        byte1 = _mm_slli_epi32(input, 24);
        byte2 = _mm_slli_epi32(input, 8);
        byte3 = _mm_srli_epi32(input, 8);
        byte4 = _mm_srli_epi32(input, 24);
        // Or the bytes together
        output = _mm_or_si128(byte1, byte4);
        byte2 = _mm_and_si128(byte2, byte2mask);
        output = _mm_or_si128(output, byte2);
        byte3 = _mm_and_si128(byte3, byte3mask);
        output = _mm_or_si128(output, byte3);

        // Reorder the two words
        output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));

        // Store the results
        _mm_storeu_si128((__m128i*)inputPtr, output);
        inputPtr += 4;
    }

    // Byteswap any remaining points:
    number = halfPoints * 2;
    for (; number < num_points; number++) {
        uint32_t output1 = *inputPtr;
        uint32_t output2 = inputPtr[1];

        output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
                   ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));

        output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
                   ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));

        *inputPtr++ = output2;
        *inputPtr++ = output1;
    }
}
#endif /* LV_HAVE_SSE2 */
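
/*
 * Note on the SSE2 kernel above and the generic kernel below: a 64-bit
 * byteswap is done as two 32-bit byteswaps (shift-and-mask) followed by an
 * exchange of the two 32-bit halves (_mm_shuffle_epi32 in the SSE2 case, the
 * crossed stores in the scalar tail). The scalar helper below restates that
 * on a single value; it is an illustrative sketch added for this listing and
 * is not part of the VOLK kernel API.
 */
static inline uint64_t byteswap64_scalar_sketch(uint64_t value)
{
    uint32_t lo = (uint32_t)(value & 0xffffffff); /* low-order 32 bits  */
    uint32_t hi = (uint32_t)(value >> 32);        /* high-order 32 bits */
    lo = ((lo >> 24) & 0xff) | ((lo >> 8) & 0x0000ff00) | ((lo << 8) & 0x00ff0000) |
         ((lo << 24) & 0xff000000);
    hi = ((hi >> 24) & 0xff) | ((hi >> 8) & 0x0000ff00) | ((hi << 8) & 0x00ff0000) |
         ((hi << 24) & 0xff000000);
    /* The byteswapped halves trade places to complete the 64-bit reversal. */
    return ((uint64_t)lo << 32) | hi;
}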


#ifdef LV_HAVE_GENERIC

static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    uint32_t* inputPtr = (uint32_t*)intsToSwap;
    unsigned int point;
    for (point = 0; point < num_points; point++) {
        uint32_t output1 = *inputPtr;
        uint32_t output2 = inputPtr[1];

        output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
                   ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));

        output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
                   ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));

        *inputPtr++ = output2;
        *inputPtr++ = output1;
    }
}
#endif /* LV_HAVE_GENERIC */

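/*
 * The AVX2 and SSSE3 kernels below use a different strategy: one byte
 * shuffle (_mm256_shuffle_epi8 / _mm_shuffle_epi8) reverses the eight bytes
 * of every 64-bit word at once. Each entry of shuffleVector names the source
 * byte for one output byte (pshufb uses only the low four bits of each index
 * within its 128-bit lane), so { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, ... } means
 * "reverse the first 64-bit word, then the second, ...". The AVX2 kernel
 * handles four 64-bit points per iteration, the SSSE3 kernel two.
 */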
#if LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points)
{
    unsigned int number = 0;

    const unsigned int nPerSet = 4;
    const uint64_t nSets = num_points / nPerSet;

    uint32_t* inputPtr = (uint32_t*)intsToSwap;

    const uint8_t shuffleVector[32] = { 7,  6,  5,  4,  3,  2,  1,  0,  15, 14, 13,
                                        12, 11, 10, 9,  8,  23, 22, 21, 20, 19, 18,
                                        17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };

    const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);

    for (; number < nSets; number++) {

        // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
        const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);

        // Store the results
        _mm256_store_si256((__m256i*)inputPtr, output);

        /* inputPtr is 32-bit, so advance it by two words per 64-bit point */
        inputPtr += 2 * nPerSet;
    }

    // Byteswap any remaining points:
    for (number = nSets * nPerSet; number < num_points; ++number) {
        uint32_t output1 = *inputPtr;
        uint32_t output2 = inputPtr[1];
        uint32_t out1 =
            ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
             (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));

        uint32_t out2 =
            ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
             (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
        *inputPtr++ = out2;
        *inputPtr++ = out1;
    }
}

#endif /* LV_HAVE_AVX2 */


#if LV_HAVE_SSSE3
#include <tmmintrin.h>
static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    unsigned int number = 0;

    const unsigned int nPerSet = 2;
    const uint64_t nSets = num_points / nPerSet;

    uint32_t* inputPtr = (uint32_t*)intsToSwap;

    uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };

    const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);

    for (; number < nSets; number++) {

        // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
        const __m128i input = _mm_load_si128((__m128i*)inputPtr);
        const __m128i output = _mm_shuffle_epi8(input, myShuffle);

        // Store the results
        _mm_store_si128((__m128i*)inputPtr, output);

        /* inputPtr is 32-bit, so advance it by two words per 64-bit point */
        inputPtr += 2 * nPerSet;
    }

    // Byteswap any remaining points:
    for (number = nSets * nPerSet; number < num_points; ++number) {
        uint32_t output1 = *inputPtr;
        uint32_t output2 = inputPtr[1];
        uint32_t out1 =
            ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
             (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));

        uint32_t out2 =
            ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
             (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
        *inputPtr++ = out2;
        *inputPtr++ = out1;
    }
}
#endif /* LV_HAVE_SSSE3 */
#endif /* INCLUDED_volk_64u_byteswap_u_H */
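
/*
 * A second include guard follows with further variants of the same kernel.
 * Throughout the file, the _u_ kernels use unaligned loads/stores while the
 * _a_ kernels use aligned ones and therefore expect suitably aligned buffers
 * (e.g. from volk_malloc() with volk_get_alignment()). Note that the aligned
 * AVX2/SSSE3 variants appear under the unaligned guard above, while their
 * unaligned counterparts and the RISC-V kernels sit under the aligned guard
 * below.
 */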


#ifndef INCLUDED_volk_64u_byteswap_a_H
#define INCLUDED_volk_64u_byteswap_a_H

#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points)
{
    uint32_t* inputPtr = (uint32_t*)intsToSwap;
    __m128i input, byte1, byte2, byte3, byte4, output;
    __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
    uint64_t number = 0;
    const unsigned int halfPoints = num_points / 2;
    for (; number < halfPoints; number++) {
        // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
        input = _mm_load_si128((__m128i*)inputPtr);

        // Do the four shifts
        byte1 = _mm_slli_epi32(input, 24);
        byte2 = _mm_slli_epi32(input, 8);
        byte3 = _mm_srli_epi32(input, 8);
        byte4 = _mm_srli_epi32(input, 24);
        // Or the bytes together
        output = _mm_or_si128(byte1, byte4);
        byte2 = _mm_and_si128(byte2, byte2mask);
        output = _mm_or_si128(output, byte2);
        byte3 = _mm_and_si128(byte3, byte3mask);
        output = _mm_or_si128(output, byte3);

        // Reorder the two words
        output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));

        // Store the results
        _mm_store_si128((__m128i*)inputPtr, output);
        inputPtr += 4;
    }

    // Byteswap any remaining points:
    number = halfPoints * 2;
    for (; number < num_points; number++) {
        uint32_t output1 = *inputPtr;
        uint32_t output2 = inputPtr[1];

        output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
                   ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));

        output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
                   ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));

        *inputPtr++ = output2;
        *inputPtr++ = output1;
    }
}
#endif /* LV_HAVE_SSE2 */

#if LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int num_points)
{
    unsigned int number = 0;

    const unsigned int nPerSet = 4;
    const uint64_t nSets = num_points / nPerSet;

    uint32_t* inputPtr = (uint32_t*)intsToSwap;

    const uint8_t shuffleVector[32] = { 7,  6,  5,  4,  3,  2,  1,  0,  15, 14, 13,
                                        12, 11, 10, 9,  8,  23, 22, 21, 20, 19, 18,
                                        17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };

    const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);

    for (; number < nSets; number++) {
        // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
        const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);

        // Store the results
        _mm256_storeu_si256((__m256i*)inputPtr, output);

        /* inputPtr is 32-bit, so advance it by two words per 64-bit point */
        inputPtr += 2 * nPerSet;
    }

    // Byteswap any remaining points:
    for (number = nSets * nPerSet; number < num_points; ++number) {
        uint32_t output1 = *inputPtr;
        uint32_t output2 = inputPtr[1];
        uint32_t out1 =
            ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
             (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));

        uint32_t out2 =
            ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
             (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
        *inputPtr++ = out2;
        *inputPtr++ = out1;
    }
}

#endif /* LV_HAVE_AVX2 */


#if LV_HAVE_SSSE3
#include <tmmintrin.h>
static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    unsigned int number = 0;

    const unsigned int nPerSet = 2;
    const uint64_t nSets = num_points / nPerSet;

    uint32_t* inputPtr = (uint32_t*)intsToSwap;

    uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };

    const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);

    for (; number < nSets; number++) {
        // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
        const __m128i input = _mm_loadu_si128((__m128i*)inputPtr);
        const __m128i output = _mm_shuffle_epi8(input, myShuffle);

        // Store the results
        _mm_storeu_si128((__m128i*)inputPtr, output);

        /* inputPtr is 32-bit, so advance it by two words per 64-bit point */
        inputPtr += 2 * nPerSet;
    }

    // Byteswap any remaining points:
    for (number = nSets * nPerSet; number < num_points; ++number) {
        uint32_t output1 = *inputPtr;
        uint32_t output2 = inputPtr[1];
        uint32_t out1 =
            ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
             (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));

        uint32_t out2 =
            ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
             (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
        *inputPtr++ = out2;
        *inputPtr++ = out1;
    }
}
#endif /* LV_HAVE_SSSE3 */


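/*
 * The RVV kernel below byteswaps with a byte-granularity gather: it builds
 * an index vector that sends byte i to byte (i - i % 8) + (7 - i % 8), i.e.
 * it reverses every group of eight bytes. When the byte indices fit in eight
 * bits (vlmax <= 256), the index vector is derived from __riscv_vid_v_u8m1
 * with one 64-bit subtraction: subtracting the constant
 * (0x0706050403020100 - 0x1020304050607) turns each group's ascending byte
 * ids 0..7 into the descending 7..0. Otherwise 16-bit indices are computed
 * explicitly and vrgatherei16 is used. RISCV_PERM8 is a helper macro from
 * volk_rvv_intrinsics.h that applies the gather across the m8 register group.
 */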
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h> /* for RISCV_PERM8 */

static inline void volk_64u_byteswap_rvv(uint64_t* intsToSwap, unsigned int num_points)
{
    size_t n = num_points;
    size_t vlmax = __riscv_vsetvlmax_e8m1();
    if (vlmax <= 256) {
        vuint8m1_t vidx = __riscv_vreinterpret_u8m1(
            __riscv_vsub(__riscv_vreinterpret_u64m1(__riscv_vid_v_u8m1(vlmax)),
                         0x0706050403020100 - 0x1020304050607,
                         vlmax / 8));
        for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
            vl = __riscv_vsetvl_e64m8(n);
            vuint8m8_t v =
                __riscv_vreinterpret_u8m8(__riscv_vle64_v_u64m8(intsToSwap, vl));
            v = RISCV_PERM8(__riscv_vrgather, v, vidx);
            __riscv_vse64(intsToSwap, __riscv_vreinterpret_u64m8(v), vl);
        }
    } else {
        vuint16m2_t vid = __riscv_vid_v_u16m2(vlmax);
        vuint16m2_t voff1 = __riscv_vand(vid, 0x7, vlmax);
        vuint16m2_t voff2 = __riscv_vrsub(voff1, 0x7, vlmax);
        vuint16m2_t vidx = __riscv_vadd(__riscv_vsub(vid, voff1, vlmax), voff2, vlmax);
        for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
            vl = __riscv_vsetvl_e64m8(n);
            vuint8m8_t v =
                __riscv_vreinterpret_u8m8(__riscv_vle64_v_u64m8(intsToSwap, vl));
            v = RISCV_PERM8(__riscv_vrgatherei16, v, vidx);
            __riscv_vse64(intsToSwap, __riscv_vreinterpret_u64m8(v), vl);
        }
    }
}
#endif /* LV_HAVE_RVV */

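/*
 * The RVA23 kernel relies on the vector byte-reverse instruction vrev8
 * (from the Zvbb extension mandated by the RVA23 profile), which reverses
 * the bytes within each element, so every 64-bit element is byteswapped in
 * a single instruction per vector register.
 */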
#ifdef LV_HAVE_RVA23
#include <riscv_vector.h>

static inline void volk_64u_byteswap_rva23(uint64_t* intsToSwap, unsigned int num_points)
{
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
        vl = __riscv_vsetvl_e64m8(n);
        vuint64m8_t v = __riscv_vle64_v_u64m8(intsToSwap, vl);
        __riscv_vse64(intsToSwap, __riscv_vrev8(v, vl), vl);
    }
}
#endif /* LV_HAVE_RVA23 */

#endif /* INCLUDED_volk_64u_byteswap_a_H */