40#ifndef INCLUDED_volk_16u_byteswap_u_H
41#define INCLUDED_volk_16u_byteswap_u_H
/*!
 * \brief Swaps the two bytes of every 16-bit value in a buffer, in place.
 *
 * Portable C implementation with no alignment requirement.
 *
 * \param intsToSwap buffer of 16-bit values; each element is overwritten with
 *        its byte-swapped value
 * \param num_points number of 16-bit values in the buffer (0 is a no-op)
 */
static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap,
                                             unsigned int num_points)
{
    uint16_t* inputPtr = intsToSwap;
    for (unsigned int point = 0; point < num_points; point++) {
        uint16_t output = *inputPtr;
        /* Move the high byte down and the low byte up. */
        output = (uint16_t)(((output >> 8) & 0xff) | ((output << 8) & 0xff00));
        *inputPtr = output;
        inputPtr++;
    }
}
/*!
 * \brief Swaps the two bytes of every 16-bit value in place using AVX2
 *        (aligned variant).
 *
 * Full groups of 16 values are processed with one 256-bit byte shuffle each;
 * the remaining values are swapped one at a time.
 *
 * \param intsToSwap buffer of 16-bit values, modified in place; it is accessed
 *        with aligned 256-bit loads and stores, so it must be 32-byte aligned
 * \param num_points number of 16-bit values in the buffer
 */
static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap,
                                            unsigned int num_points)
{
    unsigned int number;

    const unsigned int nPerSet = 16;
    const unsigned int nSets = num_points / nPerSet;

    uint16_t* inputPtr = intsToSwap;

    /* Shuffle control that exchanges each pair of adjacent bytes. The shuffle
     * works independently on each 128-bit half and only uses the low four
     * bits of an index, so entries 16..31 address the upper half's bytes. */
    const uint8_t shuffleVector[32] = { 1,  0,  3,  2,  5,  4,  7,  6,  9,  8,  11,
                                        10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
                                        23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };

    const __m256i myShuffle = _mm256_loadu_si256((const __m256i*)&shuffleVector[0]);

    for (number = 0; number < nSets; number++) {
        /* Load, swap and store back to the same location (in-place). */
        const __m256i input = _mm256_load_si256((const __m256i*)inputPtr);
        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
        _mm256_store_si256((__m256i*)inputPtr, output);
        inputPtr += nPerSet;
    }

    /* Scalar tail for the values that do not fill a whole 256-bit register. */
    for (number = nPerSet * nSets; number < num_points; number++) {
        uint16_t outputVal = *inputPtr;
        outputVal =
            (uint16_t)(((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
101#include <immintrin.h>
/*!
 * \brief Swaps the two bytes of every 16-bit value in place using AVX2
 *        (unaligned variant).
 *
 * Full groups of 16 values are processed with one 256-bit byte shuffle each;
 * the remaining values are swapped one at a time. Unaligned loads and stores
 * are used throughout, so the buffer needs no particular alignment.
 *
 * \param intsToSwap buffer of 16-bit values, modified in place
 * \param num_points number of 16-bit values in the buffer
 */
static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap,
                                            unsigned int num_points)
{
    unsigned int number;

    const unsigned int nPerSet = 16;
    const unsigned int nSets = num_points / nPerSet;

    uint16_t* inputPtr = intsToSwap;

    /* Shuffle control that exchanges each pair of adjacent bytes. The shuffle
     * works independently on each 128-bit half and only uses the low four
     * bits of an index, so entries 16..31 address the upper half's bytes. */
    const uint8_t shuffleVector[32] = { 1,  0,  3,  2,  5,  4,  7,  6,  9,  8,  11,
                                        10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
                                        23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };

    const __m256i myShuffle = _mm256_loadu_si256((const __m256i*)&shuffleVector[0]);

    for (number = 0; number < nSets; number++) {
        /* Load, swap and store back to the same location (in-place). */
        const __m256i input = _mm256_loadu_si256((const __m256i*)inputPtr);
        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
        _mm256_storeu_si256((__m256i*)inputPtr, output);
        inputPtr += nPerSet;
    }

    /* Scalar tail for the values that do not fill a whole 256-bit register. */
    for (number = nPerSet * nSets; number < num_points; number++) {
        uint16_t outputVal = *inputPtr;
        outputVal =
            (uint16_t)(((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
139#include <emmintrin.h>
/*!
 * \brief Swaps the two bytes of every 16-bit value in place using SSE2
 *        (unaligned variant).
 *
 * Full groups of 8 values are swapped with a pair of 16-bit lane shifts that
 * are OR-ed together; the remaining values are swapped one at a time.
 * Unaligned loads and stores are used, so the buffer needs no particular
 * alignment.
 *
 * \param intsToSwap buffer of 16-bit values, modified in place
 * \param num_points number of 16-bit values in the buffer
 */
static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap,
                                            unsigned int num_points)
{
    unsigned int number = 0;
    uint16_t* inputPtr = intsToSwap;
    __m128i input, left, right, output;

    const unsigned int eighthPoints = num_points / 8;
    for (; number < eighthPoints; number++) {
        /* Load 8 values; the result is written back to the same address. */
        input = _mm_loadu_si128((const __m128i*)inputPtr);
        /* Low byte moved up, high byte moved down, within each 16-bit lane. */
        left = _mm_slli_epi16(input, 8);
        right = _mm_srli_epi16(input, 8);
        output = _mm_or_si128(left, right);
        _mm_storeu_si128((__m128i*)inputPtr, output);
        inputPtr += 8;
    }

    /* Scalar tail for the values that do not fill a whole 128-bit register. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        uint16_t outputVal = *inputPtr;
        outputVal =
            (uint16_t)(((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
174#ifndef INCLUDED_volk_16u_byteswap_a_H
175#define INCLUDED_volk_16u_byteswap_a_H
181#include <emmintrin.h>
/*!
 * \brief Swaps the two bytes of every 16-bit value in place using SSE2
 *        (aligned variant).
 *
 * Full groups of 8 values are swapped with a pair of 16-bit lane shifts that
 * are OR-ed together; the remaining values are swapped one at a time.
 *
 * \param intsToSwap buffer of 16-bit values, modified in place; it is accessed
 *        with aligned 128-bit loads and stores, so it must be 16-byte aligned
 * \param num_points number of 16-bit values in the buffer
 */
static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap,
                                            unsigned int num_points)
{
    uint16_t* inputPtr = intsToSwap;
    __m128i input, left, right, output;

    const unsigned int eighthPoints = num_points / 8;
    for (unsigned int number = 0; number < eighthPoints; number++) {
        /* Load 8 values; the result is written back to the same address. */
        input = _mm_load_si128((const __m128i*)inputPtr);
        /* Low byte moved up, high byte moved down, within each 16-bit lane. */
        left = _mm_slli_epi16(input, 8);
        right = _mm_srli_epi16(input, 8);
        output = _mm_or_si128(left, right);
        _mm_store_si128((__m128i*)inputPtr, output);
        inputPtr += 8;
    }

    /* Scalar tail for the values that do not fill a whole 128-bit register. */
    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
        uint16_t outputVal = *inputPtr;
        outputVal =
            (uint16_t)(((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
/*!
 * \brief Swaps the two bytes of every 16-bit value in place using NEON.
 *
 * Full groups of 8 values are swapped by reversing the bytes inside each
 * 16-bit lane; the remaining values are swapped one at a time.
 *
 * \param intsToSwap buffer of 16-bit values, modified in place
 * \param num_points number of 16-bit values in the buffer
 */
static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap,
                                          unsigned int num_points)
{
    unsigned int number;
    unsigned int eighth_points = num_points / 8;
    uint16x8_t input, output;
    uint16_t* inputPtr = intsToSwap;

    for (number = 0; number < eighth_points; number++) {
        input = vld1q_u16(inputPtr);
        /* Reverse the bytes within each 16-bit lane. This avoids using
         * `output` as the insert destination of a shift-insert before it has
         * been given a value. */
        output = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(input)));
        vst1q_u16(inputPtr, output);
        inputPtr += 8;
    }

    /* Scalar tail for the values that do not fill a whole 128-bit register. */
    for (number = eighth_points * 8; number < num_points; number++) {
        uint16_t outputVal = *inputPtr;
        outputVal =
            (uint16_t)(((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
/*!
 * \brief Swaps the two bytes of every 16-bit value in place using NEON table
 *        lookups.
 *
 * Each iteration handles 16 values (32 bytes). The bytes are loaded
 * de-interleaved with a stride of four and then gathered back in swapped
 * order through four 8-byte table lookups. The remaining values are swapped
 * one at a time.
 *
 * \param intsToSwap buffer of 16-bit values, modified in place
 * \param num_points number of 16-bit values in the buffer
 */
static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap,
                                                unsigned int num_points)
{
    uint16_t* inputPtr = intsToSwap;
    unsigned int number = 0;
    unsigned int n16points = num_points / 16;

    uint8x8x4_t input_table;
    uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
    uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;

    /* vld4_u8 places memory byte (4 * j + k) in lane j of val[k], and
     * vtbl4_u8 addresses that lane as table index (8 * k + j). The index
     * vectors below (lane 0 is the least significant byte of the constant)
     * therefore pick memory bytes 1,0,3,2,5,4,7,6 for the first output
     * register, 9,8,11,10,... for the second, and so on. */
    int_lookup01 = vcreate_u8(0x1119010910180008ULL); /*  8, 0, 24, 16,  9, 1, 25, 17 */
    int_lookup23 = vcreate_u8(0x131B030B121A020AULL); /* 10, 2, 26, 18, 11, 3, 27, 19 */
    int_lookup45 = vcreate_u8(0x151D050D141C040CULL); /* 12, 4, 28, 20, 13, 5, 29, 21 */
    int_lookup67 = vcreate_u8(0x171F070F161E060EULL); /* 14, 6, 30, 22, 15, 7, 31, 23 */

    for (number = 0; number < n16points; ++number) {
        input_table = vld4_u8((uint8_t*)inputPtr);
        swapped_int01 = vtbl4_u8(input_table, int_lookup01);
        swapped_int23 = vtbl4_u8(input_table, int_lookup23);
        swapped_int45 = vtbl4_u8(input_table, int_lookup45);
        swapped_int67 = vtbl4_u8(input_table, int_lookup67);
        /* Each store writes 8 bytes, i.e. four swapped 16-bit values. */
        vst1_u8((uint8_t*)inputPtr, swapped_int01);
        vst1_u8((uint8_t*)(inputPtr + 4), swapped_int23);
        vst1_u8((uint8_t*)(inputPtr + 8), swapped_int45);
        vst1_u8((uint8_t*)(inputPtr + 12), swapped_int67);
        inputPtr += 16;
    }

    /* Scalar tail for the values that do not fill a whole 32-byte group. */
    for (number = n16points * 16; number < num_points; number++) {
        uint16_t outputVal = *inputPtr;
        outputVal =
            (uint16_t)(((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
/* ORC-generated kernel; defined outside this header. Note that it takes the
 * element count as a signed int. */
extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, int num_points);

/*!
 * \brief Swaps the two bytes of every 16-bit value in place by forwarding to
 *        the ORC implementation.
 *
 * \param intsToSwap buffer of 16-bit values, passed through unchanged
 * \param num_points number of 16-bit values in the buffer; converted to int
 *        for the ORC entry point
 */
static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap,
                                           unsigned int num_points)
{
    volk_16u_byteswap_a_orc_impl(intsToSwap, (int)num_points);
}
284#include <riscv_vector.h>
/*
 * In-place 16-bit byte swap using the RISC-V vector extension.
 *
 * intsToSwap: buffer of 16-bit values, loaded and stored in place.
 * num_points: number of 16-bit values in the buffer.
 *
 * NOTE(review): this listing is missing several lines of the function (the
 * remaining operands of both __riscv_vsub calls, whatever selects between the
 * two index-vector variants, and the statement that applies vidx to v), so
 * the comments below describe only what is visible.
 */
287static inline void volk_16u_byteswap_rvv(uint16_t* intsToSwap,
unsigned int num_points)
/* n counts the 16-bit values still to be processed. */
289 size_t n = num_points;
/* Maximum vector length for 8-bit elements at LMUL=1. */
290 size_t vlmax = __riscv_vsetvlmax_e8m1();
/* First variant: an 8-bit index vector derived from the element-index
 * sequence (vid) viewed as 16-bit lanes, minus an operand not shown here. */
292 vuint8m1_t vidx = __riscv_vreinterpret_u8m1(
293 __riscv_vsub(__riscv_vreinterpret_u16m1(__riscv_vid_v_u8m1(vlmax)),
/* Each pass handles vl values, then advances the pointer and shrinks n. */
296 for (
size_t vl; n > 0; n -= vl, intsToSwap += vl) {
297 vl = __riscv_vsetvl_e16m8(n);
/* Load vl 16-bit values and view them as bytes. */
299 __riscv_vreinterpret_u8m8(__riscv_vle16_v_u16m8(intsToSwap, vl));
/* Store the byte vector back over the same vl values. */
301 __riscv_vse16(intsToSwap, __riscv_vreinterpret_u16m8(v), vl);
/* Second variant: same construction with a 16-bit index vector built from
 * vid viewed as 32-bit lanes. */
304 vuint16m2_t vidx = __riscv_vreinterpret_u16m2(
305 __riscv_vsub(__riscv_vreinterpret_u32m2(__riscv_vid_v_u16m2(vlmax)),
/* Same load / store loop as above. */
308 for (
size_t vl; n > 0; n -= vl, intsToSwap += vl) {
309 vl = __riscv_vsetvl_e16m8(n);
311 __riscv_vreinterpret_u8m8(__riscv_vle16_v_u16m8(intsToSwap, vl));
313 __riscv_vse16(intsToSwap, __riscv_vreinterpret_u16m8(v), vl);
320#include <riscv_vector.h>
/*!
 * \brief Swaps the two bytes of every 16-bit value in place using the RISC-V
 *        vector byte-reverse intrinsic.
 *
 * \param intsToSwap buffer of 16-bit values, modified in place
 * \param num_points number of 16-bit values in the buffer
 */
static inline void volk_16u_byteswap_rva23(uint16_t* intsToSwap,
                                           unsigned int num_points)
{
    size_t n = num_points;
    /* Each pass handles vl values, then advances the pointer and shrinks n. */
    for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
        vl = __riscv_vsetvl_e16m8(n);
        vuint16m8_t v = __riscv_vle16_v_u16m8(intsToSwap, vl);
        /* Reverse the bytes of each 16-bit element and store back in place. */
        __riscv_vse16(intsToSwap, __riscv_vrev8(v, vl), vl);
    }
}
static void volk_16u_byteswap_u_sse2(uint16_t *intsToSwap, unsigned int num_points)
Definition volk_16u_byteswap.h:141
static void volk_16u_byteswap_neon(uint16_t *intsToSwap, unsigned int num_points)
Definition volk_16u_byteswap.h:210
static void volk_16u_byteswap_a_sse2(uint16_t *intsToSwap, unsigned int num_points)
Definition volk_16u_byteswap.h:183
static void volk_16u_byteswap_generic(uint16_t *intsToSwap, unsigned int num_points)
Definition volk_16u_byteswap.h:48
static void volk_16u_byteswap_neon_table(uint16_t *intsToSwap, unsigned int num_points)
Definition volk_16u_byteswap.h:232
#define RISCV_PERM8(f, v, vidx)
Definition volk_rvv_intrinsics.h:64