Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32u_reverse_32u.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2018 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
30#ifndef INCLUDED_VOLK_32u_REVERSE_32u_U_H
31
32// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
33// https://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
// Lookup table: BitReverseTable256[b] holds byte b with its 8 bits reversed
// (e.g. entry 0x01 is 0x80). Used by the _lut kernel and the _neonv8 scalar
// tail to reverse a 32-bit word one byte at a time.
static const unsigned char BitReverseTable256[] = {
    0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0,
    0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8,
    0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94,
    0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC,
    0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2,
    0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA,
    0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86,
    0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
    0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE,
    0x7E, 0xFE, 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1,
    0x31, 0xB1, 0x71, 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99,
    0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5,
    0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD,
    0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43, 0xC3,
    0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 0x0B, 0x8B,
    0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
    0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7,
    0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF,
    0x3F, 0xBF, 0x7F, 0xFF
};
55#ifdef LV_HAVE_GENERIC
/*!
 * Reverse the bit order of each 32-bit word: input bit 0 becomes output
 * bit 31, input bit 1 becomes output bit 30, and so on.
 *
 * \param out destination buffer of num_points words
 * \param in  source buffer of num_points words
 * \param num_points number of 32-bit words to process
 */
static inline void
volk_32u_reverse_32u_generic(uint32_t* out, const uint32_t* in, unsigned int num_points)
{
    for (unsigned int number = 0; number < num_points; ++number) {
        uint32_t word = in[number];
        uint32_t reversed = 0;
        /* Consume the input's least-significant bit 32 times, shifting the
           accumulator left each round; the first bit consumed ends up in
           bit 31, the last in bit 0. */
        for (int bit = 0; bit < 32; ++bit) {
            reversed = (reversed << 1) | (word & 1u);
            word >>= 1;
        }
        out[number] = reversed;
    }
}
83#endif /* LV_HAVE_GENERIC */
84
85#ifdef LV_HAVE_GENERIC
/*!
 * Reverse the bit order of each 32-bit word by swapping its four bytes
 * end-for-end while bit-reversing every byte individually.
 *
 * \param out destination buffer of num_points words
 * \param in  source buffer of num_points words
 * \param num_points number of 32-bit words to process
 */
static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out,
                                                     const uint32_t* in,
                                                     unsigned int num_points)
{
    for (unsigned int number = 0; number < num_points; ++number) {
        const uint8_t* src = (const uint8_t*)&in[number];
        uint8_t* dst = (uint8_t*)&out[number];
        /* Byte b of the input lands, bit-reversed, at byte 3-b of the
           output; together the two steps reverse the whole word. */
        for (int b = 0; b < 4; ++b) {
            uint8_t v = src[b];
            v = (uint8_t)(((v & 0xF0u) >> 4) | ((v & 0x0Fu) << 4)); /* swap nibbles */
            v = (uint8_t)(((v & 0xCCu) >> 2) | ((v & 0x33u) << 2)); /* swap bit pairs */
            v = (uint8_t)(((v & 0xAAu) >> 1) | ((v & 0x55u) << 1)); /* swap neighbors */
            dst[3 - b] = v;
        }
    }
}
120#endif /* LV_HAVE_GENERIC */
121
122// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
123// https://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
124#ifdef LV_HAVE_GENERIC
125static inline void
126volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in, unsigned int num_points)
127{
128 const uint32_t* in_ptr = in;
129 uint32_t* out_ptr = out;
130 unsigned int number = 0;
131 for (; number < num_points; ++number) {
132 *out_ptr = ((uint32_t)BitReverseTable256[*in_ptr & 0xff] << 24) |
133 (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
134 (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
135 (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
136 ++in_ptr;
137 ++out_ptr;
138 }
139}
140#endif /* LV_HAVE_GENERIC */
141
142// Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public
143// domain https://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
144#ifdef LV_HAVE_GENERIC
/*!
 * Reverse the bit order of each 32-bit word using the 2001 multiply/mask
 * byte-reversal trick (64-bit multiply spreads the bits, the mask selects
 * them in reversed positions, the second multiply gathers them into the
 * top byte), with the four reversed bytes stored in swapped order.
 *
 * \param out destination buffer of num_points words
 * \param in  source buffer of num_points words
 * \param num_points number of 32-bit words to process
 */
static inline void
volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
{
    for (unsigned int number = 0; number < num_points; ++number) {
        const uint8_t* src = (const uint8_t*)&in[number];
        uint8_t* dst = (uint8_t*)&out[number];
        for (int b = 0; b < 4; ++b) {
            dst[3 - b] = (uint8_t)((((src[b] * 0x80200802ULL) & 0x0884422110ULL) *
                                    0x0101010101ULL) >>
                                   32);
        }
    }
}
164#endif /* LV_HAVE_GENERIC */
165
166#ifdef LV_HAVE_GENERIC
167// Current gr-pager implementation
/*!
 * Reverse the bit order of each 32-bit word using the 1972 multiply/modulus
 * byte-reversal trick (duplicate the byte across a 64-bit word, mask the
 * bits into reversed positions, then fold with mod 1023), storing the four
 * reversed bytes in swapped order. This mirrors the historic gr-pager code.
 *
 * \param out destination buffer of num_points words
 * \param in  source buffer of num_points words
 * \param num_points number of 32-bit words to process
 */
static inline void
volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
{
    for (unsigned int number = 0; number < num_points; ++number) {
        const uint8_t* src = (const uint8_t*)&in[number];
        uint8_t* dst = (uint8_t*)&out[number];
        for (int b = 0; b < 4; ++b) {
            dst[3 - b] = (uint8_t)((src[b] * 0x0202020202ULL & 0x010884422010ULL) % 1023);
        }
    }
}
187#endif /* LV_HAVE_GENERIC */
188
189// After lengthy thought and quite a bit of whiteboarding:
190#ifdef LV_HAVE_GENERIC
/*!
 * Reverse the bit order of each 32-bit word by a butterfly permutation,
 * widest swap first: exchange 16-bit halves, then bytes within halves,
 * then nibbles, bit pairs, and finally adjacent bits.
 *
 * \param out destination buffer of num_points words
 * \param in  source buffer of num_points words
 * \param num_points number of 32-bit words to process
 */
static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out,
                                                                 const uint32_t* in,
                                                                 unsigned int num_points)
{
    for (unsigned int number = 0; number < num_points; ++number) {
        uint32_t v = in[number];
        /* Swap the two 16-bit halves. */
        v = (v << 16) | (v >> 16);
        /* Swap the bytes inside each 16-bit half. */
        v = ((v & 0x00FF00FFu) << 8) | ((v >> 8) & 0x00FF00FFu);
        /* Swap the nibbles inside each byte. */
        v = ((v & 0x0F0F0F0Fu) << 4) | ((v >> 4) & 0x0F0F0F0Fu);
        /* Swap the 2-bit pairs inside each nibble. */
        v = ((v & 0x33333333u) << 2) | ((v >> 2) & 0x33333333u);
        /* Swap adjacent bits. */
        v = ((v & 0x55555555u) << 1) | ((v >> 1) & 0x55555555u);
        out[number] = v;
    }
}
230#endif /* LV_HAVE_GENERIC */
231#ifdef LV_HAVE_GENERIC
/*!
 * Reverse the bit order of each 32-bit word by a butterfly permutation,
 * narrowest swap first: adjacent bits, then bit pairs, nibbles, bytes,
 * and finally the two 16-bit halves. The stages commute, so this is the
 * mirror image of the top-down variant with identical results.
 *
 * \param out destination buffer of num_points words
 * \param in  source buffer of num_points words
 * \param num_points number of 32-bit words to process
 */
static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out,
                                                                  const uint32_t* in,
                                                                  unsigned int num_points)
{
    for (unsigned int number = 0; number < num_points; ++number) {
        uint32_t v = in[number];
        v = ((v & 0x55555555u) << 1) | ((v >> 1) & 0x55555555u); /* adjacent bits */
        v = ((v & 0x33333333u) << 2) | ((v >> 2) & 0x33333333u); /* bit pairs */
        v = ((v & 0x0F0F0F0Fu) << 4) | ((v >> 4) & 0x0F0F0F0Fu); /* nibbles */
        v = ((v & 0x00FF00FFu) << 8) | ((v >> 8) & 0x00FF00FFu); /* bytes */
        v = (v << 16) | (v >> 16);                               /* halves */
        out[number] = v;
    }
}
254#endif /* LV_HAVE_GENERIC */
255
256#ifdef LV_HAVE_NEONV8
257#include <arm_neon.h>
258
// Reverse the bit order of each 32-bit word using ARMv8 NEON, four words per
// iteration: vrbitq_u8 reverses the bits within every byte, then vqtbl1q_u8
// with `idx` reverses the byte order inside each 4-byte lane, completing the
// full 32-bit reversal. The tail (< 4 remaining words) falls back to the
// BitReverseTable256 byte-lookup method.
static inline void
volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, unsigned int num_points)
{
    const uint32_t* in_ptr = in;
    uint32_t* out_ptr = out;

    // Table-lookup indices that reverse byte order within each 32-bit lane.
    const uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };

    const unsigned int quarterPoints = num_points / 4;
    unsigned int number = 0;
    for (; number < quarterPoints; ++number) {
        __VOLK_PREFETCH(in_ptr + 4); // hint the next vector's cache line
        uint32x4_t x = vld1q_u32(in_ptr);
        uint32x4_t z =
            vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32(x)), idx));
        vst1q_u32(out_ptr, z);
        in_ptr += 4;
        out_ptr += 4;
    }
    // Scalar tail: assemble each reversed word from its bit-reversed bytes
    // taken in opposite order (low input byte -> high output byte).
    number = quarterPoints * 4;
    for (; number < num_points; ++number) {
        *out_ptr = ((uint32_t)BitReverseTable256[*in_ptr & 0xff] << 24) |
                   (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
                   (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
                   (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
        ++in_ptr;
        ++out_ptr;
    }
}
288
289#endif /* LV_HAVE_NEONV8 */
290
291#ifdef LV_HAVE_NEON
292#include <arm_neon.h>
293
// DO_RBIT reverses one 32-bit word with the ARM RBIT instruction and advances
// both pointers. On AArch64 the operands need the %w modifier to select the
// 32-bit W form of the register; on 32-bit ARM plain register operands are
// used.
#if defined(__aarch64__)
#define DO_RBIT                                \
    __VOLK_ASM("rbit %w[result], %w[value]"    \
               : [result] "=r"(*out_ptr)       \
               : [value] "r"(*in_ptr)          \
               :);                             \
    in_ptr++;                                  \
    out_ptr++;
#else
#define DO_RBIT                                \
    __VOLK_ASM("rbit %[result], %[value]"      \
               : [result] "=r"(*out_ptr)       \
               : [value] "r"(*in_ptr)          \
               :);                             \
    in_ptr++;                                  \
    out_ptr++;
#endif

// Reverse the bit order of each 32-bit word via inline-assembly RBIT, one
// instruction per word. The main loop is unrolled eight ways with a software
// prefetch; the second loop handles the remaining (num_points % 8) words.
static inline void
volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, unsigned int num_points)
{

    const uint32_t* in_ptr = in;
    uint32_t* out_ptr = out;
    const unsigned int eighthPoints = num_points / 8;
    unsigned int number = 0;
    for (; number < eighthPoints; ++number) {
        __VOLK_PREFETCH(in_ptr + 8);
        DO_RBIT;
        DO_RBIT;
        DO_RBIT;
        DO_RBIT;
        DO_RBIT;
        DO_RBIT;
        DO_RBIT;
        DO_RBIT;
    }
    number = eighthPoints * 8;
    for (; number < num_points; ++number) {
        DO_RBIT;
    }
}
#undef DO_RBIT
337#endif /* LV_HAVE_NEON */
338
339
340#ifdef LV_HAVE_RVV
341#include <riscv_vector.h>
342
// Reverse the bit order of each 32-bit word using RISC-V Vector intrinsics:
// first a byte-swap permutation within each word, then a per-byte bit
// reversal done as two 16-entry nibble table lookups.
static inline void
volk_32u_reverse_32u_rvv(uint32_t* out, const uint32_t* in, unsigned int num_points)
{
    size_t n = num_points;

    // 16-entry nibble LUTs, packed as two little-endian 64-bit halves:
    // byte i of tblLo is the bit-reversed nibble i shifted into the high
    // nibble; byte i of tblHi is the bit-reversed nibble i in the low nibble.
    static const uint64_t tblLo[] = {
        0xE060A020C0408000,
        0xF070B030D0509010,
    };
    static const uint64_t tblHi[] = {
        0x0E060A020C040800,
        0x0F070B030D050901,
    };
    vuint8m1_t vtblLo = __riscv_vreinterpret_u8m1(__riscv_vle64_v_u64m1(tblLo, 2));
    vuint8m1_t vtblHi = __riscv_vreinterpret_u8m1(__riscv_vle64_v_u64m1(tblHi, 2));

    size_t vlmax = __riscv_vsetvlmax_e8m1();
    // Build byte-swap gather indices: viewing each group of four u16 indices
    // {4k,4k+1,4k+2,4k+3} as one u64 and subtracting
    // (0x0003000200010000 - 0x0000000100020003) turns the group into
    // {4k+3,4k+2,4k+1,4k}, i.e. reversed byte order within every 32-bit word.
    vuint16m2_t vidx = __riscv_vreinterpret_u16m2(
        __riscv_vsub(__riscv_vreinterpret_u64m2(__riscv_vid_v_u16m2(vlmax)),
                     0x3000200010000 - 0x100020003,
                     vlmax / 4));
    for (size_t vl; n > 0; n -= vl, in += vl, out += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vuint8m4_t v = __riscv_vreinterpret_u8m4(__riscv_vle32_v_u32m4(in, vl));
        // Swap the bytes of each 32-bit element.
        v = RISCV_PERM4(__riscv_vrgatherei16, v, vidx);
        // Bit-reverse each byte: look up both nibbles and recombine them.
        vuint8m4_t lo = __riscv_vand(v, 0xF, vl * 4);
        lo = RISCV_LUT4(__riscv_vrgather, vtblLo, lo);
        vuint8m4_t hi = __riscv_vsrl(v, 4, vl * 4);
        hi = RISCV_LUT4(__riscv_vrgather, vtblHi, hi);
        v = __riscv_vor(hi, lo, vl * 4);
        __riscv_vse32(out, __riscv_vreinterpret_u32m4(v), vl);
    }
}
376#endif /* LV_HAVE_RVV */
377
378#ifdef LV_HAVE_RVA23
379#include <riscv_vector.h>
380
381static inline void
382volk_32u_reverse_32u_rva23(uint32_t* out, const uint32_t* in, unsigned int num_points)
383{
384 size_t n = num_points;
385 for (size_t vl; n > 0; n -= vl, in += vl, out += vl) {
386 vl = __riscv_vsetvl_e32m8(n);
387 vuint32m8_t v = __riscv_vle32_v_u32m8(in, vl);
388 __riscv_vse32(out, __riscv_vbrev(v, vl), vl);
389 }
390}
391#endif /* LV_HAVE_RVA23 */
392
393#endif /* INCLUDED_volk_32u_reverse_32u_u_H */
static void volk_32u_reverse_32u_1972magic(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition volk_32u_reverse_32u.h:169
static void volk_32u_reverse_32u_2001magic(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition volk_32u_reverse_32u.h:146
static void volk_32u_reverse_32u_lut(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition volk_32u_reverse_32u.h:126
#define DO_RBIT
Definition volk_32u_reverse_32u.h:303
static const unsigned char BitReverseTable256[]
Definition volk_32u_reverse_32u.h:34
static void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition volk_32u_reverse_32u.h:232
static void volk_32u_reverse_32u_generic(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition volk_32u_reverse_32u.h:57
static void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition volk_32u_reverse_32u.h:191
static void volk_32u_reverse_32u_arm(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition volk_32u_reverse_32u.h:313
static void volk_32u_reverse_32u_byte_shuffle(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition volk_32u_reverse_32u.h:86
#define __VOLK_PREFETCH(addr)
Definition volk_common.h:68
#define RISCV_PERM4(f, v, vidx)
Definition volk_rvv_intrinsics.h:50
#define RISCV_LUT4(f, vtbl, v)
Definition volk_rvv_intrinsics.h:57