Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
volk_8u_x2_encodeframepolar_8u.h
1/* -*- c++ -*- */
2/*
3 * Copyright 2015 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10/*
11 * for documentation see 'volk_8u_x3_encodepolar_8u_x2.h'
12 */
13
14#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
15#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
16#include <string.h>
17
18static inline unsigned int log2_of_power_of_2(unsigned int val)
19{
20 // algorithm from: https://graphics.stanford.edu/~seander/bithacks.html#IntegerLog
21 static const unsigned int b[] = {
22 0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
23 };
24
25 unsigned int res = (val & b[0]) != 0;
26 res |= ((val & b[4]) != 0) << 4;
27 res |= ((val & b[3]) != 0) << 3;
28 res |= ((val & b[2]) != 0) << 2;
29 res |= ((val & b[1]) != 0) << 1;
30 return res;
31}
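/*
 * Worked example (illustrative values, not from the original source): for a
 * power-of-two input the helper assembles the exponent bit by bit from the
 * mask tests above. For val = 32 (binary 100000):
 *
 *   (val & 0xAAAAAAAA) != 0  -> 1   // result bit 0
 *   (val & 0xFFFF0000) != 0  -> 0   // result bit 4
 *   (val & 0xFF00FF00) != 0  -> 0   // result bit 3
 *   (val & 0xF0F0F0F0) != 0  -> 1   // result bit 2
 *   (val & 0xCCCCCCCC) != 0  -> 0   // result bit 1
 *
 * which yields res = 0b00101 = 5 = log2(32). The result is only meaningful
 * when val is a power of two.
 */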
32
33static inline void encodepolar_single_stage(unsigned char* frame_ptr,
34 const unsigned char* temp_ptr,
35 const unsigned int num_branches,
36 const unsigned int frame_half)
37{
38 unsigned int branch, bit;
39 for (branch = 0; branch < num_branches; ++branch) {
40 for (bit = 0; bit < frame_half; ++bit) {
41 *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
42 *(frame_ptr + frame_half) = *(temp_ptr + 1);
43 ++frame_ptr;
44 temp_ptr += 2;
45 }
46 frame_ptr += frame_half;
47 }
48}
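/*
 * Worked example (illustrative): with num_branches = 1 and frame_half = 2,
 * an input temp = {u0, u1, u2, u3} is encoded as
 *
 *   frame[0] = u0 ^ u1,  frame[1] = u2 ^ u3,   // first half
 *   frame[2] = u1,       frame[3] = u3,        // second half
 *
 * i.e. one butterfly stage of the polar transform: XORs of adjacent pairs go
 * to the first half of each branch, the odd-indexed inputs to the second half.
 */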
49
50#ifdef LV_HAVE_GENERIC
51
52static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
53 unsigned char* temp,
54 unsigned int frame_size)
55{
56 unsigned int stage = log2_of_power_of_2(frame_size);
57 unsigned int frame_half = frame_size >> 1;
58 unsigned int num_branches = 1;
59
60 while (stage) {
61 // encode stage
62 encodepolar_single_stage(frame, temp, num_branches, frame_half);
63 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
64
65 // update all the parameters.
66 num_branches = num_branches << 1;
67 frame_half = frame_half >> 1;
68 --stage;
69 }
70}
71#endif /* LV_HAVE_GENERIC */
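/*
 * Minimal usage sketch (illustrative only; the buffer handling and sizes are
 * assumptions, not part of this file). frame_size must be a power of two,
 * both buffers must hold frame_size bytes, temp carries the input frame
 * (typically one code bit per byte) and is clobbered as scratch, and frame
 * receives the encoded result. A real application would normally call the
 * VOLK dispatcher volk_8u_x2_encodeframepolar_8u() instead of a specific
 * protokernel.
 *
 *   unsigned int frame_size = 1024;
 *   unsigned char* temp = (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
 *   unsigned char* frame = (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
 *   // ... fill temp with the bits to encode ...
 *   volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
 *   // ... use frame ...
 *   volk_free(temp);
 *   volk_free(frame);
 */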
72
73#ifdef LV_HAVE_SSSE3
74#include <tmmintrin.h>
75
76static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
77 unsigned char* temp,
78 unsigned int frame_size)
79{
80 if (frame_size < 16) {
81 volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
82 return;
83 }
84
85 const unsigned int po2 = log2_of_power_of_2(frame_size);
86
87 unsigned int stage = po2;
88 unsigned char* frame_ptr = frame;
89 unsigned char* temp_ptr = temp;
90
91 unsigned int frame_half = frame_size >> 1;
92 unsigned int num_branches = 1;
93 unsigned int branch;
94 unsigned int bit;
95
96 // prepare constants
97 const __m128i mask_stage1 = _mm_set_epi8(0x0,
98 0xFF,
99 0x0,
100 0xFF,
101 0x0,
102 0xFF,
103 0x0,
104 0xFF,
105 0x0,
106 0xFF,
107 0x0,
108 0xFF,
109 0x0,
110 0xFF,
111 0x0,
112 0xFF);
113
114 // get some SIMD registers to play with.
115 __m128i r_frame0, r_temp0, shifted;
116
117 {
118 __m128i r_frame1, r_temp1;
119 const __m128i shuffle_separate =
120 _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
121
122 while (stage > 4) {
123 frame_ptr = frame;
124 temp_ptr = temp;
125
126 // for stage = 5 a branch has 32 elements, so higher stages are even bigger.
127 for (branch = 0; branch < num_branches; ++branch) {
128 for (bit = 0; bit < frame_half; bit += 16) {
129 r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
130 temp_ptr += 16;
131 r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr);
132 temp_ptr += 16;
133
134 shifted = _mm_srli_si128(r_temp0, 1);
135 shifted = _mm_and_si128(shifted, mask_stage1);
136 r_temp0 = _mm_xor_si128(shifted, r_temp0);
137 r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
138
139 shifted = _mm_srli_si128(r_temp1, 1);
140 shifted = _mm_and_si128(shifted, mask_stage1);
141 r_temp1 = _mm_xor_si128(shifted, r_temp1);
142 r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
143
144 r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
145 _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
146
147 r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
148 _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
149 frame_ptr += 16;
150 }
151
152 frame_ptr += frame_half;
153 }
154 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
155
156 num_branches = num_branches << 1;
157 frame_half = frame_half >> 1;
158 stage--;
159 }
160 }
161
162 // This last part requires frames of at least 16 bytes (guaranteed by the early return above).
163 // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!
164
165 // reset pointers to correct positions.
166 frame_ptr = frame;
167 temp_ptr = temp;
168
169 // prefetch first chunk
170 __VOLK_PREFETCH(temp_ptr);
171
172 const __m128i shuffle_stage4 =
173 _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
174 const __m128i mask_stage4 = _mm_set_epi8(0x0,
175 0x0,
176 0x0,
177 0x0,
178 0x0,
179 0x0,
180 0x0,
181 0x0,
182 0xFF,
183 0xFF,
184 0xFF,
185 0xFF,
186 0xFF,
187 0xFF,
188 0xFF,
189 0xFF);
190 const __m128i mask_stage3 = _mm_set_epi8(0x0,
191 0x0,
192 0x0,
193 0x0,
194 0xFF,
195 0xFF,
196 0xFF,
197 0xFF,
198 0x0,
199 0x0,
200 0x0,
201 0x0,
202 0xFF,
203 0xFF,
204 0xFF,
205 0xFF);
206 const __m128i mask_stage2 = _mm_set_epi8(0x0,
207 0x0,
208 0xFF,
209 0xFF,
210 0x0,
211 0x0,
212 0xFF,
213 0xFF,
214 0x0,
215 0x0,
216 0xFF,
217 0xFF,
218 0x0,
219 0x0,
220 0xFF,
221 0xFF);
222
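    /*
     * The loop below handles the last four stages entirely in-register: each
     * branch is now a 16-byte block, the shuffle applies the bit-reversal
     * permutation once, and the masked shift-and-XOR steps combine elements a
     * distance of 8, 4, 2 and 1 apart, i.e. the remaining butterfly stages.
     */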
223 for (branch = 0; branch < num_branches; ++branch) {
224 r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
225
226 // prefetch next chunk
227 temp_ptr += 16;
228 __VOLK_PREFETCH(temp_ptr);
229
230 // shuffle once for bit-reversal.
231 r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
232
233 shifted = _mm_srli_si128(r_temp0, 8);
234 shifted = _mm_and_si128(shifted, mask_stage4);
235 r_frame0 = _mm_xor_si128(shifted, r_temp0);
236
237 shifted = _mm_srli_si128(r_frame0, 4);
238 shifted = _mm_and_si128(shifted, mask_stage3);
239 r_frame0 = _mm_xor_si128(shifted, r_frame0);
240
241 shifted = _mm_srli_si128(r_frame0, 2);
242 shifted = _mm_and_si128(shifted, mask_stage2);
243 r_frame0 = _mm_xor_si128(shifted, r_frame0);
244
245 shifted = _mm_srli_si128(r_frame0, 1);
246 shifted = _mm_and_si128(shifted, mask_stage1);
247 r_frame0 = _mm_xor_si128(shifted, r_frame0);
248
249 // store result of chunk.
250 _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
251 frame_ptr += 16;
252 }
253}
254
255#endif /* LV_HAVE_SSSE3 */
256
257#ifdef LV_HAVE_AVX2
258#include <immintrin.h>
259
260static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
261 unsigned char* temp,
262 unsigned int frame_size)
263{
264 if (frame_size < 32) {
265 volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
266 return;
267 }
268
269 const unsigned int po2 = log2_of_power_of_2(frame_size);
270
271 unsigned int stage = po2;
272 unsigned char* frame_ptr = frame;
273 unsigned char* temp_ptr = temp;
274
275 unsigned int frame_half = frame_size >> 1;
276 unsigned int num_branches = 1;
277 unsigned int branch;
278 unsigned int bit;
279
280 // prepare constants
281 const __m256i mask_stage1 = _mm256_set_epi8(0x0,
282 0xFF,
283 0x0,
284 0xFF,
285 0x0,
286 0xFF,
287 0x0,
288 0xFF,
289 0x0,
290 0xFF,
291 0x0,
292 0xFF,
293 0x0,
294 0xFF,
295 0x0,
296 0xFF,
297 0x0,
298 0xFF,
299 0x0,
300 0xFF,
301 0x0,
302 0xFF,
303 0x0,
304 0xFF,
305 0x0,
306 0xFF,
307 0x0,
308 0xFF,
309 0x0,
310 0xFF,
311 0x0,
312 0xFF);
313
314 const __m128i mask_stage0 = _mm_set_epi8(0x0,
315 0xFF,
316 0x0,
317 0xFF,
318 0x0,
319 0xFF,
320 0x0,
321 0xFF,
322 0x0,
323 0xFF,
324 0x0,
325 0xFF,
326 0x0,
327 0xFF,
328 0x0,
329 0xFF);
330 // get some SIMD registers to play with.
331 __m256i r_frame0, r_temp0, shifted;
332 __m128i r_temp2, r_frame2, shifted2;
333 {
334 __m256i r_frame1, r_temp1;
335 __m128i r_frame3, r_temp3;
336 const __m256i shuffle_separate = _mm256_setr_epi8(0,
337 2,
338 4,
339 6,
340 8,
341 10,
342 12,
343 14,
344 1,
345 3,
346 5,
347 7,
348 9,
349 11,
350 13,
351 15,
352 0,
353 2,
354 4,
355 6,
356 8,
357 10,
358 12,
359 14,
360 1,
361 3,
362 5,
363 7,
364 9,
365 11,
366 13,
367 15);
368 const __m128i shuffle_separate128 =
369 _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
370
371 while (stage > 4) {
372 frame_ptr = frame;
373 temp_ptr = temp;
374
375 // for stage = 5 a branch has 32 elements, so higher stages are even bigger.
376 for (branch = 0; branch < num_branches; ++branch) {
377 for (bit = 0; bit < frame_half; bit += 32) {
378 if ((frame_half - bit) <
379 32) // if only 16 bytes remaining in this branch, not 32
380 {
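                    /* This 16-byte fallback only triggers when frame_half == 16:
                     * any larger frame_half is a multiple of 32, so the remaining
                     * count never drops below 32. One pass therefore finishes the
                     * branch and the break below is safe. */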
381 r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr);
382 temp_ptr += 16;
383 r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr);
384 temp_ptr += 16;
385
386 shifted2 = _mm_srli_si128(r_temp2, 1);
387 shifted2 = _mm_and_si128(shifted2, mask_stage0);
388 r_temp2 = _mm_xor_si128(shifted2, r_temp2);
389 r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
390
391 shifted2 = _mm_srli_si128(r_temp3, 1);
392 shifted2 = _mm_and_si128(shifted2, mask_stage0);
393 r_temp3 = _mm_xor_si128(shifted2, r_temp3);
394 r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
395
396 r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
397 _mm_storeu_si128((__m128i*)frame_ptr, r_frame2);
398
399 r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
400 _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
401 frame_ptr += 16;
402 break;
403 }
404 r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
405 temp_ptr += 32;
406 r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
407 temp_ptr += 32;
408
409 shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
410 shifted = _mm256_and_si256(shifted, mask_stage1);
411 r_temp0 = _mm256_xor_si256(shifted, r_temp0);
412 r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
413
414 shifted = _mm256_srli_si256(r_temp1, 1);
415 shifted = _mm256_and_si256(shifted, mask_stage1);
416 r_temp1 = _mm256_xor_si256(shifted, r_temp1);
417 r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
418
419 r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
420 r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
421 r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
422 r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
423
424 _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
425
426 _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
427 frame_ptr += 32;
428 }
429
430 frame_ptr += frame_half;
431 }
432 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
433
434 num_branches = num_branches << 1;
435 frame_half = frame_half >> 1;
436 stage--;
437 }
438 }
439
440 // This last part requires frames of at least 32 bytes (guaranteed by the early return above).
441 // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!
442
443 // reset pointers to correct positions.
444 frame_ptr = frame;
445 temp_ptr = temp;
446
447 // prefetch first chunk
448 __VOLK_PREFETCH(temp_ptr);
449
450 const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
451 8,
452 4,
453 12,
454 2,
455 10,
456 6,
457 14,
458 1,
459 9,
460 5,
461 13,
462 3,
463 11,
464 7,
465 15,
466 0,
467 8,
468 4,
469 12,
470 2,
471 10,
472 6,
473 14,
474 1,
475 9,
476 5,
477 13,
478 3,
479 11,
480 7,
481 15);
482 const __m256i mask_stage4 = _mm256_set_epi8(0x0,
483 0x0,
484 0x0,
485 0x0,
486 0x0,
487 0x0,
488 0x0,
489 0x0,
490 0xFF,
491 0xFF,
492 0xFF,
493 0xFF,
494 0xFF,
495 0xFF,
496 0xFF,
497 0xFF,
498 0x0,
499 0x0,
500 0x0,
501 0x0,
502 0x0,
503 0x0,
504 0x0,
505 0x0,
506 0xFF,
507 0xFF,
508 0xFF,
509 0xFF,
510 0xFF,
511 0xFF,
512 0xFF,
513 0xFF);
514 const __m256i mask_stage3 = _mm256_set_epi8(0x0,
515 0x0,
516 0x0,
517 0x0,
518 0xFF,
519 0xFF,
520 0xFF,
521 0xFF,
522 0x0,
523 0x0,
524 0x0,
525 0x0,
526 0xFF,
527 0xFF,
528 0xFF,
529 0xFF,
530 0x0,
531 0x0,
532 0x0,
533 0x0,
534 0xFF,
535 0xFF,
536 0xFF,
537 0xFF,
538 0x0,
539 0x0,
540 0x0,
541 0x0,
542 0xFF,
543 0xFF,
544 0xFF,
545 0xFF);
546 const __m256i mask_stage2 = _mm256_set_epi8(0x0,
547 0x0,
548 0xFF,
549 0xFF,
550 0x0,
551 0x0,
552 0xFF,
553 0xFF,
554 0x0,
555 0x0,
556 0xFF,
557 0xFF,
558 0x0,
559 0x0,
560 0xFF,
561 0xFF,
562 0x0,
563 0x0,
564 0xFF,
565 0xFF,
566 0x0,
567 0x0,
568 0xFF,
569 0xFF,
570 0x0,
571 0x0,
572 0xFF,
573 0xFF,
574 0x0,
575 0x0,
576 0xFF,
577 0xFF);
578
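    /*
     * Same in-register scheme as the SSSE3 kernel, but each 256-bit register
     * holds two 16-byte branches, so the loop below runs num_branches / 2
     * times and _mm256_srli_si256 conveniently shifts each 128-bit lane
     * (i.e. each branch) independently.
     */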
579 for (branch = 0; branch < num_branches / 2; ++branch) {
580 r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
581
582 // prefetch next chunk
583 temp_ptr += 32;
584 __VOLK_PREFETCH(temp_ptr);
585
586 // shuffle once for bit-reversal.
587 r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
588
589 shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
590 shifted = _mm256_and_si256(shifted, mask_stage4);
591 r_frame0 = _mm256_xor_si256(shifted, r_temp0);
592
593
594 shifted = _mm256_srli_si256(r_frame0, 4);
595 shifted = _mm256_and_si256(shifted, mask_stage3);
596 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
597
598 shifted = _mm256_srli_si256(r_frame0, 2);
599 shifted = _mm256_and_si256(shifted, mask_stage2);
600 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
601
602 shifted = _mm256_srli_si256(r_frame0, 1);
603 shifted = _mm256_and_si256(shifted, mask_stage1);
604 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
605
606 // store result of chunk.
607 _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
608 frame_ptr += 32;
609 }
610}
611#endif /* LV_HAVE_AVX2 */
612
613#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */
614
615#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
616#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
617
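/*
 * The aligned (_a_) protokernels below mirror the unaligned (_u_) variants
 * above; the only difference is that they use aligned loads and stores
 * (_mm_load_si128 / _mm256_load_si256), so both buffers must satisfy the
 * alignment reported by volk_get_alignment().
 */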
618#ifdef LV_HAVE_SSSE3
619#include <tmmintrin.h>
620
621static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
622 unsigned char* temp,
623 unsigned int frame_size)
624{
625 if (frame_size < 16) {
626 volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
627 return;
628 }
629
630 const unsigned int po2 = log2_of_power_of_2(frame_size);
631
632 unsigned int stage = po2;
633 unsigned char* frame_ptr = frame;
634 unsigned char* temp_ptr = temp;
635
636 unsigned int frame_half = frame_size >> 1;
637 unsigned int num_branches = 1;
638 unsigned int branch;
639 unsigned int bit;
640
641 // prepare constants
642 const __m128i mask_stage1 = _mm_set_epi8(0x0,
643 0xFF,
644 0x0,
645 0xFF,
646 0x0,
647 0xFF,
648 0x0,
649 0xFF,
650 0x0,
651 0xFF,
652 0x0,
653 0xFF,
654 0x0,
655 0xFF,
656 0x0,
657 0xFF);
658
659 // get some SIMD registers to play with.
660 __m128i r_frame0, r_temp0, shifted;
661
662 {
663 __m128i r_frame1, r_temp1;
664 const __m128i shuffle_separate =
665 _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
666
667 while (stage > 4) {
668 frame_ptr = frame;
669 temp_ptr = temp;
670
671 // for stage = 5 a branch has 32 elements, so higher stages are even bigger.
672 for (branch = 0; branch < num_branches; ++branch) {
673 for (bit = 0; bit < frame_half; bit += 16) {
674 r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
675 temp_ptr += 16;
676 r_temp1 = _mm_load_si128((__m128i*)temp_ptr);
677 temp_ptr += 16;
678
679 shifted = _mm_srli_si128(r_temp0, 1);
680 shifted = _mm_and_si128(shifted, mask_stage1);
681 r_temp0 = _mm_xor_si128(shifted, r_temp0);
682 r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
683
684 shifted = _mm_srli_si128(r_temp1, 1);
685 shifted = _mm_and_si128(shifted, mask_stage1);
686 r_temp1 = _mm_xor_si128(shifted, r_temp1);
687 r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
688
689 r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
690 _mm_store_si128((__m128i*)frame_ptr, r_frame0);
691
692 r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
693 _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
694 frame_ptr += 16;
695 }
696
697 frame_ptr += frame_half;
698 }
699 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
700
701 num_branches = num_branches << 1;
702 frame_half = frame_half >> 1;
703 stage--;
704 }
705 }
706
707 // This last part requires frames of at least 16 bytes (guaranteed by the early return above).
708 // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!
709
710 // reset pointers to correct positions.
711 frame_ptr = frame;
712 temp_ptr = temp;
713
714 // prefetch first chunk
715 __VOLK_PREFETCH(temp_ptr);
716
717 const __m128i shuffle_stage4 =
718 _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
719 const __m128i mask_stage4 = _mm_set_epi8(0x0,
720 0x0,
721 0x0,
722 0x0,
723 0x0,
724 0x0,
725 0x0,
726 0x0,
727 0xFF,
728 0xFF,
729 0xFF,
730 0xFF,
731 0xFF,
732 0xFF,
733 0xFF,
734 0xFF);
735 const __m128i mask_stage3 = _mm_set_epi8(0x0,
736 0x0,
737 0x0,
738 0x0,
739 0xFF,
740 0xFF,
741 0xFF,
742 0xFF,
743 0x0,
744 0x0,
745 0x0,
746 0x0,
747 0xFF,
748 0xFF,
749 0xFF,
750 0xFF);
751 const __m128i mask_stage2 = _mm_set_epi8(0x0,
752 0x0,
753 0xFF,
754 0xFF,
755 0x0,
756 0x0,
757 0xFF,
758 0xFF,
759 0x0,
760 0x0,
761 0xFF,
762 0xFF,
763 0x0,
764 0x0,
765 0xFF,
766 0xFF);
767
768 for (branch = 0; branch < num_branches; ++branch) {
769 r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
770
771 // prefetch next chunk
772 temp_ptr += 16;
773 __VOLK_PREFETCH(temp_ptr);
774
775 // shuffle once for bit-reversal.
776 r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
777
778 shifted = _mm_srli_si128(r_temp0, 8);
779 shifted = _mm_and_si128(shifted, mask_stage4);
780 r_frame0 = _mm_xor_si128(shifted, r_temp0);
781
782 shifted = _mm_srli_si128(r_frame0, 4);
783 shifted = _mm_and_si128(shifted, mask_stage3);
784 r_frame0 = _mm_xor_si128(shifted, r_frame0);
785
786 shifted = _mm_srli_si128(r_frame0, 2);
787 shifted = _mm_and_si128(shifted, mask_stage2);
788 r_frame0 = _mm_xor_si128(shifted, r_frame0);
789
790 shifted = _mm_srli_si128(r_frame0, 1);
791 shifted = _mm_and_si128(shifted, mask_stage1);
792 r_frame0 = _mm_xor_si128(shifted, r_frame0);
793
794 // store result of chunk.
795 _mm_store_si128((__m128i*)frame_ptr, r_frame0);
796 frame_ptr += 16;
797 }
798}
799#endif /* LV_HAVE_SSSE3 */
800
801#ifdef LV_HAVE_AVX2
802#include <immintrin.h>
803
804static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
805 unsigned char* temp,
806 unsigned int frame_size)
807{
808 if (frame_size < 32) {
809 volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
810 return;
811 }
812
813 const unsigned int po2 = log2_of_power_of_2(frame_size);
814
815 unsigned int stage = po2;
816 unsigned char* frame_ptr = frame;
817 unsigned char* temp_ptr = temp;
818
819 unsigned int frame_half = frame_size >> 1;
820 unsigned int num_branches = 1;
821 unsigned int branch;
822 unsigned int bit;
823
824 // prepare constants
825 const __m256i mask_stage1 = _mm256_set_epi8(0x0,
826 0xFF,
827 0x0,
828 0xFF,
829 0x0,
830 0xFF,
831 0x0,
832 0xFF,
833 0x0,
834 0xFF,
835 0x0,
836 0xFF,
837 0x0,
838 0xFF,
839 0x0,
840 0xFF,
841 0x0,
842 0xFF,
843 0x0,
844 0xFF,
845 0x0,
846 0xFF,
847 0x0,
848 0xFF,
849 0x0,
850 0xFF,
851 0x0,
852 0xFF,
853 0x0,
854 0xFF,
855 0x0,
856 0xFF);
857
858 const __m128i mask_stage0 = _mm_set_epi8(0x0,
859 0xFF,
860 0x0,
861 0xFF,
862 0x0,
863 0xFF,
864 0x0,
865 0xFF,
866 0x0,
867 0xFF,
868 0x0,
869 0xFF,
870 0x0,
871 0xFF,
872 0x0,
873 0xFF);
874 // get some SIMD registers to play with.
875 __m256i r_frame0, r_temp0, shifted;
876 __m128i r_temp2, r_frame2, shifted2;
877 {
878 __m256i r_frame1, r_temp1;
879 __m128i r_frame3, r_temp3;
880 const __m256i shuffle_separate = _mm256_setr_epi8(0,
881 2,
882 4,
883 6,
884 8,
885 10,
886 12,
887 14,
888 1,
889 3,
890 5,
891 7,
892 9,
893 11,
894 13,
895 15,
896 0,
897 2,
898 4,
899 6,
900 8,
901 10,
902 12,
903 14,
904 1,
905 3,
906 5,
907 7,
908 9,
909 11,
910 13,
911 15);
912 const __m128i shuffle_separate128 =
913 _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
914
915 while (stage > 4) {
916 frame_ptr = frame;
917 temp_ptr = temp;
918
919 // for stage = 5 a branch has 32 elements, so higher stages are even bigger.
920 for (branch = 0; branch < num_branches; ++branch) {
921 for (bit = 0; bit < frame_half; bit += 32) {
922 if ((frame_half - bit) <
923 32) // if only 16 bytes remaining in this branch, not 32
924 {
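                    /* Same 16-byte fallback as in the unaligned kernel: it only
                     * runs when frame_half == 16, so the break below is safe. */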
925 r_temp2 = _mm_load_si128((__m128i*)temp_ptr);
926 temp_ptr += 16;
927 r_temp3 = _mm_load_si128((__m128i*)temp_ptr);
928 temp_ptr += 16;
929
930 shifted2 = _mm_srli_si128(r_temp2, 1);
931 shifted2 = _mm_and_si128(shifted2, mask_stage0);
932 r_temp2 = _mm_xor_si128(shifted2, r_temp2);
933 r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
934
935 shifted2 = _mm_srli_si128(r_temp3, 1);
936 shifted2 = _mm_and_si128(shifted2, mask_stage0);
937 r_temp3 = _mm_xor_si128(shifted2, r_temp3);
938 r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
939
940 r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
941 _mm_store_si128((__m128i*)frame_ptr, r_frame2);
942
943 r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
944 _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
945 frame_ptr += 16;
946 break;
947 }
948 r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
949 temp_ptr += 32;
950 r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
951 temp_ptr += 32;
952
953 shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
954 shifted = _mm256_and_si256(shifted, mask_stage1);
955 r_temp0 = _mm256_xor_si256(shifted, r_temp0);
956 r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
957
958 shifted = _mm256_srli_si256(r_temp1, 1);
959 shifted = _mm256_and_si256(shifted, mask_stage1);
960 r_temp1 = _mm256_xor_si256(shifted, r_temp1);
961 r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
962
963 r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
964 r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
965 r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
966 r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
967
968 _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
969
970 _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
971 frame_ptr += 32;
972 }
973
974 frame_ptr += frame_half;
975 }
976 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
977
978 num_branches = num_branches << 1;
979 frame_half = frame_half >> 1;
980 stage--;
981 }
982 }
983
984 // This last part requires frames of at least 32 bytes (guaranteed by the early return above).
985 // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!
986
987 // reset pointers to correct positions.
988 frame_ptr = frame;
989 temp_ptr = temp;
990
991 // prefetch first chunk.
992 __VOLK_PREFETCH(temp_ptr);
993
994 const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
995 8,
996 4,
997 12,
998 2,
999 10,
1000 6,
1001 14,
1002 1,
1003 9,
1004 5,
1005 13,
1006 3,
1007 11,
1008 7,
1009 15,
1010 0,
1011 8,
1012 4,
1013 12,
1014 2,
1015 10,
1016 6,
1017 14,
1018 1,
1019 9,
1020 5,
1021 13,
1022 3,
1023 11,
1024 7,
1025 15);
1026 const __m256i mask_stage4 = _mm256_set_epi8(0x0,
1027 0x0,
1028 0x0,
1029 0x0,
1030 0x0,
1031 0x0,
1032 0x0,
1033 0x0,
1034 0xFF,
1035 0xFF,
1036 0xFF,
1037 0xFF,
1038 0xFF,
1039 0xFF,
1040 0xFF,
1041 0xFF,
1042 0x0,
1043 0x0,
1044 0x0,
1045 0x0,
1046 0x0,
1047 0x0,
1048 0x0,
1049 0x0,
1050 0xFF,
1051 0xFF,
1052 0xFF,
1053 0xFF,
1054 0xFF,
1055 0xFF,
1056 0xFF,
1057 0xFF);
1058 const __m256i mask_stage3 = _mm256_set_epi8(0x0,
1059 0x0,
1060 0x0,
1061 0x0,
1062 0xFF,
1063 0xFF,
1064 0xFF,
1065 0xFF,
1066 0x0,
1067 0x0,
1068 0x0,
1069 0x0,
1070 0xFF,
1071 0xFF,
1072 0xFF,
1073 0xFF,
1074 0x0,
1075 0x0,
1076 0x0,
1077 0x0,
1078 0xFF,
1079 0xFF,
1080 0xFF,
1081 0xFF,
1082 0x0,
1083 0x0,
1084 0x0,
1085 0x0,
1086 0xFF,
1087 0xFF,
1088 0xFF,
1089 0xFF);
1090 const __m256i mask_stage2 = _mm256_set_epi8(0x0,
1091 0x0,
1092 0xFF,
1093 0xFF,
1094 0x0,
1095 0x0,
1096 0xFF,
1097 0xFF,
1098 0x0,
1099 0x0,
1100 0xFF,
1101 0xFF,
1102 0x0,
1103 0x0,
1104 0xFF,
1105 0xFF,
1106 0x0,
1107 0x0,
1108 0xFF,
1109 0xFF,
1110 0x0,
1111 0x0,
1112 0xFF,
1113 0xFF,
1114 0x0,
1115 0x0,
1116 0xFF,
1117 0xFF,
1118 0x0,
1119 0x0,
1120 0xFF,
1121 0xFF);
1122
1123 for (branch = 0; branch < num_branches / 2; ++branch) {
1124 r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
1125
1126 // prefetch next chunk
1127 temp_ptr += 32;
1128 __VOLK_PREFETCH(temp_ptr);
1129
1130 // shuffle once for bit-reversal.
1131 r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
1132
1133 shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
1134 shifted = _mm256_and_si256(shifted, mask_stage4);
1135 r_frame0 = _mm256_xor_si256(shifted, r_temp0);
1136
1137 shifted = _mm256_srli_si256(r_frame0, 4);
1138 shifted = _mm256_and_si256(shifted, mask_stage3);
1139 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1140
1141 shifted = _mm256_srli_si256(r_frame0, 2);
1142 shifted = _mm256_and_si256(shifted, mask_stage2);
1143 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1144
1145 shifted = _mm256_srli_si256(r_frame0, 1);
1146 shifted = _mm256_and_si256(shifted, mask_stage1);
1147 r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1148
1149 // store result of chunk.
1150 _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
1151 frame_ptr += 32;
1152 }
1153}
1154#endif /* LV_HAVE_AVX2 */
1155
1156#ifdef LV_HAVE_RVV
1157#include <riscv_vector.h>
1158
1159static inline void volk_8u_x2_encodeframepolar_8u_rvv(unsigned char* frame,
1160 unsigned char* temp,
1161 unsigned int frame_size)
1162{
1163 unsigned int stage = log2_of_power_of_2(frame_size);
1164 unsigned int frame_half = frame_size >> 1;
1165 unsigned int num_branches = 1;
1166
1167 while (stage) {
1168 // encode stage
1169 if (frame_half < 8) {
1170 encodepolar_single_stage(frame, temp, num_branches, frame_half);
1171 } else {
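            /*
             * Vectorized path: each 16-bit element loaded below pairs two adjacent
             * input bytes; RISC-V is little-endian, so vnsrl by 0 extracts the
             * even-indexed bytes and vnsrl by 8 the odd-indexed ones. Their XOR is
             * stored to the first half of the branch and the odd bytes to the
             * second half, matching encodepolar_single_stage. The scalar fallback
             * above is kept for frame_half < 8, presumably because vector setup
             * overhead dominates for such short branches.
             */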
1172 unsigned char *in = temp, *out = frame;
1173 for (size_t branch = 0; branch < num_branches; ++branch) {
1174 size_t n = frame_half;
1175 for (size_t vl; n > 0; n -= vl, in += vl * 2, out += vl) {
1176 vl = __riscv_vsetvl_e8m1(n);
1177 vuint16m2_t vc = __riscv_vle16_v_u16m2((uint16_t*)in, vl);
1178 vuint8m1_t v1 = __riscv_vnsrl(vc, 0, vl);
1179 vuint8m1_t v2 = __riscv_vnsrl(vc, 8, vl);
1180 __riscv_vse8(out, __riscv_vxor(v1, v2, vl), vl);
1181 __riscv_vse8(out + frame_half, v2, vl);
1182 }
1183 out += frame_half;
1184 }
1185 }
1186 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
1187
1188 // update all the parameters.
1189 num_branches = num_branches << 1;
1190 frame_half = frame_half >> 1;
1191 --stage;
1192 }
1193}
1194#endif /* LV_HAVE_RVV */
1195
1196#ifdef LV_HAVE_RVVSEG
1197#include <riscv_vector.h>
1198
1199static inline void volk_8u_x2_encodeframepolar_8u_rvvseg(unsigned char* frame,
1200 unsigned char* temp,
1201 unsigned int frame_size)
1202{
1203 unsigned int stage = log2_of_power_of_2(frame_size);
1204 unsigned int frame_half = frame_size >> 1;
1205 unsigned int num_branches = 1;
1206
1207 while (stage) {
1208 // encode stage
1209 if (frame_half < 8) {
1210 encodepolar_single_stage(frame, temp, num_branches, frame_half);
1211 } else {
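            /*
             * Same structure as the plain RVV kernel, but the segmented load
             * vlseg2e8 de-interleaves even- and odd-indexed bytes directly into
             * two register groups, replacing the widening load and the two
             * narrowing shifts.
             */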
1212 unsigned char *in = temp, *out = frame;
1213 for (size_t branch = 0; branch < num_branches; ++branch) {
1214 size_t n = frame_half;
1215 for (size_t vl; n > 0; n -= vl, in += vl * 2, out += vl) {
1216 vl = __riscv_vsetvl_e8m1(n);
1217 vuint8m1x2_t vc = __riscv_vlseg2e8_v_u8m1x2(in, vl);
1218 vuint8m1_t v1 = __riscv_vget_u8m1(vc, 0);
1219 vuint8m1_t v2 = __riscv_vget_u8m1(vc, 1);
1220 __riscv_vse8(out, __riscv_vxor(v1, v2, vl), vl);
1221 __riscv_vse8(out + frame_half, v2, vl);
1222 }
1223 out += frame_half;
1224 }
1225 }
1226 memcpy(temp, frame, sizeof(unsigned char) * frame_size);
1227
1228 // update all the parameters.
1229 num_branches = num_branches << 1;
1230 frame_half = frame_half >> 1;
1231 --stage;
1232 }
1233}
1234#endif /* LV_HAVE_RVVSEG */
1235
1236#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */