#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#include <string.h>
// Branch-free integer log2 for power-of-two inputs (classic bit hack).
static inline unsigned int log2_of_power_of_2(unsigned int val)
{
    static const unsigned int b[] = {
        0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
    };

    unsigned int res = (val & b[0]) != 0;
    res |= ((val & b[4]) != 0) << 4;
    res |= ((val & b[3]) != 0) << 3;
    res |= ((val & b[2]) != 0) << 2;
    res |= ((val & b[1]) != 0) << 1;
    return res;
}
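/* Illustrative reference (not part of the kernel API; the helper name below is
 * ours): for a power-of-two input val = 1u << k, the branch-free hack above
 * returns k, the same value a naive shift loop produces. */
static inline unsigned int log2_of_power_of_2_naive(unsigned int val)
{
    unsigned int k = 0;
    while (val >>= 1) { // shift the single set bit down until it disappears
        ++k;
    }
    return k;
}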
static inline void encodepolar_single_stage(unsigned char* frame_ptr,
                                            const unsigned char* temp_ptr,
                                            const unsigned int num_branches,
                                            const unsigned int frame_half)
{
    unsigned int branch, bit;
    for (branch = 0; branch < num_branches; ++branch) {
        for (bit = 0; bit < frame_half; ++bit) {
            *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
            *(frame_ptr + frame_half) = *(temp_ptr + 1);
            ++frame_ptr;
            temp_ptr += 2;
        }
        frame_ptr += frame_half;
    }
}
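/* What one stage computes, written index-wise (an equivalent sketch of the
 * pointer arithmetic above; this helper is illustrative and not referenced by
 * the kernels): within each branch of length 2 * frame_half, output i is the
 * XOR of the input pair (2i, 2i + 1) and output i + frame_half is the second
 * element of that pair. */
static inline void encodepolar_single_stage_indexed(unsigned char* frame,
                                                    const unsigned char* temp,
                                                    unsigned int num_branches,
                                                    unsigned int frame_half)
{
    unsigned int branch, i;
    for (branch = 0; branch < num_branches; ++branch) {
        const unsigned char* in = temp + branch * 2 * frame_half;
        unsigned char* out = frame + branch * 2 * frame_half;
        for (i = 0; i < frame_half; ++i) {
            out[i] = (unsigned char)(in[2 * i] ^ in[2 * i + 1]);
            out[frame_half + i] = in[2 * i + 1];
        }
    }
}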
#ifdef LV_HAVE_GENERIC
static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    unsigned int stage = log2_of_power_of_2(frame_size);
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    while (stage) { // one pass per encoder stage
        encodepolar_single_stage(frame, temp, num_branches, frame_half);
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);
        num_branches = num_branches << 1; // twice as many branches,
        frame_half = frame_half >> 1;     // each half as long
        --stage;
    }
}
#endif /* LV_HAVE_GENERIC */
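/* Usage sketch (illustrative; the function name below is ours, and normal
 * callers go through the volk_8u_x2_encodeframepolar_8u dispatcher instead):
 * frame_size must be a power of two, temp holds the input bit bytes and is
 * clobbered, and the encoded frame ends up in frame. For the 4-byte input
 * {0, 1, 0, 0} the two stages yield {1, 0, 1, 0}. */
#ifdef LV_HAVE_GENERIC
static inline void encodeframepolar_usage_example(void)
{
    unsigned char temp[4] = { 0, 1, 0, 0 };  // u1 is the only set bit
    unsigned char frame[4] = { 0, 0, 0, 0 };
    volk_8u_x2_encodeframepolar_8u_generic(frame, temp, 4);
    // frame now holds { 1, 0, 1, 0 }
}
#endif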
#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    if (frame_size < 16) {
        volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
        return;
    }
    const unsigned int po2 = log2_of_power_of_2(frame_size);
    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch, bit;

    // mask_stage1 keeps only the even byte lanes, so the XOR of each
    // (even, odd) input pair lands on the even lane.
    const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                                             0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

    __m128i r_frame0, r_temp0, shifted;
    __m128i r_frame1, r_temp1;
    const __m128i shuffle_separate =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

    // first stages: two 16-byte vectors per iteration until a branch fits in 16 bytes
    while (stage > 4) {
        frame_ptr = frame;
        temp_ptr = temp;
        for (branch = 0; branch < num_branches; ++branch) {
            for (bit = 0; bit < frame_half; bit += 16) {
                r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
                temp_ptr += 16;
                r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr);
                temp_ptr += 16;

                shifted = _mm_srli_si128(r_temp0, 1);
                shifted = _mm_and_si128(shifted, mask_stage1);
                r_temp0 = _mm_xor_si128(shifted, r_temp0);
                r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);

                shifted = _mm_srli_si128(r_temp1, 1);
                shifted = _mm_and_si128(shifted, mask_stage1);
                r_temp1 = _mm_xor_si128(shifted, r_temp1);
                r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);

                r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
                _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);

                r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
                _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1);

                frame_ptr += 16;
            }
            frame_ptr += frame_half;
        }
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
        --stage;
    }
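    /* How the vector path above maps to the scalar rule (explanatory note):
     * _mm_srli_si128(x, 1) places each odd input byte on top of its even
     * neighbour, the AND with mask_stage1 confines the following XOR to the
     * even lanes, shuffle_separate then gathers the XOR results into the low
     * 8 bytes and the untouched odd bytes into the high 8 bytes, and the
     * unpacklo/unpackhi pair merges two such vectors into the 16 "first half"
     * and 16 "second half" outputs of one 32-byte input chunk. */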
    // last four stages: each 16-byte branch is finished entirely in registers;
    // shuffle_stage4 applies the remaining deinterleave steps as one permutation.
    const __m128i shuffle_stage4 =
        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    // Each stage mask confines the XOR to the first half of its sub-branch:
    // 8-byte halves, then 4-byte quarters, then 2-byte pairs.
    const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
                                             0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                                             0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                                             0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
    frame_ptr = frame;
    temp_ptr = temp;

    for (branch = 0; branch < num_branches; ++branch) {
        r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);

        // combined deinterleave for the last four stages
        r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm_srli_si128(r_temp0, 8);
        shifted = _mm_and_si128(shifted, mask_stage4);
        r_frame0 = _mm_xor_si128(shifted, r_temp0);

        shifted = _mm_srli_si128(r_frame0, 4);
        shifted = _mm_and_si128(shifted, mask_stage3);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 2);
        shifted = _mm_and_si128(shifted, mask_stage2);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 1);
        shifted = _mm_and_si128(shifted, mask_stage1);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
        temp_ptr += 16;
        frame_ptr += 16;
    }
}
#endif /* LV_HAVE_SSSE3 */
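/* Scalar model of the "last four stages" block (illustrative helper, not used
 * by the kernels): once a branch fits in 16 bytes, the remaining stages with
 * frame_half = 8, 4, 2, 1 never cross the 16-byte boundary, which is what lets
 * the SIMD code above finish each branch inside a single register. */
static inline void encodepolar_last4_stages_model(unsigned char* branch16)
{
    unsigned char tmp[16];
    unsigned int half, start, i;
    for (half = 8; half >= 1; half >>= 1) {
        for (i = 0; i < 16; ++i) {
            tmp[i] = branch16[i];
        }
        for (start = 0; start < 16; start += 2 * half) { // each sub-branch
            for (i = 0; i < half; ++i) {
                branch16[start + i] =
                    (unsigned char)(tmp[start + 2 * i] ^ tmp[start + 2 * i + 1]);
                branch16[start + half + i] = tmp[start + 2 * i + 1];
            }
        }
    }
}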
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
                                                         unsigned char* temp,
                                                         unsigned int frame_size)
{
    if (frame_size < 32) {
        volk_8u_x2_encodeframepolar_8u_u_ssse3(frame, temp, frame_size);
        return;
    }
    const unsigned int po2 = log2_of_power_of_2(frame_size);
    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch, bit;

    // even-lane masks: 256-bit for the AVX2 path, 128-bit for the tail fallback
    const __m256i mask_stage1 = _mm256_set_epi8(
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
    const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                                             0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

    __m256i r_frame0, r_temp0, shifted;
    __m128i r_temp2, r_frame2, shifted2;
    __m256i r_frame1, r_temp1;
    __m128i r_frame3, r_temp3;
    // per-lane pshufb pattern: even bytes to the low half of each 128-bit lane
    const __m256i shuffle_separate = _mm256_setr_epi8(
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    const __m128i shuffle_separate128 =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

    // first stages
    while (stage > 4) {
        frame_ptr = frame;
        temp_ptr = temp;
        for (branch = 0; branch < num_branches; ++branch) {
            for (bit = 0; bit < frame_half; bit += 32) {
                if ((frame_half - bit) < 32) { // only 16 bytes left: 128-bit path
                    r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr);
                    temp_ptr += 16;
                    r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr);
                    temp_ptr += 16;

                    shifted2 = _mm_srli_si128(r_temp2, 1);
                    shifted2 = _mm_and_si128(shifted2, mask_stage0);
                    r_temp2 = _mm_xor_si128(shifted2, r_temp2);
                    r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);

                    shifted2 = _mm_srli_si128(r_temp3, 1);
                    shifted2 = _mm_and_si128(shifted2, mask_stage0);
                    r_temp3 = _mm_xor_si128(shifted2, r_temp3);
                    r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);

                    r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
                    _mm_storeu_si128((__m128i*)frame_ptr, r_frame2);

                    r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
                    _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3);

                    frame_ptr += 16;
                    continue;
                }

                r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
                temp_ptr += 32;
                r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
                temp_ptr += 32;

                shifted = _mm256_srli_si256(r_temp0, 1);
                shifted = _mm256_and_si256(shifted, mask_stage1);
                r_temp0 = _mm256_xor_si256(shifted, r_temp0);
                r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

                shifted = _mm256_srli_si256(r_temp1, 1);
                shifted = _mm256_and_si256(shifted, mask_stage1);
                r_temp1 = _mm256_xor_si256(shifted, r_temp1);
                r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

                r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
                r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
                r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
                r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

                _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
                _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);

                frame_ptr += 32;
            }
            frame_ptr += frame_half;
        }
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
        --stage;
    }
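    /* Note on the AVX2 first-stage loop above: _mm256_shuffle_epi8 and
     * _mm256_srli_si256 operate within each 128-bit lane, so after the
     * per-lane deinterleave the two lanes' partial results are interleaved.
     * _mm256_permute4x64_epi64 with 0xd8 (quadword order 0, 2, 1, 3) regroups
     * them so that all 32 "XOR" outputs and all 32 "pass-through" outputs are
     * contiguous before being stored. When a branch half has only 16 bytes
     * left, the loop drops to the 128-bit path instead. */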
    // last four stages: each 256-bit register holds two 16-byte branches,
    // so the loop below runs num_branches / 2 times.
    const __m256i shuffle_stage4 = _mm256_setr_epi8(
        0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
        0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m256i mask_stage4 = _mm256_set_epi8(
        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m256i mask_stage3 = _mm256_set_epi8(
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m256i mask_stage2 = _mm256_set_epi8(
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);

    frame_ptr = frame;
    temp_ptr = temp;

    for (branch = 0; branch < num_branches / 2; ++branch) {
        r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
        temp_ptr += 32;

        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm256_srli_si256(r_temp0, 8);
        shifted = _mm256_and_si256(shifted, mask_stage4);
        r_frame0 = _mm256_xor_si256(shifted, r_temp0);

        shifted = _mm256_srli_si256(r_frame0, 4);
        shifted = _mm256_and_si256(shifted, mask_stage3);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 2);
        shifted = _mm256_and_si256(shifted, mask_stage2);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
        frame_ptr += 32;
    }
}
#endif /* LV_HAVE_AVX2 */

#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */
#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
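/* The aligned (_a_) kernels below follow the same stage structure as the
 * unaligned (_u_) kernels above but use aligned SSE/AVX loads and stores
 * (_mm_load_si128, _mm256_store_si256, ...), so both buffers are expected to
 * be suitably aligned, e.g. obtained from volk_malloc(). */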
#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    if (frame_size < 16) {
        volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
        return;
    }
    const unsigned int po2 = log2_of_power_of_2(frame_size);
    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch, bit;

    const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                                             0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
    __m128i r_frame0, r_temp0, shifted;
    __m128i r_frame1, r_temp1;
    const __m128i shuffle_separate =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

    // first stages
    while (stage > 4) {
        frame_ptr = frame;
        temp_ptr = temp;
        for (branch = 0; branch < num_branches; ++branch) {
            for (bit = 0; bit < frame_half; bit += 16) {
                r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
                temp_ptr += 16;
                r_temp1 = _mm_load_si128((__m128i*)temp_ptr);
                temp_ptr += 16;

                shifted = _mm_srli_si128(r_temp0, 1);
                shifted = _mm_and_si128(shifted, mask_stage1);
                r_temp0 = _mm_xor_si128(shifted, r_temp0);
                r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);

                shifted = _mm_srli_si128(r_temp1, 1);
                shifted = _mm_and_si128(shifted, mask_stage1);
                r_temp1 = _mm_xor_si128(shifted, r_temp1);
                r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);

                r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
                _mm_store_si128((__m128i*)frame_ptr, r_frame0);

                r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
                _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1);

                frame_ptr += 16;
            }
            frame_ptr += frame_half;
        }
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
        --stage;
    }
    // last four stages: same in-register scheme as the unaligned SSSE3 kernel
    const __m128i shuffle_stage4 =
        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
                                             0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                                             0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                                             0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
    frame_ptr = frame;
    temp_ptr = temp;

    for (branch = 0; branch < num_branches; ++branch) {
        r_temp0 = _mm_load_si128((__m128i*)temp_ptr);

        r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm_srli_si128(r_temp0, 8);
        shifted = _mm_and_si128(shifted, mask_stage4);
        r_frame0 = _mm_xor_si128(shifted, r_temp0);

        shifted = _mm_srli_si128(r_frame0, 4);
        shifted = _mm_and_si128(shifted, mask_stage3);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 2);
        shifted = _mm_and_si128(shifted, mask_stage2);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 1);
        shifted = _mm_and_si128(shifted, mask_stage1);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        _mm_store_si128((__m128i*)frame_ptr, r_frame0);
        temp_ptr += 16;
        frame_ptr += 16;
    }
}
#endif /* LV_HAVE_SSSE3 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
                                                         unsigned char* temp,
                                                         unsigned int frame_size)
{
    if (frame_size < 32) {
        volk_8u_x2_encodeframepolar_8u_a_ssse3(frame, temp, frame_size);
        return;
    }
    const unsigned int po2 = log2_of_power_of_2(frame_size);
    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch, bit;

    // even-lane masks: 256-bit for the AVX2 path, 128-bit for the tail fallback
    const __m256i mask_stage1 = _mm256_set_epi8(
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
    const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                                             0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

    __m256i r_frame0, r_temp0, shifted;
    __m128i r_temp2, r_frame2, shifted2;
    __m256i r_frame1, r_temp1;
    __m128i r_frame3, r_temp3;
    const __m256i shuffle_separate = _mm256_setr_epi8(
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    const __m128i shuffle_separate128 =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

    // first stages
    while (stage > 4) {
        frame_ptr = frame;
        temp_ptr = temp;
        for (branch = 0; branch < num_branches; ++branch) {
            for (bit = 0; bit < frame_half; bit += 32) {
                if ((frame_half - bit) < 32) { // only 16 bytes left: 128-bit path
                    r_temp2 = _mm_load_si128((__m128i*)temp_ptr);
                    temp_ptr += 16;
                    r_temp3 = _mm_load_si128((__m128i*)temp_ptr);
                    temp_ptr += 16;

                    shifted2 = _mm_srli_si128(r_temp2, 1);
                    shifted2 = _mm_and_si128(shifted2, mask_stage0);
                    r_temp2 = _mm_xor_si128(shifted2, r_temp2);
                    r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);

                    shifted2 = _mm_srli_si128(r_temp3, 1);
                    shifted2 = _mm_and_si128(shifted2, mask_stage0);
                    r_temp3 = _mm_xor_si128(shifted2, r_temp3);
                    r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);

                    r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
                    _mm_store_si128((__m128i*)frame_ptr, r_frame2);

                    r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
                    _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3);

                    frame_ptr += 16;
                    continue;
                }

                r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
                temp_ptr += 32;
                r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
                temp_ptr += 32;

                shifted = _mm256_srli_si256(r_temp0, 1);
                shifted = _mm256_and_si256(shifted, mask_stage1);
                r_temp0 = _mm256_xor_si256(shifted, r_temp0);
                r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

                shifted = _mm256_srli_si256(r_temp1, 1);
                shifted = _mm256_and_si256(shifted, mask_stage1);
                r_temp1 = _mm256_xor_si256(shifted, r_temp1);
                r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

                r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
                r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
                r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
                r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

                _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
                _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);

                frame_ptr += 32;
            }
            frame_ptr += frame_half;
        }
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
        --stage;
    }
    // last four stages: each 256-bit register holds two 16-byte branches
    const __m256i shuffle_stage4 = _mm256_setr_epi8(
        0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
        0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m256i mask_stage4 = _mm256_set_epi8(
        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m256i mask_stage3 = _mm256_set_epi8(
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m256i mask_stage2 = _mm256_set_epi8(
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);

    frame_ptr = frame;
    temp_ptr = temp;

    for (branch = 0; branch < num_branches / 2; ++branch) {
        r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
        temp_ptr += 32;

        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm256_srli_si256(r_temp0, 8);
        shifted = _mm256_and_si256(shifted, mask_stage4);
        r_frame0 = _mm256_xor_si256(shifted, r_temp0);

        shifted = _mm256_srli_si256(r_frame0, 4);
        shifted = _mm256_and_si256(shifted, mask_stage3);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 2);
        shifted = _mm256_and_si256(shifted, mask_stage2);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
        frame_ptr += 32;
    }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

static inline void volk_8u_x2_encodeframepolar_8u_rvv(unsigned char* frame,
                                                      unsigned char* temp,
                                                      unsigned int frame_size)
{
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;

    while (num_branches < frame_size) {
        if (frame_half < 8) {
            // short branch halves: the scalar stage is cheaper than vector setup
            encodepolar_single_stage(frame, temp, num_branches, frame_half);
        } else {
            unsigned char *in = temp, *out = frame;
            for (size_t branch = 0; branch < num_branches; ++branch) {
                size_t n = frame_half;
                for (size_t vl; n > 0; n -= vl, in += vl * 2, out += vl) {
                    vl = __riscv_vsetvl_e8m1(n);
                    vuint16m2_t vc = __riscv_vle16_v_u16m2((uint16_t*)in, vl);
                    vuint8m1_t v1 = __riscv_vnsrl(vc, 0, vl);
                    vuint8m1_t v2 = __riscv_vnsrl(vc, 8, vl);
                    __riscv_vse8(out, __riscv_vxor(v1, v2, vl), vl);
                    __riscv_vse8(out + frame_half, v2, vl);
                }
                out += frame_half; // skip the second half written above
            }
        }
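        /* Note on the vector loop above: each iteration loads vl (even, odd)
         * byte pairs as 16-bit elements; on little-endian RISC-V the narrowing
         * shifts vnsrl by 0 and by 8 recover the even-indexed and odd-indexed
         * bytes, i.e. the two inputs of each butterfly, without a separate
         * shuffle step. */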
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
    }
}
#endif /* LV_HAVE_RVV */
#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>

static inline void volk_8u_x2_encodeframepolar_8u_rvvseg(unsigned char* frame,
                                                         unsigned char* temp,
                                                         unsigned int frame_size)
{
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;

    while (num_branches < frame_size) {
        if (frame_half < 8) {
            encodepolar_single_stage(frame, temp, num_branches, frame_half);
        } else {
            unsigned char *in = temp, *out = frame;
            for (size_t branch = 0; branch < num_branches; ++branch) {
                size_t n = frame_half;
                for (size_t vl; n > 0; n -= vl, in += vl * 2, out += vl) {
                    vl = __riscv_vsetvl_e8m1(n);
                    vuint8m1x2_t vc = __riscv_vlseg2e8_v_u8m1x2(in, vl);
                    vuint8m1_t v1 = __riscv_vget_u8m1(vc, 0);
                    vuint8m1_t v2 = __riscv_vget_u8m1(vc, 1);
                    __riscv_vse8(out, __riscv_vxor(v1, v2, vl), vl);
                    __riscv_vse8(out + frame_half, v2, vl);
                }
                out += frame_half; // skip the second half written above
            }
        }
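        /* Note: compared with the plain RVV kernel, the segmented load
         * vlseg2e8 performs the even/odd deinterleave as part of the load
         * itself, so no widening to 16 bits or narrowing shifts are needed. */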
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
    }
}
#endif /* LV_HAVE_RVVSEG */

#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */