10#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
11#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
/*
 * AVX2 kernel, aligned variant: multiplies interleaved int8 complex samples
 * from `a` by the conjugate of the samples from `b` and writes interleaved
 * int16 complex results to `c`.
 *
 * The vector loop uses _mm_load_si128 and _mm256_store_si256, which require
 * 16-byte and 32-byte aligned addresses respectively.
 *
 * NOTE(review): this listing is incomplete. The aVector/bVector parameters,
 * the braces, the declaration and advancement of a/b/c, and the definition of
 * `temp` are not visible here.
 */
28static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(
lv_16sc_t* cVector,
31 unsigned int num_points)
33 unsigned int number = 0;
/* Despite its name this is num_points / 8: one 128-bit load holds 16 int8
 * values, i.e. 8 complex points at two int8 values per point. */
34 const unsigned int quarterPoints = num_points / 8;
36 __m256i x, y, realz, imagz;
/* _mm256_set_epi16 lists arguments from the highest element down, so the
 * even (real) int16 slots get +1 and the odd (imaginary) slots get -1. */
40 __m256i conjugateSign =
41 _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
43 for (; number < quarterPoints; number++) {
/* Sign-extend 16 int8 values from each input to 16 int16 values. */
45 x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
46 y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
/* madd sums adjacent int16 products into int32:
 * aReal*bReal + aImag*bImag, the real part of a * conj(b). */
49 realz = _mm256_madd_epi16(x, y);
/* Negate the imaginary slots of b: (bReal, -bImag). */
52 y = _mm256_sign_epi16(y, conjugateSign);
/* Swap each adjacent int16 pair: (-bImag, bReal). */
55 y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
56 _MM_SHUFFLE(2, 3, 0, 1));
/* aImag*bReal - aReal*bImag, the imaginary part of a * conj(b). */
59 imagz = _mm256_madd_epi16(x, y);
/* Interleave real/imag int32 results, then pack to int16 with signed
 * saturation before storing. */
63 _mm256_store_si256((__m256i*)c,
64 _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
65 _mm256_unpackhi_epi32(realz, imagz)));
/* Scalar tail: handle the points left over after the 8-point vector loop. */
72 number = quarterPoints * 8;
73 int16_t* c16Ptr = (int16_t*)&cVector[number];
74 int8_t* a8Ptr = (int8_t*)&aVector[number];
75 int8_t* b8Ptr = (int8_t*)&bVector[number];
76 for (; number < num_points; number++) {
/* Each point is two consecutive int8 values: real first, then imaginary. */
77 float aReal = (float)*a8Ptr++;
78 float aImag = (float)*a8Ptr++;
80 float bReal = (float)*b8Ptr++;
81 float bImag = (float)*b8Ptr++;
/* Clamp the real part to SHRT_MAX before narrowing to int16; only the upper
 * bound is checked.
 * NOTE(review): `temp` is defined on a line missing from this listing, and no
 * store of the imaginary part is visible for this function. */
85 *c16Ptr++ = (int16_t)(
lv_creal(temp) > SHRT_MAX ? SHRT_MAX :
lv_creal(temp));
/*
 * SSE4.1 kernel, aligned variant: multiplies interleaved int8 complex samples
 * from `a` by the conjugate of the samples from `b` and writes interleaved
 * int16 complex results to `c`, four points per vector iteration.
 *
 * The result is written with _mm_store_si128, which requires a 16-byte
 * aligned address.
 *
 * NOTE(review): this listing is incomplete. The aVector/bVector parameters,
 * the braces, the declaration and advancement of a/b/c, and the definition of
 * `temp` are not visible here.
 */
102static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(
lv_16sc_t* cVector,
105 unsigned int num_points)
107 unsigned int number = 0;
/* One 64-bit load holds 8 int8 values, i.e. 4 complex points. */
108 const unsigned int quarterPoints = num_points / 4;
110 __m128i x, y, realz, imagz;
/* _mm_set_epi16 lists arguments from the highest element down, so the even
 * (real) int16 slots get +1 and the odd (imaginary) slots get -1. */
114 __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
116 for (; number < quarterPoints; number++) {
/* Load the low 8 bytes of each input and sign-extend them to 8 int16 values. */
118 x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
119 y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
/* madd sums adjacent int16 products into int32:
 * aReal*bReal + aImag*bImag, the real part of a * conj(b). */
122 realz = _mm_madd_epi16(x, y);
/* Negate the imaginary slots of b: (bReal, -bImag). */
125 y = _mm_sign_epi16(y, conjugateSign);
/* Swap each adjacent int16 pair: (-bImag, bReal). */
128 y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
129 _MM_SHUFFLE(2, 3, 0, 1));
/* aImag*bReal - aReal*bImag, the imaginary part of a * conj(b). */
132 imagz = _mm_madd_epi16(x, y);
/* Interleave real/imag int32 results, then pack to int16 with signed
 * saturation before storing. */
134 _mm_store_si128((__m128i*)c,
135 _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz),
136 _mm_unpackhi_epi32(realz, imagz)));
/* Scalar tail: handle the points left over after the 4-point vector loop. */
143 number = quarterPoints * 4;
144 int16_t* c16Ptr = (int16_t*)&cVector[number];
145 int8_t* a8Ptr = (int8_t*)&aVector[number];
146 int8_t* b8Ptr = (int8_t*)&bVector[number];
147 for (; number < num_points; number++) {
/* Each point is two consecutive int8 values: real first, then imaginary. */
148 float aReal = (float)*a8Ptr++;
149 float aImag = (float)*a8Ptr++;
151 float bReal = (float)*b8Ptr++;
152 float bImag = (float)*b8Ptr++;
/* Clamp the real part to SHRT_MAX before narrowing to int16; only the upper
 * bound is checked.
 * NOTE(review): `temp` is defined on a line missing from this listing. */
156 *c16Ptr++ = (int16_t)(
lv_creal(temp) > SHRT_MAX ? SHRT_MAX :
lv_creal(temp));
/* The imaginary part is narrowed to int16 without any clamp. */
157 *c16Ptr++ = (int16_t)
lv_cimag(temp);
162#ifdef LV_HAVE_GENERIC
/*
 * Tail of the generic (scalar) implementation: walks num_points complex
 * points, reading two int8 values per point from each input and writing two
 * int16 values per point to the output.
 *
 * NOTE(review): the function name, the earlier parameters, the braces and the
 * definition of `temp` are on lines missing from this listing.
 */
174 unsigned int num_points)
176 unsigned int number = 0;
/* View the complex buffers as flat arrays of their scalar components. */
177 int16_t* c16Ptr = (int16_t*)cVector;
178 int8_t* a8Ptr = (int8_t*)aVector;
179 int8_t* b8Ptr = (int8_t*)bVector;
180 for (number = 0; number < num_points; number++) {
/* Each point is two consecutive int8 values: real first, then imaginary. */
181 float aReal = (float)*a8Ptr++;
182 float aImag = (float)*a8Ptr++;
184 float bReal = (float)*b8Ptr++;
185 float bImag = (float)*b8Ptr++;
/* Clamp the real part to SHRT_MAX before narrowing to int16; only the upper
 * bound is checked. */
189 *c16Ptr++ = (int16_t)(
lv_creal(temp) > SHRT_MAX ? SHRT_MAX :
lv_creal(temp));
/* The imaginary part is narrowed to int16 without any clamp. */
190 *c16Ptr++ = (int16_t)
lv_cimag(temp);
197#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H
198#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H
205#include <immintrin.h>
/*
 * AVX2 kernel, unaligned variant: multiplies interleaved int8 complex samples
 * from `a` by the conjugate of the samples from `b` and writes interleaved
 * int16 complex results to `c`, eight points per vector iteration.
 *
 * Uses _mm_loadu_si128 and _mm256_storeu_si256, so the vector loop places no
 * alignment requirement on the addresses it touches.
 *
 * NOTE(review): this listing is incomplete. The aVector/bVector parameters,
 * the braces, the declaration and advancement of a/b/c, and the definition of
 * `temp` are not visible here.
 */
214static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(
lv_16sc_t* cVector,
217 unsigned int num_points)
219 unsigned int number = 0;
/* One 128-bit load holds 16 int8 values, i.e. 8 complex points. */
220 const unsigned int oneEigthPoints = num_points / 8;
222 __m256i x, y, realz, imagz;
/* _mm256_set_epi16 lists arguments from the highest element down, so the
 * even (real) int16 slots get +1 and the odd (imaginary) slots get -1. */
226 __m256i conjugateSign =
227 _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
229 for (; number < oneEigthPoints; number++) {
/* Sign-extend 16 int8 values from each input to 16 int16 values. */
231 x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
232 y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
/* madd sums adjacent int16 products into int32:
 * aReal*bReal + aImag*bImag, the real part of a * conj(b). */
235 realz = _mm256_madd_epi16(x, y);
/* Negate the imaginary slots of b: (bReal, -bImag). */
238 y = _mm256_sign_epi16(y, conjugateSign);
/* Swap each adjacent int16 pair: (-bImag, bReal). */
241 y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
242 _MM_SHUFFLE(2, 3, 0, 1));
/* aImag*bReal - aReal*bImag, the imaginary part of a * conj(b). */
245 imagz = _mm256_madd_epi16(x, y);
/* Interleave real/imag int32 results, then pack to int16 with signed
 * saturation before storing. */
249 _mm256_storeu_si256((__m256i*)c,
250 _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
251 _mm256_unpackhi_epi32(realz, imagz)));
/* Scalar tail: handle the points left over after the 8-point vector loop. */
258 number = oneEigthPoints * 8;
259 int16_t* c16Ptr = (int16_t*)&cVector[number];
260 int8_t* a8Ptr = (int8_t*)&aVector[number];
261 int8_t* b8Ptr = (int8_t*)&bVector[number];
262 for (; number < num_points; number++) {
/* Each point is two consecutive int8 values: real first, then imaginary. */
263 float aReal = (float)*a8Ptr++;
264 float aImag = (float)*a8Ptr++;
266 float bReal = (float)*b8Ptr++;
267 float bImag = (float)*b8Ptr++;
/* Clamp the real part to SHRT_MAX before narrowing to int16; only the upper
 * bound is checked.
 * NOTE(review): `temp` is defined on a line missing from this listing. */
271 *c16Ptr++ = (int16_t)(
lv_creal(temp) > SHRT_MAX ? SHRT_MAX :
lv_creal(temp));
/* The imaginary part is narrowed to int16 without any clamp. */
272 *c16Ptr++ = (int16_t)
lv_cimag(temp);
278#include <riscv_vector.h>
/*
 * RISC-V Vector kernel: loads each complex point as one 16-bit element,
 * splits it into its two bytes, computes the real and imaginary parts of
 * a * conj(b) in int16, and stores each result point as one 32-bit element.
 *
 * NOTE(review): this listing is incomplete. The aVector/bVector parameters,
 * the braces and the declaration of `vi` are not visible here.
 */
280static inline void volk_8ic_x2_multiply_conjugate_16ic_rvv(
lv_16sc_t* cVector,
283 unsigned int num_points)
285 size_t n = num_points;
/* Strip-mined loop: each pass handles vl points and advances all three
 * pointers by vl elements. */
286 for (
size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
287 vl = __riscv_vsetvl_e8m2(n);
/* Load vl 16-bit elements from each input through an int16 view; presumably
 * one element per complex point (two int8 components) - TODO confirm against
 * the parameter types, which are not visible here. */
288 vint16m4_t va = __riscv_vle16_v_i16m4((
const int16_t*)aVector, vl);
289 vint16m4_t vb = __riscv_vle16_v_i16m4((
const int16_t*)bVector, vl);
/* Narrowing shift by 0 keeps the low byte (var/vbr); by 8 keeps the high
 * byte (vai/vbi). */
290 vint8m2_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 8, vl);
291 vint8m2_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 8, vl);
/* Real part: var*vbr + vai*vbi, via widening multiply and multiply-accumulate
 * to int16.
 * NOTE(review): no saturation is applied here, whereas the scalar loops in
 * this file clamp the real part to SHRT_MAX - confirm whether the result can
 * differ when both inputs are (-128, -128). */
292 vint16m4_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
/* Imaginary part: vai*vbr - var*vbi. The declaration this expression
 * initializes is missing from the listing; presumably it is `vi`, used below. */
294 __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
295 vuint16m4_t vru = __riscv_vreinterpret_u16m4(vr);
296 vuint16m4_t viu = __riscv_vreinterpret_u16m4(vi);
/* (vru + viu) + 0xFFFF * viu == vru + 65536 * viu: real part in the low 16
 * bits and imaginary part in the high 16 bits of each 32-bit element. */
297 vuint32m8_t v = __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFF, viu, vl);
298 __riscv_vse32((uint32_t*)cVector, v, vl);
304#include <riscv_vector.h>
/*
 * RISC-V Vector kernel using segment loads/stores: a two-field segment load
 * de-interleaves each input into field 0 and field 1, the real and imaginary
 * parts of a * conj(b) are computed in int16, and a two-field segment store
 * re-interleaves them into the output.
 *
 * NOTE(review): this listing is incomplete. The aVector/bVector parameters,
 * the braces and the declaration of `vi` are not visible here.
 */
306static inline void volk_8ic_x2_multiply_conjugate_16ic_rvvseg(
lv_16sc_t* cVector,
309 unsigned int num_points)
311 size_t n = num_points;
/* Strip-mined loop: each pass handles vl points and advances all three
 * pointers by vl elements. */
312 for (
size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
313 vl = __riscv_vsetvl_e8m2(n);
/* Two-field int8 segment loads: field 0 and field 1 of each point are split
 * into separate registers. */
314 vint8m2x2_t va = __riscv_vlseg2e8_v_i8m2x2((
const int8_t*)aVector, vl);
315 vint8m2x2_t vb = __riscv_vlseg2e8_v_i8m2x2((
const int8_t*)bVector, vl);
/* Field 0 -> var/vbr, field 1 -> vai/vbi. */
316 vint8m2_t var = __riscv_vget_i8m2(va, 0), vai = __riscv_vget_i8m2(va, 1);
317 vint8m2_t vbr = __riscv_vget_i8m2(vb, 0), vbi = __riscv_vget_i8m2(vb, 1);
/* Real part: var*vbr + vai*vbi, via widening multiply and multiply-accumulate
 * to int16.
 * NOTE(review): no saturation is applied here, whereas the scalar loops in
 * this file clamp the real part to SHRT_MAX - confirm whether the result can
 * differ when both inputs are (-128, -128). */
318 vint16m4_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
/* Imaginary part: vai*vbr - var*vbi. The declaration this expression
 * initializes is missing from the listing; presumably it is `vi`, used below. */
320 __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
/* Two-field int16 segment store: writes vr and vi interleaved per point. */
321 __riscv_vsseg2e16_v_i16m4x2(
322 (int16_t*)cVector, __riscv_vcreate_v_i16m4x2(vr, vi), vl);
static void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t *cVector, const lv_8sc_t *aVector, const lv_8sc_t *bVector, unsigned int num_points)
Multiplies one complex vector by the complex conjugate of the second complex vector and stores t...
Definition volk_8ic_x2_multiply_conjugate_16ic.h:171
#define lv_cimag(x)
Definition volk_complex.h:98
#define lv_cmake(r, i)
Definition volk_complex.h:77
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition volk_complex.h:70
#define lv_creal(x)
Definition volk_complex.h:96
float complex lv_32fc_t
Definition volk_complex.h:74
short complex lv_16sc_t
Definition volk_complex.h:71