44#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
45#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
/*
 * Aligned AVX2 variant. For every complex point it forms, from the 8-bit
 * lanes loaded through a and b, the sums x0*y0 + x1*y1 and x1*y0 - x0*y1
 * (i.e. a multiplied by the conjugate of b when lane 0 is the real part and
 * lane 1 the imaginary part), scales both by 1/scalar and stores them as
 * interleaved floats.
 *
 * cVector:    output buffer; the tail loop writes it through a float pointer.
 * num_points: number of complex points. The vector loop runs num_points / 8
 *             times; the scalar tail loop handles the remaining points.
 *
 * NOTE(review): this listing is incomplete. The return type, the other
 * parameters (the body reads aVector, bVector and scalar), the braces, the
 * declarations of a, b, c and temp, and any pointer increments inside the
 * vector loop are not visible here -- confirm against the full header.
 */
55volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(
lv_32fc_t* cVector,
59 unsigned int num_points)
61 unsigned int number = 0;
// Number of full 8-point groups handled by the vector loop.
62 const unsigned int oneEigthPoints = num_points / 8;
64 __m256i x, y, realz, imagz;
65 __m256 ret, retlo, rethi;
// Per-16-bit-lane sign pattern. _mm256_set_epi16 lists the highest lane first,
// so even lanes get +1 and odd lanes get -1.
69 __m256i conjugateSign =
70 _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
// Broadcast reciprocal so the vector loop multiplies instead of dividing.
72 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
74 for (; number < oneEigthPoints; number++) {
// Aligned 16-byte loads, sign-extended from 8-bit to 16-bit lanes.
76 x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
77 y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
// madd multiplies adjacent 16-bit lanes and adds each pair: x0*y0 + x1*y1.
80 realz = _mm256_madd_epi16(x, y);
// Negate the odd 16-bit lanes of y.
83 y = _mm256_sign_epi16(y, conjugateSign);
// Swap the two 16-bit lanes inside every 32-bit pair.
86 y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
87 _MM_SHUFFLE(2, 3, 0, 1));
// With y negated and swapped, madd now yields x1*y0 - x0*y1.
90 imagz = _mm256_madd_epi16(x, y);
// Interleave realz/imagz (low half of each 128-bit lane) and convert to float.
93 retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
// Apply the 1/scalar factor.
96 retlo = _mm256_mul_ps(retlo, invScalar);
// Same for the high half of each 128-bit lane.
99 rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
102 rethi = _mm256_mul_ps(rethi, invScalar);
// The unpacks work per 128-bit lane, so the permutes put the points back in
// order: first the low 128-bit lanes of retlo and rethi ...
104 ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
105 _mm256_store_ps((
float*)c, ret);
// ... then their high 128-bit lanes.
// NOTE(review): no advance of c is visible between the two aligned stores;
// presumably dropped from this listing -- confirm.
108 ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
109 _mm256_store_ps((
float*)c, ret);
// Scalar tail: resume at the first point the vector loop did not cover.
116 number = oneEigthPoints * 8;
117 float* cFloatPtr = (
float*)&cVector[number];
118 int8_t* a8Ptr = (int8_t*)&aVector[number];
119 int8_t* b8Ptr = (int8_t*)&bVector[number];
120 for (; number < num_points; number++) {
// Read two consecutive bytes per point from each input.
121 float aReal = (float)*a8Ptr++;
122 float aImag = (float)*a8Ptr++;
124 float bReal = (float)*b8Ptr++;
125 float bImag = (float)*b8Ptr++;
// NOTE(review): the definition of temp is not visible in this listing.
// The tail divides by scalar, whereas the vector loop multiplies by 1/scalar.
129 *cFloatPtr++ =
lv_creal(temp) / scalar;
130 *cFloatPtr++ =
lv_cimag(temp) / scalar;
137#include <smmintrin.h>
/*
 * Aligned SSE4.1 variant. For every complex point it forms, from the 8-bit
 * lanes loaded through a and b, the sums x0*y0 + x1*y1 and x1*y0 - x0*y1,
 * scales both by 1/scalar and stores them as interleaved floats.
 *
 * cVector:    output buffer; the tail loop writes it through a float pointer.
 * num_points: number of complex points. The vector loop runs num_points / 4
 *             times; the scalar tail loop handles the remaining points.
 *
 * NOTE(review): this listing is incomplete. The return type, the other
 * parameters (the body reads aVector, bVector and scalar), the braces, the
 * declarations of ret, a, b, c and temp, and any pointer increments inside
 * the vector loop are not visible here -- confirm against the full header.
 */
140volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(
lv_32fc_t* cVector,
144 unsigned int num_points)
146 unsigned int number = 0;
// Number of full 4-point groups handled by the vector loop.
147 const unsigned int quarterPoints = num_points / 4;
149 __m128i x, y, realz, imagz;
// _mm_set_epi16 lists the highest lane first: even lanes +1, odd lanes -1.
154 __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
// Broadcast reciprocal so the vector loop multiplies instead of dividing.
156 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
158 for (; number < quarterPoints; number++) {
// Load 8 bytes from each input and sign-extend them to eight 16-bit lanes.
160 x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
161 y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
// madd multiplies adjacent 16-bit lanes and adds each pair: x0*y0 + x1*y1.
164 realz = _mm_madd_epi16(x, y);
// Negate the odd 16-bit lanes of y.
167 y = _mm_sign_epi16(y, conjugateSign);
// Swap the two 16-bit lanes inside every 32-bit pair.
170 y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
171 _MM_SHUFFLE(2, 3, 0, 1));
// With y negated and swapped, madd now yields x1*y0 - x0*y1.
174 imagz = _mm_madd_epi16(x, y);
// Interleave the low two realz/imagz results and convert them to float.
177 ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
// Apply the 1/scalar factor, then store with an aligned store.
180 ret = _mm_mul_ps(ret, invScalar);
183 _mm_store_ps((
float*)c, ret);
// Same for the high two results.
// NOTE(review): no advance of c is visible between the two aligned stores;
// presumably dropped from this listing -- confirm.
187 ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
190 ret = _mm_mul_ps(ret, invScalar);
193 _mm_store_ps((
float*)c, ret);
// Scalar tail: resume at the first point the vector loop did not cover.
200 number = quarterPoints * 4;
201 float* cFloatPtr = (
float*)&cVector[number];
202 int8_t* a8Ptr = (int8_t*)&aVector[number];
203 int8_t* b8Ptr = (int8_t*)&bVector[number];
204 for (; number < num_points; number++) {
// Read two consecutive bytes per point from each input.
205 float aReal = (float)*a8Ptr++;
206 float aImag = (float)*a8Ptr++;
208 float bReal = (float)*b8Ptr++;
209 float bImag = (float)*b8Ptr++;
// NOTE(review): the definition of temp is not visible in this listing.
// The tail divides by scalar, whereas the vector loop multiplies by 1/scalar.
213 *cFloatPtr++ =
lv_creal(temp) / scalar;
214 *cFloatPtr++ =
lv_cimag(temp) / scalar;
/*
 * Portable scalar variant, compiled only when LV_HAVE_GENERIC is defined.
 * For each of num_points points it reads a real/imaginary byte pair from each
 * input and writes the real and imaginary parts of temp, multiplied by
 * 1/scalar, as two consecutive floats into cVector.
 *
 * NOTE(review): this listing is incomplete. The function name, return type,
 * the parameters other than num_points (the body reads cVector, aVector,
 * bVector and scalar), the braces, and the definition of temp are not
 * visible here -- confirm against the full header.
 */
220#ifdef LV_HAVE_GENERIC
227 unsigned int num_points)
229 unsigned int number = 0;
// View the complex output as a flat sequence of floats.
230 float* cPtr = (
float*)cVector;
// Reciprocal computed once so the loop multiplies instead of dividing.
231 const float invScalar = 1.0 / scalar;
// Walk both inputs one byte at a time.
232 int8_t* a8Ptr = (int8_t*)aVector;
233 int8_t* b8Ptr = (int8_t*)bVector;
234 for (number = 0; number < num_points; number++) {
// Two consecutive bytes per point from each input.
235 float aReal = (float)*a8Ptr++;
236 float aImag = (float)*a8Ptr++;
238 float bReal = (float)*b8Ptr++;
239 float bImag = (float)*b8Ptr++;
// NOTE(review): temp is defined on a line missing from this listing.
243 *cPtr++ = (
lv_creal(temp) * invScalar);
244 *cPtr++ = (
lv_cimag(temp) * invScalar);
252#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
253#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
260#include <immintrin.h>
/*
 * Unaligned AVX2 variant: uses _mm_loadu_si128 / _mm256_storeu_ps, so the
 * vector loop does not require aligned buffers. For every complex point it
 * forms, from the 8-bit lanes loaded through a and b, the sums x0*y0 + x1*y1
 * and x1*y0 - x0*y1, scales both by 1/scalar and stores them as interleaved
 * floats.
 *
 * cVector:    output buffer; the tail loop writes it through a float pointer.
 * num_points: number of complex points. The vector loop runs num_points / 8
 *             times; the scalar tail loop handles the remaining points.
 *
 * NOTE(review): this listing is incomplete. The return type, the other
 * parameters (the body reads aVector, bVector and scalar), the braces, the
 * declarations of a, b, c and temp, and any pointer increments inside the
 * vector loop are not visible here -- confirm against the full header.
 */
263volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(
lv_32fc_t* cVector,
267 unsigned int num_points)
269 unsigned int number = 0;
// Number of full 8-point groups handled by the vector loop.
270 const unsigned int oneEigthPoints = num_points / 8;
272 __m256i x, y, realz, imagz;
273 __m256 ret, retlo, rethi;
// Per-16-bit-lane sign pattern. _mm256_set_epi16 lists the highest lane first,
// so even lanes get +1 and odd lanes get -1.
277 __m256i conjugateSign =
278 _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
// Broadcast reciprocal so the vector loop multiplies instead of dividing.
280 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
282 for (; number < oneEigthPoints; number++) {
// Unaligned 16-byte loads, sign-extended from 8-bit to 16-bit lanes.
284 x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
285 y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
// madd multiplies adjacent 16-bit lanes and adds each pair: x0*y0 + x1*y1.
288 realz = _mm256_madd_epi16(x, y);
// Negate the odd 16-bit lanes of y.
291 y = _mm256_sign_epi16(y, conjugateSign);
// Swap the two 16-bit lanes inside every 32-bit pair.
294 y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
295 _MM_SHUFFLE(2, 3, 0, 1));
// With y negated and swapped, madd now yields x1*y0 - x0*y1.
298 imagz = _mm256_madd_epi16(x, y);
// Interleave realz/imagz (low half of each 128-bit lane) and convert to float.
301 retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
// Apply the 1/scalar factor.
304 retlo = _mm256_mul_ps(retlo, invScalar);
// Same for the high half of each 128-bit lane.
307 rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
310 rethi = _mm256_mul_ps(rethi, invScalar);
// The unpacks work per 128-bit lane, so the permutes put the points back in
// order: first the low 128-bit lanes of retlo and rethi ...
312 ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
313 _mm256_storeu_ps((
float*)c, ret);
// ... then their high 128-bit lanes.
// NOTE(review): no advance of c is visible between the two stores;
// presumably dropped from this listing -- confirm.
316 ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
317 _mm256_storeu_ps((
float*)c, ret);
// Scalar tail: resume at the first point the vector loop did not cover.
324 number = oneEigthPoints * 8;
325 float* cFloatPtr = (
float*)&cVector[number];
326 int8_t* a8Ptr = (int8_t*)&aVector[number];
327 int8_t* b8Ptr = (int8_t*)&bVector[number];
328 for (; number < num_points; number++) {
// Read two consecutive bytes per point from each input.
329 float aReal = (float)*a8Ptr++;
330 float aImag = (float)*a8Ptr++;
332 float bReal = (float)*b8Ptr++;
333 float bImag = (float)*b8Ptr++;
// NOTE(review): the definition of temp is not visible in this listing.
// The tail divides by scalar, whereas the vector loop multiplies by 1/scalar.
337 *cFloatPtr++ =
lv_creal(temp) / scalar;
338 *cFloatPtr++ =
lv_cimag(temp) / scalar;
345#include <riscv_vector.h>
/*
 * RISC-V Vector variant. Each loop pass processes vl points: it loads the
 * inputs as 16-bit elements, splits each into its low and high byte, forms
 * the widened sums var*vbr + vai*vbi and vai*vbr - var*vbi, converts them to
 * float scaled by 1/scalar, and stores them packed into 64-bit elements.
 *
 * cVector:    output buffer, advanced by vl per pass.
 * num_points: number of points to process.
 *
 * NOTE(review): this listing is incomplete. The other parameters (the body
 * reads aVector, bVector and scalar), the braces, and the left-hand sides
 * that declare vi and v are not visible here -- confirm against the full
 * header.
 */
347static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_rvv(
lv_32fc_t* cVector,
351 unsigned int num_points)
353 size_t n = num_points;
// Strip-mined loop: n counts remaining points; all three pointers move by vl.
354 for (
size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
// Ask for up to n elements at 8-bit element width, LMUL = 1.
355 vl = __riscv_vsetvl_e8m1(n);
// Load each input as one 16-bit element per point.
356 vint16m2_t va = __riscv_vle16_v_i16m2((
const int16_t*)aVector, vl);
357 vint16m2_t vb = __riscv_vle16_v_i16m2((
const int16_t*)bVector, vl);
// Narrowing arithmetic shifts: shift 0 keeps the low byte, shift 8 the high byte.
358 vint8m1_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 8, vl);
359 vint8m1_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 8, vl);
// Widening multiply-accumulate: var*vbr + vai*vbi as 16-bit values.
360 vint16m2_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
// vai*vbr - var*vbi.
// NOTE(review): the declaration this expression initialises is missing from
// the listing; presumably vi, which is used below -- confirm.
362 __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
// Widening convert to 32-bit float, then scale by 1/scalar.
363 vfloat32m4_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0 / scalar, vl);
364 vfloat32m4_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0 / scalar, vl);
// Reinterpret the float bits as unsigned integers for the packing step.
365 vuint32m4_t vru = __riscv_vreinterpret_u32m4(vrf);
366 vuint32m4_t viu = __riscv_vreinterpret_u32m4(vif);
// (vru + viu) + 0xFFFFFFFF * viu == vru + (viu << 32) in 64 bits, so each
// 64-bit element carries vru in its low half and viu in its high half.
// NOTE(review): the declaration receiving this value is missing from the
// listing; presumably v, which is stored below -- confirm.
368 __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
// Store vl 64-bit elements to the output.
369 __riscv_vse64((uint64_t*)cVector, v, vl);
375#include <riscv_vector.h>
/*
 * RISC-V Vector variant using segment loads and stores. Each loop pass
 * processes vl points: a two-field segment load splits each input into two
 * byte vectors, the widened sums var*vbr + vai*vbi and vai*vbr - var*vbi are
 * formed, converted to float scaled by 1/scalar, and written with a
 * two-field segment store.
 *
 * cVector:    output buffer, advanced by vl per pass.
 * num_points: number of points to process.
 *
 * NOTE(review): this listing is incomplete. The return type, the other
 * parameters (the body reads aVector, bVector and scalar), the braces, and
 * the left-hand side that declares vi are not visible here -- confirm
 * against the full header.
 */
378volk_8ic_x2_s32f_multiply_conjugate_32fc_rvvseg(
lv_32fc_t* cVector,
382 unsigned int num_points)
384 size_t n = num_points;
// Strip-mined loop: n counts remaining points; all three pointers move by vl.
385 for (
size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
// Ask for up to n elements at 8-bit element width, LMUL = 1.
386 vl = __riscv_vsetvl_e8m1(n);
// Two-field segment loads: field 0 gets the first byte of each pair, field 1
// the second.
387 vint8m1x2_t va = __riscv_vlseg2e8_v_i8m1x2((
const int8_t*)aVector, vl);
388 vint8m1x2_t vb = __riscv_vlseg2e8_v_i8m1x2((
const int8_t*)bVector, vl);
// Pull the two fields out of each tuple.
389 vint8m1_t var = __riscv_vget_i8m1(va, 0), vai = __riscv_vget_i8m1(va, 1);
390 vint8m1_t vbr = __riscv_vget_i8m1(vb, 0), vbi = __riscv_vget_i8m1(vb, 1);
// Widening multiply-accumulate: var*vbr + vai*vbi as 16-bit values.
391 vint16m2_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
// vai*vbr - var*vbi.
// NOTE(review): the declaration this expression initialises is missing from
// the listing; presumably vi, which is used below -- confirm.
393 __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
// Widening convert to 32-bit float, then scale by 1/scalar.
394 vfloat32m4_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0 / scalar, vl);
395 vfloat32m4_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0 / scalar, vl);
// Two-field segment store: writes vrf[i], vif[i] as consecutive floats.
396 __riscv_vsseg2e32_v_f32m4x2(
397 (
float*)cVector, __riscv_vcreate_v_f32m4x2(vrf, vif), vl);
static void volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t *cVector, const lv_8sc_t *aVector, const lv_8sc_t *bVector, const float scalar, unsigned int num_points)
Definition volk_8ic_x2_s32f_multiply_conjugate_32fc.h:223
#define lv_cimag(x)
Definition volk_complex.h:98
#define lv_cmake(r, i)
Definition volk_complex.h:77
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition volk_complex.h:70
#define lv_creal(x)
Definition volk_complex.h:96
float complex lv_32fc_t
Definition volk_complex.h:74