Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_8ic_x2_s32f_multiply_conjugate_32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
43
44#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
45#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
46
47#include <inttypes.h>
48#include <stdio.h>
49#include <volk/volk_complex.h>
50
51#ifdef LV_HAVE_AVX2
52#include <immintrin.h>
53
54static inline void
55volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector,
56 const lv_8sc_t* aVector,
57 const lv_8sc_t* bVector,
58 const float scalar,
59 unsigned int num_points)
60{
61 unsigned int number = 0;
62 const unsigned int oneEigthPoints = num_points / 8;
63
64 __m256i x, y, realz, imagz;
65 __m256 ret, retlo, rethi;
66 lv_32fc_t* c = cVector;
67 const lv_8sc_t* a = aVector;
68 const lv_8sc_t* b = bVector;
69 __m256i conjugateSign =
70 _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
71
72 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
73
74 for (; number < oneEigthPoints; number++) {
75 // Convert 8 bit values into 16 bit values
76 x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
77 y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
78
79 // Calculate the ar*cr - ai*(-ci) portions
80 realz = _mm256_madd_epi16(x, y);
81
82 // Calculate the complex conjugate of the cr + ci j values
83 y = _mm256_sign_epi16(y, conjugateSign);
84
85 // Shift the order of the cr and ci values
86 y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
87 _MM_SHUFFLE(2, 3, 0, 1));
88
89 // Calculate the ar*(-ci) + cr*(ai)
90 imagz = _mm256_madd_epi16(x, y);
91
92 // Interleave real and imaginary and then convert to float values
93 retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
94
95 // Normalize the floating point values
96 retlo = _mm256_mul_ps(retlo, invScalar);
97
98 // Interleave real and imaginary and then convert to float values
99 rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
100
101 // Normalize the floating point values
102 rethi = _mm256_mul_ps(rethi, invScalar);
103
104 ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
105 _mm256_store_ps((float*)c, ret);
106 c += 4;
107
108 ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
109 _mm256_store_ps((float*)c, ret);
110 c += 4;
111
112 a += 8;
113 b += 8;
114 }
115
116 number = oneEigthPoints * 8;
117 float* cFloatPtr = (float*)&cVector[number];
118 int8_t* a8Ptr = (int8_t*)&aVector[number];
119 int8_t* b8Ptr = (int8_t*)&bVector[number];
120 for (; number < num_points; number++) {
121 float aReal = (float)*a8Ptr++;
122 float aImag = (float)*a8Ptr++;
123 lv_32fc_t aVal = lv_cmake(aReal, aImag);
124 float bReal = (float)*b8Ptr++;
125 float bImag = (float)*b8Ptr++;
126 lv_32fc_t bVal = lv_cmake(bReal, -bImag);
127 lv_32fc_t temp = aVal * bVal;
128
129 *cFloatPtr++ = lv_creal(temp) / scalar;
130 *cFloatPtr++ = lv_cimag(temp) / scalar;
131 }
132}
133#endif /* LV_HAVE_AVX2*/
134
135
136#ifdef LV_HAVE_SSE4_1
137#include <smmintrin.h>
138
139static inline void
140volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector,
141 const lv_8sc_t* aVector,
142 const lv_8sc_t* bVector,
143 const float scalar,
144 unsigned int num_points)
145{
146 unsigned int number = 0;
147 const unsigned int quarterPoints = num_points / 4;
148
149 __m128i x, y, realz, imagz;
150 __m128 ret;
151 lv_32fc_t* c = cVector;
152 const lv_8sc_t* a = aVector;
153 const lv_8sc_t* b = bVector;
154 __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
155
156 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
157
158 for (; number < quarterPoints; number++) {
159 // Convert into 8 bit values into 16 bit values
160 x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
161 y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
162
163 // Calculate the ar*cr - ai*(-ci) portions
164 realz = _mm_madd_epi16(x, y);
165
166 // Calculate the complex conjugate of the cr + ci j values
167 y = _mm_sign_epi16(y, conjugateSign);
168
169 // Shift the order of the cr and ci values
170 y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
171 _MM_SHUFFLE(2, 3, 0, 1));
172
173 // Calculate the ar*(-ci) + cr*(ai)
174 imagz = _mm_madd_epi16(x, y);
175
176 // Interleave real and imaginary and then convert to float values
177 ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
178
179 // Normalize the floating point values
180 ret = _mm_mul_ps(ret, invScalar);
181
182 // Store the floating point values
183 _mm_store_ps((float*)c, ret);
184 c += 2;
185
186 // Interleave real and imaginary and then convert to float values
187 ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
188
189 // Normalize the floating point values
190 ret = _mm_mul_ps(ret, invScalar);
191
192 // Store the floating point values
193 _mm_store_ps((float*)c, ret);
194 c += 2;
195
196 a += 4;
197 b += 4;
198 }
199
200 number = quarterPoints * 4;
201 float* cFloatPtr = (float*)&cVector[number];
202 int8_t* a8Ptr = (int8_t*)&aVector[number];
203 int8_t* b8Ptr = (int8_t*)&bVector[number];
204 for (; number < num_points; number++) {
205 float aReal = (float)*a8Ptr++;
206 float aImag = (float)*a8Ptr++;
207 lv_32fc_t aVal = lv_cmake(aReal, aImag);
208 float bReal = (float)*b8Ptr++;
209 float bImag = (float)*b8Ptr++;
210 lv_32fc_t bVal = lv_cmake(bReal, -bImag);
211 lv_32fc_t temp = aVal * bVal;
212
213 *cFloatPtr++ = lv_creal(temp) / scalar;
214 *cFloatPtr++ = lv_cimag(temp) / scalar;
215 }
216}
217#endif /* LV_HAVE_SSE4_1 */
218
219
220#ifdef LV_HAVE_GENERIC
221
222static inline void
224 const lv_8sc_t* aVector,
225 const lv_8sc_t* bVector,
226 const float scalar,
227 unsigned int num_points)
228{
229 unsigned int number = 0;
230 float* cPtr = (float*)cVector;
231 const float invScalar = 1.0 / scalar;
232 int8_t* a8Ptr = (int8_t*)aVector;
233 int8_t* b8Ptr = (int8_t*)bVector;
234 for (number = 0; number < num_points; number++) {
235 float aReal = (float)*a8Ptr++;
236 float aImag = (float)*a8Ptr++;
237 lv_32fc_t aVal = lv_cmake(aReal, aImag);
238 float bReal = (float)*b8Ptr++;
239 float bImag = (float)*b8Ptr++;
240 lv_32fc_t bVal = lv_cmake(bReal, -bImag);
241 lv_32fc_t temp = aVal * bVal;
242
243 *cPtr++ = (lv_creal(temp) * invScalar);
244 *cPtr++ = (lv_cimag(temp) * invScalar);
245 }
246}
247#endif /* LV_HAVE_GENERIC */
248
249
250#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H */
251
252#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
253#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
254
255#include <inttypes.h>
256#include <stdio.h>
257#include <volk/volk_complex.h>
258
259#ifdef LV_HAVE_AVX2
260#include <immintrin.h>
261
262static inline void
263volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector,
264 const lv_8sc_t* aVector,
265 const lv_8sc_t* bVector,
266 const float scalar,
267 unsigned int num_points)
268{
269 unsigned int number = 0;
270 const unsigned int oneEigthPoints = num_points / 8;
271
272 __m256i x, y, realz, imagz;
273 __m256 ret, retlo, rethi;
274 lv_32fc_t* c = cVector;
275 const lv_8sc_t* a = aVector;
276 const lv_8sc_t* b = bVector;
277 __m256i conjugateSign =
278 _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
279
280 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
281
282 for (; number < oneEigthPoints; number++) {
283 // Convert 8 bit values into 16 bit values
284 x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
285 y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
286
287 // Calculate the ar*cr - ai*(-ci) portions
288 realz = _mm256_madd_epi16(x, y);
289
290 // Calculate the complex conjugate of the cr + ci j values
291 y = _mm256_sign_epi16(y, conjugateSign);
292
293 // Shift the order of the cr and ci values
294 y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
295 _MM_SHUFFLE(2, 3, 0, 1));
296
297 // Calculate the ar*(-ci) + cr*(ai)
298 imagz = _mm256_madd_epi16(x, y);
299
300 // Interleave real and imaginary and then convert to float values
301 retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
302
303 // Normalize the floating point values
304 retlo = _mm256_mul_ps(retlo, invScalar);
305
306 // Interleave real and imaginary and then convert to float values
307 rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
308
309 // Normalize the floating point values
310 rethi = _mm256_mul_ps(rethi, invScalar);
311
312 ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
313 _mm256_storeu_ps((float*)c, ret);
314 c += 4;
315
316 ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
317 _mm256_storeu_ps((float*)c, ret);
318 c += 4;
319
320 a += 8;
321 b += 8;
322 }
323
324 number = oneEigthPoints * 8;
325 float* cFloatPtr = (float*)&cVector[number];
326 int8_t* a8Ptr = (int8_t*)&aVector[number];
327 int8_t* b8Ptr = (int8_t*)&bVector[number];
328 for (; number < num_points; number++) {
329 float aReal = (float)*a8Ptr++;
330 float aImag = (float)*a8Ptr++;
331 lv_32fc_t aVal = lv_cmake(aReal, aImag);
332 float bReal = (float)*b8Ptr++;
333 float bImag = (float)*b8Ptr++;
334 lv_32fc_t bVal = lv_cmake(bReal, -bImag);
335 lv_32fc_t temp = aVal * bVal;
336
337 *cFloatPtr++ = lv_creal(temp) / scalar;
338 *cFloatPtr++ = lv_cimag(temp) / scalar;
339 }
340}
341#endif /* LV_HAVE_AVX2*/
342
343
344#ifdef LV_HAVE_RVV
345#include <riscv_vector.h>
346
347static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_rvv(lv_32fc_t* cVector,
348 const lv_8sc_t* aVector,
349 const lv_8sc_t* bVector,
350 const float scalar,
351 unsigned int num_points)
352{
353 size_t n = num_points;
354 for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
355 vl = __riscv_vsetvl_e8m1(n);
356 vint16m2_t va = __riscv_vle16_v_i16m2((const int16_t*)aVector, vl);
357 vint16m2_t vb = __riscv_vle16_v_i16m2((const int16_t*)bVector, vl);
358 vint8m1_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 8, vl);
359 vint8m1_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 8, vl);
360 vint16m2_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
361 vint16m2_t vi =
362 __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
363 vfloat32m4_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0 / scalar, vl);
364 vfloat32m4_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0 / scalar, vl);
365 vuint32m4_t vru = __riscv_vreinterpret_u32m4(vrf);
366 vuint32m4_t viu = __riscv_vreinterpret_u32m4(vif);
367 vuint64m8_t v =
368 __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
369 __riscv_vse64((uint64_t*)cVector, v, vl);
370 }
371}
372#endif /*LV_HAVE_RVV*/
373
374#ifdef LV_HAVE_RVVSEG
375#include <riscv_vector.h>
376
377static inline void
378volk_8ic_x2_s32f_multiply_conjugate_32fc_rvvseg(lv_32fc_t* cVector,
379 const lv_8sc_t* aVector,
380 const lv_8sc_t* bVector,
381 const float scalar,
382 unsigned int num_points)
383{
384 size_t n = num_points;
385 for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
386 vl = __riscv_vsetvl_e8m1(n);
387 vint8m1x2_t va = __riscv_vlseg2e8_v_i8m1x2((const int8_t*)aVector, vl);
388 vint8m1x2_t vb = __riscv_vlseg2e8_v_i8m1x2((const int8_t*)bVector, vl);
389 vint8m1_t var = __riscv_vget_i8m1(va, 0), vai = __riscv_vget_i8m1(va, 1);
390 vint8m1_t vbr = __riscv_vget_i8m1(vb, 0), vbi = __riscv_vget_i8m1(vb, 1);
391 vint16m2_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
392 vint16m2_t vi =
393 __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
394 vfloat32m4_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0 / scalar, vl);
395 vfloat32m4_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0 / scalar, vl);
396 __riscv_vsseg2e32_v_f32m4x2(
397 (float*)cVector, __riscv_vcreate_v_f32m4x2(vrf, vif), vl);
398 }
399}
400
401#endif /*LV_HAVE_RVVSEG*/
402
403#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */
static void volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t *cVector, const lv_8sc_t *aVector, const lv_8sc_t *bVector, const float scalar, unsigned int num_points)
Definition volk_8ic_x2_s32f_multiply_conjugate_32fc.h:223
#define lv_cimag(x)
Definition volk_complex.h:98
#define lv_cmake(r, i)
Definition volk_complex.h:77
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition volk_complex.h:70
#define lv_creal(x)
Definition volk_complex.h:96
float complex lv_32fc_t
Definition volk_complex.h:74