#ifndef INCLUDED_volk_16ic_x2_multiply_16ic_H
#define INCLUDED_volk_16ic_x2_multiply_16ic_H

#include <volk/volk_common.h>
#include <volk/volk_complex.h>
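/*!
 * \b Overview
 *
 * Multiplies two vectors of 16-bit integer complex samples (lv_16sc_t, a
 * 16-bit real part followed by a 16-bit imaginary part) element by element
 * and writes the products to the output vector.
 *
 * <b>Dispatcher Prototype</b>
 * \code
 * void volk_16ic_x2_multiply_16ic(lv_16sc_t* result, const lv_16sc_t* in_a,
 *                                 const lv_16sc_t* in_b, unsigned int num_points);
 * \endcode
 *
 * \b Example
 * A minimal usage sketch; the vector length and buffer handling are
 * illustrative, assuming the standard VOLK helpers volk_malloc(),
 * volk_get_alignment() and volk_free():
 * \code
 * unsigned int N = 1024;
 * size_t alignment = volk_get_alignment();
 * lv_16sc_t* in_a = (lv_16sc_t*)volk_malloc(N * sizeof(lv_16sc_t), alignment);
 * lv_16sc_t* in_b = (lv_16sc_t*)volk_malloc(N * sizeof(lv_16sc_t), alignment);
 * lv_16sc_t* out = (lv_16sc_t*)volk_malloc(N * sizeof(lv_16sc_t), alignment);
 * // ... fill in_a and in_b with samples ...
 * volk_16ic_x2_multiply_16ic(out, in_a, in_b, N);
 * volk_free(in_a);
 * volk_free(in_b);
 * volk_free(out);
 * \endcode
 */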
#ifdef LV_HAVE_GENERIC

static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result,
                                                      const lv_16sc_t* in_a,
                                                      const lv_16sc_t* in_b,
                                                      unsigned int num_points)
{
    unsigned int n;
    for (n = 0; n < num_points; n++) {
        result[n] = in_a[n] * in_b[n];
    }
}

#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;
    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
        result;

    mask_imag = _mm_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    mask_real = _mm_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    for (number = 0; number < sse_iters; number++) {
        // load (2 byte imag, 2 byte real) x 4 complex samples into each register
        a = _mm_load_si128((__m128i*)_in_a);
        b = _mm_load_si128((__m128i*)_in_b);
        c = _mm_mullo_epi16(a, b); // a.r*b.r and a.i*b.i in adjacent 16-bit lanes

        c_sr = _mm_srli_si128(c, 2); // shift right by one 16-bit lane, shifting in zeros
        real = _mm_subs_epi16(c, c_sr);
        real = _mm_and_si128(real, mask_real); // keep a.r*b.r - a.i*b.i in the real lanes

        b_sl = _mm_slli_si128(b, 2);
        a_sl = _mm_slli_si128(a, 2);

        imag1 = _mm_mullo_epi16(a, b_sl); // a.i*b.r, ...
        imag2 = _mm_mullo_epi16(b, a_sl); // b.i*a.r, ...

        imag = _mm_adds_epi16(imag1, imag2);   // saturating add
        imag = _mm_and_si128(imag, mask_imag); // keep a.i*b.r + b.i*a.r in the imag lanes

        result = _mm_or_si128(real, imag);

        _mm_store_si128((__m128i*)_out, result);

        _in_a += 4;
        _in_b += 4;
        _out += 4;
    }

    for (number = sse_iters * 4; number < num_points; ++number) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_SSE2 */

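/*
 * Note on the interleaved SSE2/AVX2 kernels: every lv_16sc_t occupies two
 * adjacent 16-bit lanes (real, then imag). _mm_mullo_epi16(a, b) therefore
 * produces a.r*b.r and a.i*b.i side by side; shifting the product register by
 * one lane and subtracting leaves a.r*b.r - a.i*b.i in the real lanes, while
 * multiplying against the lane-shifted operands and adding gives
 * a.r*b.i + a.i*b.r in the imaginary lanes. The two byte masks keep only the
 * wanted lane of each intermediate before the final OR recombines them,
 * implementing
 * (a.r + j*a.i) * (b.r + j*b.i) = (a.r*b.r - a.i*b.i) + j*(a.r*b.i + a.i*b.r).
 */
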
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;
    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
        result;

    mask_imag = _mm_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    mask_real = _mm_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    for (number = 0; number < sse_iters; number++) {
        a = _mm_loadu_si128((__m128i*)_in_a); // unaligned load of 4 complex samples
        b = _mm_loadu_si128((__m128i*)_in_b);
        c = _mm_mullo_epi16(a, b);

        c_sr = _mm_srli_si128(c, 2);
        real = _mm_subs_epi16(c, c_sr);
        real = _mm_and_si128(real, mask_real);

        b_sl = _mm_slli_si128(b, 2);
        a_sl = _mm_slli_si128(a, 2);

        imag1 = _mm_mullo_epi16(a, b_sl);
        imag2 = _mm_mullo_epi16(b, a_sl);

        imag = _mm_adds_epi16(imag1, imag2);
        imag = _mm_and_si128(imag, mask_imag);

        result = _mm_or_si128(real, imag);

        _mm_storeu_si128((__m128i*)_out, result);

        _in_a += 4;
        _in_b += 4;
        _out += 4;
    }

    for (number = sse_iters * 4; number < num_points; ++number) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_SSE2 */

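/*
 * The u_ (unaligned) kernels differ from their a_ (aligned) counterparts only
 * in the memory intrinsics used (_mm_loadu_si128/_mm_storeu_si128 and
 * _mm256_loadu_si256/_mm256_storeu_si256 instead of the aligned variants), so
 * they accept buffers that are not aligned to the vector width.
 */
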
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int avx2_points = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;

    const __m256i mask_imag = _mm256_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    const __m256i mask_real = _mm256_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    for (; number < avx2_points; number++) {
        // unaligned load of (2 byte imag, 2 byte real) x 8 complex samples
        a = _mm256_loadu_si256((__m256i*)_in_a);
        b = _mm256_loadu_si256((__m256i*)_in_b);
        c = _mm256_mullo_epi16(a, b);

        c_sr = _mm256_srli_si256(c, 2); // byte shift within each 128-bit lane
        real = _mm256_subs_epi16(c, c_sr);
        real = _mm256_and_si256(real, mask_real);

        b_sl = _mm256_slli_si256(b, 2);
        a_sl = _mm256_slli_si256(a, 2);

        imag1 = _mm256_mullo_epi16(a, b_sl);
        imag2 = _mm256_mullo_epi16(b, a_sl);

        imag = _mm256_adds_epi16(imag1, imag2);
        imag = _mm256_and_si256(imag, mask_imag);

        result = _mm256_or_si256(real, imag);

        _mm256_storeu_si256((__m256i*)_out, result);

        _in_a += 8;
        _in_b += 8;
        _out += 8;
    }

    number = avx2_points * 8;
    for (; number < num_points; number++) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_AVX2 */

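/*
 * Note: _mm256_srli_si256 and _mm256_slli_si256 shift bytes independently
 * within each 128-bit lane of the 256-bit register. Each complex sample lies
 * entirely inside one such lane, so the per-lane shift keeps every real/imag
 * pair together, exactly as the masking scheme above requires.
 */
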
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int avx2_points = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;

    const __m256i mask_imag = _mm256_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    const __m256i mask_real = _mm256_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    for (; number < avx2_points; number++) {
        // aligned load of (2 byte imag, 2 byte real) x 8 complex samples
        a = _mm256_load_si256((__m256i*)_in_a);
        b = _mm256_load_si256((__m256i*)_in_b);
        c = _mm256_mullo_epi16(a, b);

        c_sr = _mm256_srli_si256(c, 2); // byte shift within each 128-bit lane
        real = _mm256_subs_epi16(c, c_sr);
        real = _mm256_and_si256(real, mask_real);

        b_sl = _mm256_slli_si256(b, 2);
        a_sl = _mm256_slli_si256(a, 2);

        imag1 = _mm256_mullo_epi16(a, b_sl);
        imag2 = _mm256_mullo_epi16(b, a_sl);

        imag = _mm256_adds_epi16(imag1, imag2);
        imag = _mm256_and_si256(imag, mask_imag);

        result = _mm256_or_si256(real, imag);

        _mm256_store_si256((__m256i*)_out, result);

        _in_a += 8;
        _in_b += 8;
        _out += 8;
    }

    number = avx2_points * 8;
    for (; number < num_points; number++) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out,
                                                   const lv_16sc_t* in_a,
                                                   const lv_16sc_t* in_b,
                                                   unsigned int num_points)
{
    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    unsigned int quarter_points = num_points / 4;
    int16x4x2_t a_val, b_val, c_val;
    int16x4x2_t tmp_real, tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2_s16((int16_t*)a_ptr); // deinterleave: val[0] holds reals, val[1] imags
        b_val = vld2_s16((int16_t*)b_ptr);
        __VOLK_PREFETCH(a_ptr + 4);
        __VOLK_PREFETCH(b_ptr + 4);

        // multiply real*real and imag*imag to get the real result
        // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
        tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
        // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
        tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);

        // multiply the cross terms to get the imaginary result
        // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
        tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
        // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
        tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);

        // combine and store the results
        c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
        vst2_s16((int16_t*)out, c_val);

        a_ptr += 4;
        b_ptr += 4;
        out += 4;
    }

    for (number = quarter_points * 4; number < num_points; number++) {
        *out++ = (*a_ptr++) * (*b_ptr++);
    }
}
#endif /* LV_HAVE_NEON */

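/*
 * Note: unlike the SSE2/AVX2 kernels, the NEON kernel deinterleaves on load
 * (vld2_s16 splits real and imaginary parts into separate vectors), forms the
 * four partial products directly, combines them into c_val, and re-interleaves
 * on store with vst2_s16.
 */
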
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

static inline void volk_16ic_x2_multiply_16ic_rvv(lv_16sc_t* result,
                                                  const lv_16sc_t* in_a,
                                                  const lv_16sc_t* in_b,
                                                  unsigned int num_points)
{
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, in_a += vl, in_b += vl, result += vl) {
        vl = __riscv_vsetvl_e16m4(n);
        // load each complex sample as a single 32-bit element
        vint32m8_t va = __riscv_vle32_v_i32m8((const int32_t*)in_a, vl);
        vint32m8_t vb = __riscv_vle32_v_i32m8((const int32_t*)in_b, vl);
        // narrowing shifts split each element into its real (low) and imag (high) halves
        vint16m4_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 16, vl);
        vint16m4_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 16, vl);
        vint16m4_t vr = __riscv_vnmsac(__riscv_vmul(var, vbr, vl), vai, vbi, vl);
        vint16m4_t vi = __riscv_vmacc(__riscv_vmul(var, vbi, vl), vai, vbr, vl);
        vuint16m4_t vru = __riscv_vreinterpret_u16m4(vr);
        vuint16m4_t viu = __riscv_vreinterpret_u16m4(vi);
        vuint32m8_t v =
            __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFF, viu, vl);
        __riscv_vse32((uint32_t*)result, v, vl);
    }
}
#endif /* LV_HAVE_RVV */

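/*
 * Note: after computing the real and imaginary products, the RVV kernel
 * rebuilds the interleaved 32-bit layout with vwaddu_vv followed by vwmaccu
 * using the constant 0xFFFF, since (re + im) + 0xFFFF * im == re + (im << 16).
 */
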
#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>

static inline void volk_16ic_x2_multiply_16ic_rvvseg(lv_16sc_t* result,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, in_a += vl, in_b += vl, result += vl) {
        vl = __riscv_vsetvl_e16m4(n);
        // segmented loads deinterleave real and imaginary parts directly
        vint16m4x2_t va = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)in_a, vl);
        vint16m4x2_t vb = __riscv_vlseg2e16_v_i16m4x2((const int16_t*)in_b, vl);
        vint16m4_t var = __riscv_vget_i16m4(va, 0), vai = __riscv_vget_i16m4(va, 1);
        vint16m4_t vbr = __riscv_vget_i16m4(vb, 0), vbi = __riscv_vget_i16m4(vb, 1);
        vint16m4_t vr = __riscv_vnmsac(__riscv_vmul(var, vbr, vl), vai, vbi, vl);
        vint16m4_t vi = __riscv_vmacc(__riscv_vmul(var, vbi, vl), vai, vbr, vl);
        __riscv_vsseg2e16_v_i16m4x2(
            (int16_t*)result, __riscv_vcreate_v_i16m4x2(vr, vi), vl);
    }
}
#endif /* LV_HAVE_RVVSEG */

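/*
 * Note: the rvvseg variant is functionally identical to the rvv kernel above
 * but uses segmented loads/stores (vlseg2e16/vsseg2e16) to deinterleave and
 * re-interleave the real and imaginary parts, instead of narrowing shifts and
 * widening arithmetic.
 */
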
#endif /* INCLUDED_volk_16ic_x2_multiply_16ic_H */