volk_64f_x2_multiply_64f.h
/* -*- c++ -*- */
/*
 * Copyright 2018 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */
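
/*
 * Overview (summary sketch; the original documentation block is elided
 * here):
 *
 * Multiplies two input double-precision vectors element by element and
 * stores the result in the output vector:
 *
 *   cVector[i] = aVector[i] * bVector[i],  i = 0 ... num_points - 1
 *
 * Dispatcher prototype (the standard VOLK dispatcher for this kernel):
 *
 *   void volk_64f_x2_multiply_64f(double* cVector,
 *                                 const double* aVector,
 *                                 const double* bVector,
 *                                 unsigned int num_points);
 */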

#ifndef INCLUDED_volk_64f_x2_multiply_64f_H
#define INCLUDED_volk_64f_x2_multiply_64f_H

#include <inttypes.h>

#ifdef LV_HAVE_GENERIC

static inline void volk_64f_x2_multiply_64f_generic(double* cVector,
                                                    const double* aVector,
                                                    const double* bVector,
                                                    unsigned int num_points)
{
    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}

#endif /* LV_HAVE_GENERIC */
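
/*
 * Usage sketch (illustrative only; assumes the standard VOLK dispatcher and
 * the volk_malloc()/volk_free() helpers from <volk/volk.h>):
 *
 *   unsigned int N = 1000;
 *   size_t alignment = volk_get_alignment();
 *   double* a = (double*)volk_malloc(sizeof(double) * N, alignment);
 *   double* b = (double*)volk_malloc(sizeof(double) * N, alignment);
 *   double* c = (double*)volk_malloc(sizeof(double) * N, alignment);
 *
 *   for (unsigned int i = 0; i < N; i++) {
 *       a[i] = (double)i;
 *       b[i] = 0.5;
 *   }
 *
 *   volk_64f_x2_multiply_64f(c, a, b, N); // dispatcher picks the fastest kernel
 *
 *   volk_free(a);
 *   volk_free(b);
 *   volk_free(c);
 */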

/*
 * Unaligned versions
 */
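
/*
 * The _u_ kernels use unaligned loads and stores (_mm_loadu_pd,
 * _mm256_loadu_pd), so the input and output buffers may start at any
 * address; the dispatcher selects these when the data are not suitably
 * aligned for the _a_ kernels below.
 */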

#ifdef LV_HAVE_SSE2

#include <emmintrin.h>

static inline void volk_64f_x2_multiply_64f_u_sse2(double* cVector,
                                                   const double* aVector,
                                                   const double* bVector,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int half_points = num_points / 2;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    __m128d aVal, bVal, cVal;
    for (; number < half_points; number++) {
        aVal = _mm_loadu_pd(aPtr);
        bVal = _mm_loadu_pd(bPtr);

        cVal = _mm_mul_pd(aVal, bVal);

        _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container

        aPtr += 2;
        bPtr += 2;
        cPtr += 2;
    }

    number = half_points * 2;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}

#endif /* LV_HAVE_SSE2 */
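
/*
 * Note the tail handling above: the SIMD loop consumes two doubles per
 * iteration, then `number = half_points * 2` re-synchronizes the scalar
 * loop so an odd final element is still processed. The AVX kernels use the
 * same pattern with four doubles per iteration.
 */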

#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void volk_64f_x2_multiply_64f_u_avx(double* cVector,
                                                  const double* aVector,
                                                  const double* bVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    __m256d aVal, bVal, cVal;
    for (; number < quarter_points; number++) {
        aVal = _mm256_loadu_pd(aPtr);
        bVal = _mm256_loadu_pd(bPtr);

        cVal = _mm256_mul_pd(aVal, bVal);

        _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarter_points * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}

#endif /* LV_HAVE_AVX */

/*
 * Aligned versions
 */
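
/*
 * The _a_ kernels use aligned loads and stores (_mm_load_pd, _mm256_load_pd),
 * which require 16-byte (SSE2) or 32-byte (AVX) aligned buffers, e.g. memory
 * obtained via volk_malloc(size, volk_get_alignment()).
 */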

#ifdef LV_HAVE_SSE2

#include <emmintrin.h>

static inline void volk_64f_x2_multiply_64f_a_sse2(double* cVector,
                                                   const double* aVector,
                                                   const double* bVector,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int half_points = num_points / 2;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    __m128d aVal, bVal, cVal;
    for (; number < half_points; number++) {
        aVal = _mm_load_pd(aPtr);
        bVal = _mm_load_pd(bPtr);

        cVal = _mm_mul_pd(aVal, bVal);

        _mm_store_pd(cPtr, cVal); // Store the results back into the C container

        aPtr += 2;
        bPtr += 2;
        cPtr += 2;
    }

    number = half_points * 2;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}

#endif /* LV_HAVE_SSE2 */

#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void volk_64f_x2_multiply_64f_a_avx(double* cVector,
                                                  const double* aVector,
                                                  const double* bVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    __m256d aVal, bVal, cVal;
    for (; number < quarter_points; number++) {
        aVal = _mm256_load_pd(aPtr);
        bVal = _mm256_load_pd(bPtr);

        cVal = _mm256_mul_pd(aVal, bVal);

        _mm256_store_pd(cPtr, cVal); // Store the results back into the C container

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarter_points * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}

#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
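
/*
 * The RVV kernel is vector-length agnostic: each pass asks the hardware for
 * a legal vector length for the remaining elements (__riscv_vsetvl_e64m8),
 * so no scalar tail loop is needed. The e64m8 configuration groups eight
 * vector registers (LMUL=8) per operand to maximize work per iteration.
 */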

static inline void volk_64f_x2_multiply_64f_rvv(double* cVector,
                                                const double* aVector,
                                                const double* bVector,
                                                unsigned int num_points)
{
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e64m8(n);
        vfloat64m8_t va = __riscv_vle64_v_f64m8(aVector, vl);
        vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl);
        __riscv_vse64(cVector, __riscv_vfmul(va, vb, vl), vl);
    }
}
#endif /*LV_HAVE_RVV*/

#endif /* INCLUDED_volk_64f_x2_multiply_64f_H */