Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
volk_32f_64f_multiply_64f.h
/* -*- c++ -*- */
/*
 * Copyright 2018 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32f_64f_multiply_64f_H
#define INCLUDED_volk_32f_64f_multiply_64f_H

#include <inttypes.h>


#ifdef LV_HAVE_GENERIC

static inline void volk_32f_64f_multiply_64f_generic(double* cVector,
                                                     const float* aVector,
                                                     const double* bVector,
                                                     unsigned int num_points)
{
    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
    }
}

#endif /* LV_HAVE_GENERIC */
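
/*
 * Usage sketch (illustrative, not part of the original header): applications
 * normally call the runtime dispatcher volk_32f_64f_multiply_64f(), which
 * selects the fastest implementation available on the host. The buffer names
 * below are hypothetical.
 *
 * \code
 * #include <volk/volk.h>
 *
 * unsigned int num_points = 1024;
 * float* in_f = (float*)volk_malloc(num_points * sizeof(float), volk_get_alignment());
 * double* in_d = (double*)volk_malloc(num_points * sizeof(double), volk_get_alignment());
 * double* out = (double*)volk_malloc(num_points * sizeof(double), volk_get_alignment());
 *
 * for (unsigned int i = 0; i < num_points; i++) {
 *     in_f[i] = (float)i; // single-precision input
 *     in_d[i] = 0.5;      // double-precision input
 * }
 *
 * // Computes out[i] = (double)in_f[i] * in_d[i]
 * volk_32f_64f_multiply_64f(out, in_f, in_d, num_points);
 *
 * volk_free(in_f);
 * volk_free(in_d);
 * volk_free(out);
 * \endcode
 */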

/*
 * Unaligned versions
 */


#ifdef LV_HAVE_AVX

#include <immintrin.h>
#include <xmmintrin.h>

static inline void volk_32f_64f_multiply_64f_u_avx(double* cVector,
                                                   const float* aVector,
                                                   const double* bVector,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighth_points = num_points / 8;

    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;

    __m256 aVal;
    __m128 aVal1, aVal2;
    __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
    for (; number < eighth_points; number++) {

        // Load 8 floats and 8 doubles (the doubles need two 256-bit registers)
        aVal = _mm256_loadu_ps(aPtr);
        bVal1 = _mm256_loadu_pd(bPtr);
        bVal2 = _mm256_loadu_pd(bPtr + 4);

        // Split the 8 floats into two 128-bit halves of 4 floats each
        aVal1 = _mm256_extractf128_ps(aVal, 0);
        aVal2 = _mm256_extractf128_ps(aVal, 1);

        // Widen each half to 4 doubles
        aDbl1 = _mm256_cvtps_pd(aVal1);
        aDbl2 = _mm256_cvtps_pd(aVal2);

        cVal1 = _mm256_mul_pd(aDbl1, bVal1);
        cVal2 = _mm256_mul_pd(aDbl2, bVal2);

        _mm256_storeu_pd(cPtr, cVal1); // Store the results back into the C container
        _mm256_storeu_pd(cPtr + 4, cVal2); // Store the results back into the C container

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    // Scalar tail loop for the remaining (num_points % 8) elements
    number = eighth_points * 8;
    for (; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
    }
}

#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_AVX

#include <immintrin.h>
#include <xmmintrin.h>

static inline void volk_32f_64f_multiply_64f_a_avx(double* cVector,
                                                   const float* aVector,
                                                   const double* bVector,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighth_points = num_points / 8;

    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;

    __m256 aVal;
    __m128 aVal1, aVal2;
    __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
    for (; number < eighth_points; number++) {

        // Same structure as the unaligned kernel, but with aligned loads/stores
        aVal = _mm256_load_ps(aPtr);
        bVal1 = _mm256_load_pd(bPtr);
        bVal2 = _mm256_load_pd(bPtr + 4);

        aVal1 = _mm256_extractf128_ps(aVal, 0);
        aVal2 = _mm256_extractf128_ps(aVal, 1);

        aDbl1 = _mm256_cvtps_pd(aVal1);
        aDbl2 = _mm256_cvtps_pd(aVal2);

        cVal1 = _mm256_mul_pd(aDbl1, bVal1);
        cVal2 = _mm256_mul_pd(aDbl2, bVal2);

        _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
        _mm256_store_pd(cPtr + 4, cVal2); // Store the results back into the C container

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    // Scalar tail loop for the remaining (num_points % 8) elements
    number = eighth_points * 8;
    for (; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
    }
}

#endif /* LV_HAVE_AVX */
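
/*
 * Dispatch sketch (illustrative, not part of the original header): the _a_
 * kernel assumes all three buffers start on a machine-alignment boundary
 * (32 bytes for AVX), which buffers from volk_malloc(..., volk_get_alignment())
 * satisfy. For arbitrary pointers, VOLK's generated aligned/unaligned
 * dispatchers can be selected at runtime, e.g.:
 *
 * \code
 * // cVec, aVec, bVec are hypothetical caller-owned buffers
 * if (volk_is_aligned(cVec) && volk_is_aligned(aVec) && volk_is_aligned(bVec))
 *     volk_32f_64f_multiply_64f_a(cVec, aVec, bVec, num_points);
 * else
 *     volk_32f_64f_multiply_64f_u(cVec, aVec, bVec, num_points);
 * \endcode
 */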

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

static inline void volk_32f_64f_multiply_64f_rvv(double* cVector,
                                                 const float* aVector,
                                                 const double* bVector,
                                                 unsigned int num_points)
{
    size_t n = num_points;
    // Strip-mined loop: vsetvl returns how many elements (vl) this pass
    // handles, so no separate scalar tail loop is needed.
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e64m8(n);
        // Widen the f32 inputs to f64, then multiply element-wise
        vfloat64m8_t va = __riscv_vfwcvt_f(__riscv_vle32_v_f32m4(aVector, vl), vl);
        vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl);
        __riscv_vse64(cVector, __riscv_vfmul(va, vb, vl), vl);
    }
}
#endif /* LV_HAVE_RVV */

#endif /* INCLUDED_volk_32f_64f_multiply_64f_H */