Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
volk_32f_64f_add_64f.h
/* -*- c++ -*- */
/*
 * Copyright 2018 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32f_64f_add_64f_H
#define INCLUDED_volk_32f_64f_add_64f_H

#include <inttypes.h>

#ifdef LV_HAVE_GENERIC

static inline void volk_32f_64f_add_64f_generic(double* cVector,
                                                const float* aVector,
                                                const double* bVector,
                                                unsigned int num_points)
{
    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}

#endif /* LV_HAVE_GENERIC */
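
/* The SIMD kernels below all follow the same pattern as the generic loop:
 * load a block of inputs, widen the floats to double precision, add, store,
 * and finish any leftover elements with a scalar tail loop. */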
#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

static inline void volk_32f_64f_add_64f_neon(double* cVector,
                                             const float* aVector,
                                             const double* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int half_points = num_points / 2;

    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;

    float64x2_t aVal, bVal, cVal;
    float32x2_t aVal1;
    for (number = 0; number < half_points; number++) {
        // Load into NEON registers: 2 floats and 2 doubles per iteration
        aVal1 = vld1_f32(aPtr);
        bVal = vld1q_f64(bPtr);
        __VOLK_PREFETCH(aPtr + 2);
        __VOLK_PREFETCH(bPtr + 2);
        aPtr += 2;
        bPtr += 2;

        // Widen the floats to doubles
        aVal = vcvt_f64_f32(aVal1);
        // Vector add
        cVal = vaddq_f64(aVal, bVal);
        // Store the results back into the C container
        vst1q_f64(cPtr, cVal);

        cPtr += 2;
    }

    // Scalar tail for the leftover element when num_points is odd
    number = half_points * 2;
    for (; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}

#endif /* LV_HAVE_NEONV8 */
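
/* Note: this kernel is guarded by LV_HAVE_NEONV8 rather than LV_HAVE_NEON
 * because the double-precision NEON types and intrinsics it uses
 * (float64x2_t, vcvt_f64_f32, vaddq_f64) are only available on AArch64. */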

#ifdef LV_HAVE_AVX

#include <immintrin.h>
#include <xmmintrin.h>

static inline void volk_32f_64f_add_64f_u_avx(double* cVector,
                                              const float* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighth_points = num_points / 8;

    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;

    __m256 aVal;
    __m128 aVal1, aVal2;
    __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
    for (; number < eighth_points; number++) {

        aVal = _mm256_loadu_ps(aPtr);
        bVal1 = _mm256_loadu_pd(bPtr);
        bVal2 = _mm256_loadu_pd(bPtr + 4);

        aVal1 = _mm256_extractf128_ps(aVal, 0);
        aVal2 = _mm256_extractf128_ps(aVal, 1);

        aDbl1 = _mm256_cvtps_pd(aVal1);
        aDbl2 = _mm256_cvtps_pd(aVal2);

        cVal1 = _mm256_add_pd(aDbl1, bVal1);
        cVal2 = _mm256_add_pd(aDbl2, bVal2);

        // Store the results back into the C container
        _mm256_storeu_pd(cPtr, cVal1);
        _mm256_storeu_pd(cPtr + 4, cVal2);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighth_points * 8;
    for (; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}

#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_AVX

#include <immintrin.h>
#include <xmmintrin.h>

static inline void volk_32f_64f_add_64f_a_avx(double* cVector,
                                              const float* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighth_points = num_points / 8;

    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;

    __m256 aVal;
    __m128 aVal1, aVal2;
    __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
    for (; number < eighth_points; number++) {

        aVal = _mm256_load_ps(aPtr);
        bVal1 = _mm256_load_pd(bPtr);
        bVal2 = _mm256_load_pd(bPtr + 4);

        aVal1 = _mm256_extractf128_ps(aVal, 0);
        aVal2 = _mm256_extractf128_ps(aVal, 1);

        aDbl1 = _mm256_cvtps_pd(aVal1);
        aDbl2 = _mm256_cvtps_pd(aVal2);

        cVal1 = _mm256_add_pd(aDbl1, bVal1);
        cVal2 = _mm256_add_pd(aDbl2, bVal2);

        // Store the results back into the C container
        _mm256_store_pd(cPtr, cVal1);
        _mm256_store_pd(cPtr + 4, cVal2);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighth_points * 8;
    for (; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}

#endif /* LV_HAVE_AVX */
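
/* The _u_avx and _a_avx kernels above differ only in their load/store
 * intrinsics: the unaligned variant (_mm256_loadu_ps / _mm256_storeu_pd)
 * accepts any pointer, while the aligned variant (_mm256_load_ps /
 * _mm256_store_pd) requires 32-byte aligned buffers, such as those returned
 * by volk_malloc(). */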

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

static inline void volk_32f_64f_add_64f_rvv(double* cVector,
                                            const float* aVector,
                                            const double* bVector,
                                            unsigned int num_points)
{
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e64m8(n);
        // Load the floats, widen them to doubles, add, and store
        vfloat64m8_t va = __riscv_vfwcvt_f(__riscv_vle32_v_f32m4(aVector, vl), vl);
        vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl);
        __riscv_vse64(cVector, __riscv_vfadd(va, vb, vl), vl);
    }
}
#endif /* LV_HAVE_RVV */

#endif /* INCLUDED_volk_32f_64f_add_64f_H */
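
A minimal usage sketch, shown here for context rather than as part of the header: callers normally go through the volk_32f_64f_add_64f dispatcher declared in <volk/volk.h>, which selects the fastest protokernel above for the running CPU. The helpers volk_get_alignment, volk_malloc and volk_free are part of the public VOLK API; the vector length and fill values below are only illustrative.

#include <volk/volk.h>

int main(void)
{
    unsigned int num_points = 1024;
    size_t alignment = volk_get_alignment();

    // volk_malloc returns buffers aligned for the aligned (a_) protokernels
    float* in_f = (float*)volk_malloc(num_points * sizeof(float), alignment);
    double* in_d = (double*)volk_malloc(num_points * sizeof(double), alignment);
    double* out = (double*)volk_malloc(num_points * sizeof(double), alignment);

    for (unsigned int i = 0; i < num_points; i++) {
        in_f[i] = (float)i;
        in_d[i] = 0.5 * i;
    }

    // out[i] = (double)in_f[i] + in_d[i] for every element
    volk_32f_64f_add_64f(out, in_f, in_d, num_points);

    volk_free(in_f);
    volk_free(in_d);
    volk_free(out);
    return 0;
}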