50#ifndef INCLUDED_volk_32f_accumulator_s32f_a_H
51#define INCLUDED_volk_32f_accumulator_s32f_a_H
60 const float* inputBuffer,
61 unsigned int num_points)
63 float returnValue = 0;
64 unsigned int number = 0;
65 const unsigned int eighthPoints = num_points / 8;
67 const float* aPtr = inputBuffer;
70 __m256 accumulator = _mm256_setzero_ps();
71 __m256 aVal = _mm256_setzero_ps();
73 for (; number < eighthPoints; number++) {
74 aVal = _mm256_load_ps(aPtr);
75 accumulator = _mm256_add_ps(accumulator, aVal);
79 _mm256_store_ps(tempBuffer, accumulator);
81 returnValue = tempBuffer[0];
82 returnValue += tempBuffer[1];
83 returnValue += tempBuffer[2];
84 returnValue += tempBuffer[3];
85 returnValue += tempBuffer[4];
86 returnValue += tempBuffer[5];
87 returnValue += tempBuffer[6];
88 returnValue += tempBuffer[7];
90 number = eighthPoints * 8;
91 for (; number < num_points; number++) {
92 returnValue += (*aPtr++);
94 *result = returnValue;
100#include <immintrin.h>
103 const float* inputBuffer,
104 unsigned int num_points)
106 float returnValue = 0;
107 unsigned int number = 0;
108 const unsigned int eighthPoints = num_points / 8;
110 const float* aPtr = inputBuffer;
113 __m256 accumulator = _mm256_setzero_ps();
114 __m256 aVal = _mm256_setzero_ps();
116 for (; number < eighthPoints; number++) {
117 aVal = _mm256_loadu_ps(aPtr);
118 accumulator = _mm256_add_ps(accumulator, aVal);
122 _mm256_store_ps(tempBuffer, accumulator);
124 returnValue = tempBuffer[0];
125 returnValue += tempBuffer[1];
126 returnValue += tempBuffer[2];
127 returnValue += tempBuffer[3];
128 returnValue += tempBuffer[4];
129 returnValue += tempBuffer[5];
130 returnValue += tempBuffer[6];
131 returnValue += tempBuffer[7];
133 number = eighthPoints * 8;
134 for (; number < num_points; number++) {
135 returnValue += (*aPtr++);
137 *result = returnValue;
143#include <xmmintrin.h>
146 const float* inputBuffer,
147 unsigned int num_points)
149 float returnValue = 0;
150 unsigned int number = 0;
151 const unsigned int quarterPoints = num_points / 4;
153 const float* aPtr = inputBuffer;
156 __m128 accumulator = _mm_setzero_ps();
157 __m128 aVal = _mm_setzero_ps();
159 for (; number < quarterPoints; number++) {
160 aVal = _mm_load_ps(aPtr);
161 accumulator = _mm_add_ps(accumulator, aVal);
165 _mm_store_ps(tempBuffer, accumulator);
167 returnValue = tempBuffer[0];
168 returnValue += tempBuffer[1];
169 returnValue += tempBuffer[2];
170 returnValue += tempBuffer[3];
172 number = quarterPoints * 4;
173 for (; number < num_points; number++) {
174 returnValue += (*aPtr++);
176 *result = returnValue;
182#include <xmmintrin.h>
185 const float* inputBuffer,
186 unsigned int num_points)
188 float returnValue = 0;
189 unsigned int number = 0;
190 const unsigned int quarterPoints = num_points / 4;
192 const float* aPtr = inputBuffer;
195 __m128 accumulator = _mm_setzero_ps();
196 __m128 aVal = _mm_setzero_ps();
198 for (; number < quarterPoints; number++) {
199 aVal = _mm_loadu_ps(aPtr);
200 accumulator = _mm_add_ps(accumulator, aVal);
204 _mm_store_ps(tempBuffer, accumulator);
206 returnValue = tempBuffer[0];
207 returnValue += tempBuffer[1];
208 returnValue += tempBuffer[2];
209 returnValue += tempBuffer[3];
211 number = quarterPoints * 4;
212 for (; number < num_points; number++) {
213 returnValue += (*aPtr++);
215 *result = returnValue;
219#ifdef LV_HAVE_GENERIC
/*!
 * \brief Portable scalar implementation: sums num_points floats from
 *        inputBuffer and writes the total to *result.
 *
 * \param result      Receives the sum; set to 0 when num_points is 0.
 * \param inputBuffer Values to add up, read front to back.
 * \param num_points  Number of floats to read from inputBuffer.
 *
 * Accumulation is done in single precision, strictly left to right.
 */
static inline void volk_32f_accumulator_s32f_generic(float* result,
                                                     const float* inputBuffer,
                                                     unsigned int num_points)
{
    const float* aPtr = inputBuffer;
    unsigned int number = 0;
    float returnValue = 0;

    for (; number < num_points; number++) {
        returnValue += (*aPtr++);
    }
    *result = returnValue;
}
236#include <riscv_vector.h>
239static inline void volk_32f_accumulator_s32f_rvv(
float* result,
240 const float* inputBuffer,
241 unsigned int num_points)
243 vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8());
244 size_t n = num_points;
245 for (
size_t vl; n > 0; n -= vl, inputBuffer += vl) {
246 vl = __riscv_vsetvl_e32m8(n);
247 vfloat32m8_t v = __riscv_vle32_v_f32m8(inputBuffer, vl);
248 vsum = __riscv_vfadd_tu(vsum, vsum, v, vl);
250 size_t vl = __riscv_vsetvlmax_e32m1();
252 vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
253 *result = __riscv_vfmv_f(__riscv_vfredusum(v, z, vl));
static void volk_32f_accumulator_s32f_a_avx(float *result, const float *inputBuffer, unsigned int num_points)
Definition volk_32f_accumulator_s32f.h:59
static void volk_32f_accumulator_s32f_u_sse(float *result, const float *inputBuffer, unsigned int num_points)
Definition volk_32f_accumulator_s32f.h:184
static void volk_32f_accumulator_s32f_generic(float *result, const float *inputBuffer, unsigned int num_points)
Definition volk_32f_accumulator_s32f.h:220
static void volk_32f_accumulator_s32f_u_avx(float *result, const float *inputBuffer, unsigned int num_points)
Definition volk_32f_accumulator_s32f.h:102
static void volk_32f_accumulator_s32f_a_sse(float *result, const float *inputBuffer, unsigned int num_points)
Definition volk_32f_accumulator_s32f.h:145
#define __VOLK_ATTR_ALIGNED(x)
Definition volk_common.h:62
#define RISCV_SHRINK8(op, T, S, v)
Definition volk_rvv_intrinsics.h:33