Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_accumulator_s32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
49
50#ifndef INCLUDED_volk_32f_accumulator_s32f_a_H
51#define INCLUDED_volk_32f_accumulator_s32f_a_H
52
53#include <inttypes.h>
54#include <volk/volk_common.h>
55
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Adds up num_points floats using AVX with aligned loads.
 *
 * \param result      Receives the total. Written exactly once, after all of
 *                    inputBuffer has been read.
 * \param inputBuffer Values to sum. Read with _mm256_load_ps, which requires
 *                    a 32-byte aligned address.
 * \param num_points  Number of floats in inputBuffer.
 */
static inline void volk_32f_accumulator_s32f_a_avx(float* result,
                                                   const float* inputBuffer,
                                                   unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;
    const float* aPtr = inputBuffer;
    __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
    __m256 accumulator = _mm256_setzero_ps();

    /* Eight independent partial sums, one per vector lane. */
    for (unsigned int number = 0; number < eighthPoints; number++) {
        accumulator = _mm256_add_ps(accumulator, _mm256_load_ps(aPtr));
        aPtr += 8;
    }

    /* Spill the lanes and fold them in order, lane 0 first. */
    _mm256_store_ps(tempBuffer, accumulator);
    float returnValue = tempBuffer[0];
    for (unsigned int lane = 1; lane < 8; lane++) {
        returnValue += tempBuffer[lane];
    }

    /* Leftover points that did not fill a whole vector. */
    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
        returnValue += (*aPtr++);
    }
    *result = returnValue;
}
#endif /* LV_HAVE_AVX */
97
98
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Adds up num_points floats using AVX with unaligned loads.
 *
 * \param result      Receives the total. Written exactly once, after all of
 *                    inputBuffer has been read.
 * \param inputBuffer Values to sum. Read with _mm256_loadu_ps, so no
 *                    particular alignment is needed.
 * \param num_points  Number of floats in inputBuffer.
 */
static inline void volk_32f_accumulator_s32f_u_avx(float* result,
                                                   const float* inputBuffer,
                                                   unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;
    const float* aPtr = inputBuffer;
    /* Only the local spill buffer is aligned; the input need not be. */
    __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
    __m256 accumulator = _mm256_setzero_ps();

    /* Eight independent partial sums, one per vector lane. */
    for (unsigned int number = 0; number < eighthPoints; number++) {
        accumulator = _mm256_add_ps(accumulator, _mm256_loadu_ps(aPtr));
        aPtr += 8;
    }

    /* Spill the lanes and fold them in order, lane 0 first. */
    _mm256_store_ps(tempBuffer, accumulator);
    float returnValue = tempBuffer[0];
    for (unsigned int lane = 1; lane < 8; lane++) {
        returnValue += tempBuffer[lane];
    }

    /* Leftover points that did not fill a whole vector. */
    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
        returnValue += (*aPtr++);
    }
    *result = returnValue;
}
#endif /* LV_HAVE_AVX */
140
141
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*!
 * \brief Adds up num_points floats using SSE with aligned loads.
 *
 * \param result      Receives the total. Written exactly once, after all of
 *                    inputBuffer has been read.
 * \param inputBuffer Values to sum. Read with _mm_load_ps, which requires a
 *                    16-byte aligned address.
 * \param num_points  Number of floats in inputBuffer.
 */
static inline void volk_32f_accumulator_s32f_a_sse(float* result,
                                                   const float* inputBuffer,
                                                   unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;
    const float* aPtr = inputBuffer;
    __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
    __m128 accumulator = _mm_setzero_ps();

    /* Four independent partial sums, one per vector lane. */
    for (unsigned int number = 0; number < quarterPoints; number++) {
        accumulator = _mm_add_ps(accumulator, _mm_load_ps(aPtr));
        aPtr += 4;
    }

    /* Spill the lanes and fold them in order, lane 0 first. */
    _mm_store_ps(tempBuffer, accumulator);
    float returnValue = tempBuffer[0];
    for (unsigned int lane = 1; lane < 4; lane++) {
        returnValue += tempBuffer[lane];
    }

    /* Leftover points that did not fill a whole vector. */
    for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
        returnValue += (*aPtr++);
    }
    *result = returnValue;
}
#endif /* LV_HAVE_SSE */
179
180
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*!
 * \brief Adds up num_points floats using SSE with unaligned loads.
 *
 * \param result      Receives the total. Written exactly once, after all of
 *                    inputBuffer has been read.
 * \param inputBuffer Values to sum. Read with _mm_loadu_ps, so no particular
 *                    alignment is needed.
 * \param num_points  Number of floats in inputBuffer.
 */
static inline void volk_32f_accumulator_s32f_u_sse(float* result,
                                                   const float* inputBuffer,
                                                   unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;
    const float* aPtr = inputBuffer;
    /* Only the local spill buffer is aligned; the input need not be. */
    __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
    __m128 accumulator = _mm_setzero_ps();

    /* Four independent partial sums, one per vector lane. */
    for (unsigned int number = 0; number < quarterPoints; number++) {
        accumulator = _mm_add_ps(accumulator, _mm_loadu_ps(aPtr));
        aPtr += 4;
    }

    /* Spill the lanes and fold them in order, lane 0 first. */
    _mm_store_ps(tempBuffer, accumulator);
    float returnValue = tempBuffer[0];
    for (unsigned int lane = 1; lane < 4; lane++) {
        returnValue += tempBuffer[lane];
    }

    /* Leftover points that did not fill a whole vector. */
    for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
        returnValue += (*aPtr++);
    }
    *result = returnValue;
}
#endif /* LV_HAVE_SSE */
218
219#ifdef LV_HAVE_GENERIC
/*!
 * \brief Portable implementation: adds the input points one at a time.
 *
 * \param result      Receives the total; set to 0 when num_points is 0.
 *                    Written exactly once, after all input has been read.
 * \param inputBuffer Values to sum.
 * \param num_points  Number of floats in inputBuffer.
 */
static inline void volk_32f_accumulator_s32f_generic(float* result,
                                                     const float* inputBuffer,
                                                     unsigned int num_points)
{
    float returnValue = 0;

    /* Strictly sequential accumulation, first point to last. */
    for (unsigned int number = 0; number < num_points; number++) {
        returnValue += inputBuffer[number];
    }
    *result = returnValue;
}
233#endif /* LV_HAVE_GENERIC */
234
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h> /* provides RISCV_SHRINK8 */

/*!
 * \brief Adds up num_points floats using the RISC-V vector extension.
 *
 * \param result      Receives the total. Written exactly once, at the end.
 * \param inputBuffer Values to sum.
 * \param num_points  Number of floats in inputBuffer.
 */
static inline void volk_32f_accumulator_s32f_rvv(float* result,
                                                 const float* inputBuffer,
                                                 unsigned int num_points)
{
    /* Zero every element of the LMUL=8 accumulator group. */
    vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8());
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, inputBuffer += vl) {
        vl = __riscv_vsetvl_e32m8(n);
        vfloat32m8_t v = __riscv_vle32_v_f32m8(inputBuffer, vl);
        /* Tail-undisturbed add: elements at index >= vl keep the partial sums
         * already in vsum, so a short final pass does not discard them. */
        vsum = __riscv_vfadd_tu(vsum, vsum, v, vl);
    }
    size_t vl = __riscv_vsetvlmax_e32m1();
    /* NOTE(review): RISCV_SHRINK8 is defined in volk_rvv_intrinsics.h and is
     * presumably folding the m8 group down to a single m1 register with
     * vfadd -- confirm against the macro definition. */
    vfloat32m1_t v = RISCV_SHRINK8(vfadd, f, 32, vsum);
    /* Element 0 of z is the zero starting value for the reduction. */
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    /* Reduce v to a scalar and extract it from element 0. */
    *result = __riscv_vfmv_f(__riscv_vfredusum(v, z, vl));
}
#endif /*LV_HAVE_RVV*/
256
257#endif /* INCLUDED_volk_32f_accumulator_s32f_a_H */
static void volk_32f_accumulator_s32f_a_avx(float *result, const float *inputBuffer, unsigned int num_points)
Definition volk_32f_accumulator_s32f.h:59
static void volk_32f_accumulator_s32f_u_sse(float *result, const float *inputBuffer, unsigned int num_points)
Definition volk_32f_accumulator_s32f.h:184
static void volk_32f_accumulator_s32f_generic(float *result, const float *inputBuffer, unsigned int num_points)
Definition volk_32f_accumulator_s32f.h:220
static void volk_32f_accumulator_s32f_u_avx(float *result, const float *inputBuffer, unsigned int num_points)
Definition volk_32f_accumulator_s32f.h:102
static void volk_32f_accumulator_s32f_a_sse(float *result, const float *inputBuffer, unsigned int num_points)
Definition volk_32f_accumulator_s32f.h:145
#define __VOLK_ATTR_ALIGNED(x)
Definition volk_common.h:62
#define RISCV_SHRINK8(op, T, S, v)
Definition volk_rvv_intrinsics.h:33