Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_reciprocal_32f.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2024 Magnus Lundmark <magnuslundmark@gmail.com>
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */
9
52
53#ifndef INCLUDED_volk_32f_reciprocal_32f_a_H
54#define INCLUDED_volk_32f_reciprocal_32f_a_H
55
56#ifdef LV_HAVE_GENERIC
/*!
 * \brief Element-wise reciprocal, portable scalar implementation.
 *
 * Stores 1.f / in[i] into out[i] for every i in [0, num_points).
 *
 * \param out Destination buffer; num_points floats are written.
 * \param in Source buffer; num_points floats are read.
 * \param num_points Number of elements to process; 0 leaves out untouched.
 */
static inline void
volk_32f_reciprocal_32f_generic(float* out, const float* in, unsigned int num_points)
{
    for (unsigned int i = 0; i < num_points; i++) {
        out[i] = 1.f / in[i];
    }
}
64#endif /* LV_HAVE_GENERIC */
65
66#ifdef LV_HAVE_SSE
67#include <xmmintrin.h>
/*!
 * \brief Element-wise reciprocal using SSE with aligned loads and stores.
 *
 * Handles four floats per iteration with a full-precision vector divide; the
 * remaining (num_points % 4) elements are delegated to the generic kernel.
 *
 * \param out Destination buffer; must be 16-byte aligned because
 *            _mm_store_ps is used.
 * \param in Source buffer; must be 16-byte aligned because _mm_load_ps is used.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_reciprocal_32f_a_sse(float* out, const float* in, unsigned int num_points)
{
    const __m128 ONE = _mm_set_ps1(1.f);
    const unsigned int quarter_points = num_points / 4;

    for (unsigned int number = 0; number < quarter_points; number++) {
        __m128 x = _mm_load_ps(in);
        in += 4;
        __m128 r = _mm_div_ps(ONE, x);
        _mm_store_ps(out, r);
        out += 4;
    }

    const unsigned int done = quarter_points * 4;

    /* in/out already point past the vectorised part, so only the tail length
     * is passed on. */
    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
}
86#endif /* LV_HAVE_SSE */
87
88#ifdef LV_HAVE_AVX
89#include <immintrin.h>
/*!
 * \brief Element-wise reciprocal using AVX with aligned loads and stores.
 *
 * Handles eight floats per iteration with a full-precision vector divide; the
 * remaining (num_points % 8) elements are delegated to the generic kernel.
 *
 * \param out Destination buffer; must be 32-byte aligned because
 *            _mm256_store_ps is used.
 * \param in Source buffer; must be 32-byte aligned because _mm256_load_ps is
 *           used.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_reciprocal_32f_a_avx(float* out, const float* in, unsigned int num_points)
{
    const __m256 ONE = _mm256_set1_ps(1.f);
    const unsigned int eighth_points = num_points / 8;

    for (unsigned int number = 0; number < eighth_points; number++) {
        __m256 x = _mm256_load_ps(in);
        in += 8;
        __m256 r = _mm256_div_ps(ONE, x);
        _mm256_store_ps(out, r);
        out += 8;
    }

    const unsigned int done = eighth_points * 8;

    /* in/out already point past the vectorised part, so only the tail length
     * is passed on. */
    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
}
108#endif /* LV_HAVE_AVX */
109
110#ifdef LV_HAVE_AVX512F
111#include <immintrin.h>
/*!
 * \brief Element-wise reciprocal using AVX-512F with aligned loads and stores.
 *
 * Handles sixteen floats per iteration with _mm512_rcp14_ps; the remaining
 * (num_points % 16) elements are delegated to the generic kernel.
 *
 * \note _mm512_rcp14_ps is an approximation (Intel documents a maximum
 *       relative error below 2^-14), whereas the scalar tail uses an exact
 *       division. Results from the vectorised part can therefore differ
 *       slightly from those of the generic kernel.
 *
 * \param out Destination buffer; must be 64-byte aligned because
 *            _mm512_store_ps is used.
 * \param in Source buffer; must be 64-byte aligned because _mm512_load_ps is
 *           used.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_reciprocal_32f_a_avx512(float* out, const float* in, unsigned int num_points)
{
    const unsigned int sixteenth_points = num_points / 16;

    for (unsigned int number = 0; number < sixteenth_points; number++) {
        __m512 x = _mm512_load_ps(in);
        in += 16;
        __m512 r = _mm512_rcp14_ps(x);
        _mm512_store_ps(out, r);
        out += 16;
    }

    const unsigned int done = sixteenth_points * 16;

    /* in/out already point past the vectorised part, so only the tail length
     * is passed on. */
    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
}
129#endif /* LV_HAVE_AVX512F */
130
131#endif /* INCLUDED_volk_32f_reciprocal_32f_a_H */
132
133#ifndef INCLUDED_volk_32f_reciprocal_32f_u_H
134#define INCLUDED_volk_32f_reciprocal_32f_u_H
135
136#ifdef LV_HAVE_SSE
137#include <xmmintrin.h>
/*!
 * \brief Element-wise reciprocal using SSE with unaligned loads and stores.
 *
 * Handles four floats per iteration with a full-precision vector divide; the
 * remaining (num_points % 4) elements are delegated to the generic kernel.
 * The unaligned load/store intrinsics are used, so the buffers need no
 * particular alignment.
 *
 * \param out Destination buffer; num_points floats are written.
 * \param in Source buffer; num_points floats are read.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_reciprocal_32f_u_sse(float* out, const float* in, unsigned int num_points)
{
    const __m128 ONE = _mm_set_ps1(1.f);
    const unsigned int quarter_points = num_points / 4;

    for (unsigned int number = 0; number < quarter_points; number++) {
        __m128 x = _mm_loadu_ps(in);
        in += 4;
        __m128 r = _mm_div_ps(ONE, x);
        _mm_storeu_ps(out, r);
        out += 4;
    }

    const unsigned int done = quarter_points * 4;

    /* in/out already point past the vectorised part, so only the tail length
     * is passed on. */
    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
}
156#endif /* LV_HAVE_SSE */
157
158#ifdef LV_HAVE_AVX
159#include <immintrin.h>
/*!
 * \brief Element-wise reciprocal using AVX with unaligned loads and stores.
 *
 * Handles eight floats per iteration with a full-precision vector divide; the
 * remaining (num_points % 8) elements are delegated to the generic kernel.
 * The unaligned load/store intrinsics are used, so the buffers need no
 * particular alignment.
 *
 * \param out Destination buffer; num_points floats are written.
 * \param in Source buffer; num_points floats are read.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_reciprocal_32f_u_avx(float* out, const float* in, unsigned int num_points)
{
    const __m256 ONE = _mm256_set1_ps(1.f);
    const unsigned int eighth_points = num_points / 8;

    for (unsigned int number = 0; number < eighth_points; number++) {
        __m256 x = _mm256_loadu_ps(in);
        in += 8;
        __m256 r = _mm256_div_ps(ONE, x);
        _mm256_storeu_ps(out, r);
        out += 8;
    }

    const unsigned int done = eighth_points * 8;

    /* in/out already point past the vectorised part, so only the tail length
     * is passed on. */
    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
}
178#endif /* LV_HAVE_AVX */
179
180#ifdef LV_HAVE_AVX512F
181#include <immintrin.h>
/*!
 * \brief Element-wise reciprocal using AVX-512F with unaligned loads and
 *        stores.
 *
 * Handles sixteen floats per iteration with _mm512_rcp14_ps; the remaining
 * (num_points % 16) elements are delegated to the generic kernel. The
 * unaligned load/store intrinsics are used, so the buffers need no particular
 * alignment.
 *
 * \note _mm512_rcp14_ps is an approximation (Intel documents a maximum
 *       relative error below 2^-14), whereas the scalar tail uses an exact
 *       division. Results from the vectorised part can therefore differ
 *       slightly from those of the generic kernel.
 *
 * \param out Destination buffer; num_points floats are written.
 * \param in Source buffer; num_points floats are read.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_reciprocal_32f_u_avx512(float* out, const float* in, unsigned int num_points)
{
    const unsigned int sixteenth_points = num_points / 16;

    for (unsigned int number = 0; number < sixteenth_points; number++) {
        __m512 x = _mm512_loadu_ps(in);
        in += 16;
        __m512 r = _mm512_rcp14_ps(x);
        _mm512_storeu_ps(out, r);
        out += 16;
    }

    const unsigned int done = sixteenth_points * 16;

    /* in/out already point past the vectorised part, so only the tail length
     * is passed on. */
    volk_32f_reciprocal_32f_generic(out, in, num_points - done);
}
199#endif /* LV_HAVE_AVX512F */
200
201#ifdef LV_HAVE_RVV
202#include <riscv_vector.h>
203
/*!
 * \brief Element-wise reciprocal using the RISC-V Vector extension.
 *
 * Each iteration asks __riscv_vsetvl_e32m8 how many 32-bit elements (vl) to
 * handle, loads that many floats, computes 1.0f / v with the reversed-operand
 * divide __riscv_vfrdiv, and stores the result. No separate scalar tail is
 * needed because vl is derived from the remaining count on every pass.
 *
 * \param out Destination buffer; num_points floats are written.
 * \param in Source buffer; num_points floats are read.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_reciprocal_32f_rvv(float* out, const float* in, unsigned int num_points)
{
    size_t n = num_points;
    /* vl is assigned at the top of the body before the step expression reads
     * it. */
    for (size_t vl; n > 0; n -= vl, in += vl, out += vl) {
        vl = __riscv_vsetvl_e32m8(n);
        vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl);
        __riscv_vse32(out, __riscv_vfrdiv(v, 1.0f, vl), vl);
    }
}
214#endif /*LV_HAVE_RVV*/
215
216#endif /* INCLUDED_volk_32f_reciprocal_32f_u_H */
static void volk_32f_reciprocal_32f_a_sse(float *out, const float *in, unsigned int num_points)
Definition volk_32f_reciprocal_32f.h:69
static void volk_32f_reciprocal_32f_u_avx(float *out, const float *in, unsigned int num_points)
Definition volk_32f_reciprocal_32f.h:161
static void volk_32f_reciprocal_32f_a_avx(float *out, const float *in, unsigned int num_points)
Definition volk_32f_reciprocal_32f.h:91
static void volk_32f_reciprocal_32f_generic(float *out, const float *in, unsigned int num_points)
Definition volk_32f_reciprocal_32f.h:58
static void volk_32f_reciprocal_32f_u_sse(float *out, const float *in, unsigned int num_points)
Definition volk_32f_reciprocal_32f.h:139
for i
Definition volk_config_fixed.tmpl.h:13