44#ifndef INCLUDED_volk_32fc_s32f_x2_clamp_32f_a_H
45#define INCLUDED_volk_32fc_s32f_x2_clamp_32f_a_H
52 unsigned int num_points)
54 unsigned int number = 0;
55 for (; number < num_points; number++) {
58 }
else if (*in < min) {
71static inline void volk_32f_s32f_x2_clamp_32f_a_avx2(
float* out,
75 unsigned int num_points)
77 const __m256 vmin = _mm256_set1_ps(min);
78 const __m256 vmax = _mm256_set1_ps(max);
80 unsigned int number = 0;
81 unsigned int eighth_points = num_points / 8;
82 for (; number < eighth_points; number++) {
83 __m256 res = _mm256_load_ps(in);
84 __m256 max_mask = _mm256_cmp_ps(vmax, res, _CMP_LT_OS);
85 __m256 min_mask = _mm256_cmp_ps(res, vmin, _CMP_LT_OS);
86 res = _mm256_blendv_ps(res, vmax, max_mask);
87 res = _mm256_blendv_ps(res, vmin, min_mask);
88 _mm256_store_ps(out, res);
93 number = eighth_points * 8;
100static inline void volk_32f_s32f_x2_clamp_32f_a_sse4_1(
float* out,
104 unsigned int num_points)
106 const __m128 vmin = _mm_set1_ps(min);
107 const __m128 vmax = _mm_set1_ps(max);
109 unsigned int number = 0;
110 unsigned int quarter_points = num_points / 4;
111 for (; number < quarter_points; number++) {
112 __m128 res = _mm_load_ps(in);
113 __m128 max_mask = _mm_cmplt_ps(vmax, res);
114 __m128 min_mask = _mm_cmplt_ps(res, vmin);
115 res = _mm_blendv_ps(res, vmax, max_mask);
116 res = _mm_blendv_ps(res, vmin, min_mask);
117 _mm_store_ps(out, res);
122 number = quarter_points * 4;
129#ifndef INCLUDED_volk_32fc_s32f_x2_clamp_32f_u_H
130#define INCLUDED_volk_32fc_s32f_x2_clamp_32f_u_H
133#include <immintrin.h>
134static inline void volk_32f_s32f_x2_clamp_32f_u_avx2(
float* out,
138 unsigned int num_points)
140 const __m256 vmin = _mm256_set1_ps(min);
141 const __m256 vmax = _mm256_set1_ps(max);
143 unsigned int number = 0;
144 unsigned int eighth_points = num_points / 8;
145 for (; number < eighth_points; number++) {
146 __m256 res = _mm256_loadu_ps(in);
147 __m256 max_mask = _mm256_cmp_ps(vmax, res, _CMP_LT_OS);
148 __m256 min_mask = _mm256_cmp_ps(res, vmin, _CMP_LT_OS);
149 res = _mm256_blendv_ps(res, vmax, max_mask);
150 res = _mm256_blendv_ps(res, vmin, min_mask);
151 _mm256_storeu_ps(out, res);
156 number = eighth_points * 8;
162#include <immintrin.h>
163static inline void volk_32f_s32f_x2_clamp_32f_u_sse4_1(
float* out,
167 unsigned int num_points)
169 const __m128 vmin = _mm_set1_ps(min);
170 const __m128 vmax = _mm_set1_ps(max);
172 unsigned int number = 0;
173 unsigned int quarter_points = num_points / 4;
174 for (; number < quarter_points; number++) {
175 __m128 res = _mm_loadu_ps(in);
176 __m128 max_mask = _mm_cmplt_ps(vmax, res);
177 __m128 min_mask = _mm_cmplt_ps(res, vmin);
178 res = _mm_blendv_ps(res, vmax, max_mask);
179 res = _mm_blendv_ps(res, vmin, min_mask);
180 _mm_storeu_ps(out, res);
185 number = quarter_points * 4;
191#include <riscv_vector.h>
193static inline void volk_32f_s32f_x2_clamp_32f_rvv(
float* out,
197 unsigned int num_points)
199 vfloat32m8_t vmin = __riscv_vfmv_v_f_f32m8(min, __riscv_vsetvlmax_e32m8());
200 vfloat32m8_t vmax = __riscv_vfmv_v_f_f32m8(max, __riscv_vsetvlmax_e32m8());
201 size_t n = num_points;
202 for (
size_t vl; n > 0; n -= vl, in += vl, out += vl) {
203 vl = __riscv_vsetvl_e32m8(n);
204 vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl);
205 v = __riscv_vfmin(__riscv_vfmax(v, vmin, vl), vmax, vl);
206 __riscv_vse32(out, v, vl);
static void volk_32f_s32f_x2_clamp_32f_generic(float *out, const float *in, const float min, const float max, unsigned int num_points)
Definition volk_32f_s32f_x2_clamp_32f.h:48