32#ifndef INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
33#define INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
/*
 * Scalar range wrap (fragment). The index at the end of this listing places
 * volk_32f_s32f_s32f_mod_range_32f_generic at line 37, so these lines appear
 * to be the remainder of its parameter list and part of its body. The first
 * signature line, the declarations of inPtr and val, and everything after the
 * second branch are not visible here. The integer at the start of each line
 * looks like a line number left over from the rendered listing, not a C token.
 *
 * Visible behaviour, per input element val:
 *   - val < lower_bound: the output is val + (count + 1) * distance
 *   - val > upper_bound: the output is val - (count + 1) * distance
 * where distance = upper_bound - lower_bound, excess is how far val lies
 * beyond the violated bound, and count is excess / distance truncated by the
 * (int) cast.
 *
 * NOTE(review): with distance == 0 the quotient is a division by zero, and a
 * quotient that does not fit in int makes the (int) cast undefined behaviour;
 * confirm that callers guarantee upper_bound > lower_bound and bounded input.
 */
38 const float* inputVector,
39 const float lower_bound,
40 const float upper_bound,
41 unsigned int num_points)
43 float* outPtr = outputVector;
// Width of the target interval; every correction is a whole multiple of it.
45 const float distance = upper_bound - lower_bound;
// One pass over the num_points input samples.
47 for (inPtr = inputVector; inPtr < inputVector + num_points; inPtr++) {
// Below the interval: shift up by count + 1 whole widths.
49 if (val < lower_bound) {
50 float excess = lower_bound - val;
51 signed int count = (int)(excess / distance);
52 *outPtr = val + (count + 1) * distance;
53 }
// Above the interval: mirror image of the branch above, shifting down.
else if (val > upper_bound) {
54 float excess = val - upper_bound;
55 signed int count = (int)(excess / distance);
56 *outPtr = val - (count + 1) * distance;
/*
 * AVX range wrap, unaligned variant (fragment). The index at the end of this
 * listing places volk_32f_s32f_s32f_mod_range_32f_u_avx at line 68, so these
 * lines appear to be the remainder of its parameter list and most of its
 * body. The first signature line, the declarations of input, output, adj and
 * excess, any pointer advance inside the loop, and the callee of the final
 * call are not visible here. The integer at the start of each line looks
 * like a line number left over from the rendered listing, not a C token.
 *
 * Each loop iteration handles eight floats: lanes below lower_bound get
 * (count + 1) * distance added, lanes above upper_bound get it subtracted,
 * and every other lane gets a zero adjustment factor, with count being the
 * truncated quotient excess / distance.
 *
 * NOTE(review): a quotient that does not fit in int32 converts to 0x80000000
 * under _mm256_cvttps_epi32, and distance == 0 produces inf/NaN quotients;
 * confirm callers never reach either case.
 */
69 const float* inputVector,
70 const float lower_bound,
71 const float upper_bound,
72 unsigned int num_points)
// Broadcast the bounds and the interval width to all eight lanes.
74 const __m256 lower = _mm256_set1_ps(lower_bound);
75 const __m256 upper = _mm256_set1_ps(upper_bound);
76 const __m256 distance = _mm256_sub_ps(upper, lower);
78 __m256 is_smaller, is_bigger;
81 const float* inPtr = inputVector;
82 float* outPtr = outputVector;
// Number of complete eight-float groups; the leftover count goes to the call at the end.
83 const size_t eight_points = num_points / 8;
84 for (
size_t counter = 0; counter < eight_points; counter++) {
// Unaligned load of eight input samples.
85 input = _mm256_loadu_ps(inPtr);
// All-ones lane masks for input < lower and input > upper (ordered, non-signalling compares).
87 is_smaller = _mm256_cmp_ps(
88 input, lower, _CMP_LT_OQ);
89 is_bigger = _mm256_cmp_ps(
90 input, upper, _CMP_GT_OQ);
// Distance past the lower bound, zeroed by the mask in lanes that are not below it.
92 excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
// Merge in (input - upper) for lanes above the upper bound. The assignment
// target is not visible in this listing (presumably excess; confirm).
94 _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
// Number of interval widths by which each lane lies outside the range.
96 excess = _mm256_div_ps(excess, distance);
// Truncate toward zero through a float -> int32 -> float round trip.
98 excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
// count + 1 widths are applied.
100 adj = _mm256_set1_ps(1.0f);
101 excess = _mm256_add_ps(excess, adj);
// Turn adj into the per-lane sign: +1 below the range, -1 above it, 0 otherwise.
103 adj = _mm256_and_ps(adj, is_smaller);
104 adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
// Signed correction (count + 1) * sign * distance, added to the input.
106 excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
107 output = _mm256_add_ps(input, excess);
// Unaligned store of the eight results.
108 _mm256_storeu_ps(outPtr, output);
// Argument list of a call whose callee name is not visible here. It receives
// the current pointers and the count left over after the eight-wide groups
// (presumably a scalar tail; confirm). NOTE(review): no inPtr/outPtr advance
// is visible above; presumably it sits on lines missing from this listing.
114 outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
/*
 * AVX range wrap, aligned variant (fragment). The index at the end of this
 * listing places volk_32f_s32f_s32f_mod_range_32f_a_avx at line 116, so these
 * lines appear to be the remainder of its parameter list and most of its
 * body. The first signature line, the declarations of adj and excess, any
 * pointer advance inside the loop, and the callee of the final call are not
 * visible here. The integer at the start of each line looks like a line
 * number left over from the rendered listing, not a C token.
 *
 * Each loop iteration handles eight floats: lanes below lower_bound get
 * (count + 1) * distance added, lanes above upper_bound get it subtracted,
 * and every other lane gets a zero adjustment factor, with count being the
 * truncated quotient excess / distance. The loads and stores use
 * _mm256_load_ps / _mm256_store_ps, which require 32-byte aligned addresses.
 *
 * NOTE(review): a quotient that does not fit in int32 converts to 0x80000000
 * under _mm256_cvttps_epi32, and distance == 0 produces inf/NaN quotients;
 * confirm callers never reach either case.
 */
117 const float* inputVector,
118 const float lower_bound,
119 const float upper_bound,
120 unsigned int num_points)
// Broadcast the bounds and the interval width to all eight lanes.
122 const __m256 lower = _mm256_set1_ps(lower_bound);
123 const __m256 upper = _mm256_set1_ps(upper_bound);
124 const __m256 distance = _mm256_sub_ps(upper, lower);
125 __m256 input, output;
126 __m256 is_smaller, is_bigger;
129 const float* inPtr = inputVector;
130 float* outPtr = outputVector;
// Number of complete eight-float groups; the leftover count goes to the call at the end.
131 const size_t eight_points = num_points / 8;
132 for (
size_t counter = 0; counter < eight_points; counter++) {
// Aligned load of eight input samples.
133 input = _mm256_load_ps(inPtr);
// All-ones lane masks for input < lower and input > upper (ordered, non-signalling compares).
135 is_smaller = _mm256_cmp_ps(
136 input, lower, _CMP_LT_OQ);
137 is_bigger = _mm256_cmp_ps(
138 input, upper, _CMP_GT_OQ);
// Distance past the lower bound, zeroed by the mask in lanes that are not below it.
140 excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
// Merge in (input - upper) for lanes above the upper bound. The assignment
// target is not visible in this listing (presumably excess; confirm).
142 _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
// Number of interval widths by which each lane lies outside the range.
144 excess = _mm256_div_ps(excess, distance);
// Truncate toward zero through a float -> int32 -> float round trip.
146 excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
// count + 1 widths are applied.
148 adj = _mm256_set1_ps(1.0f);
149 excess = _mm256_add_ps(excess, adj);
// Turn adj into the per-lane sign: +1 below the range, -1 above it, 0 otherwise.
151 adj = _mm256_and_ps(adj, is_smaller);
152 adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
// Signed correction (count + 1) * sign * distance, added to the input.
154 excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
155 output = _mm256_add_ps(input, excess);
// Aligned store of the eight results.
156 _mm256_store_ps(outPtr, output);
// Argument list of a call whose callee name is not visible here. It receives
// the current pointers and the count left over after the eight-wide groups
// (presumably a scalar tail; confirm). NOTE(review): no inPtr/outPtr advance
// is visible above; presumably it sits on lines missing from this listing.
162 outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
168#include <xmmintrin.h>
/*
 * SSE2 range wrap, unaligned variant (fragment). The index at the end of this
 * listing places volk_32f_s32f_s32f_mod_range_32f_u_sse2 at line 170, so
 * these lines appear to be the remainder of its parameter list and most of
 * its body. The first signature line, the declarations of adj and excess,
 * any pointer advance inside the loop, and the callee of the final call are
 * not visible here. The integer at the start of each line looks like a line
 * number left over from the rendered listing, not a C token.
 *
 * Each loop iteration handles four floats: lanes below lower_bound get
 * (count + 1) * distance added, lanes above upper_bound get it subtracted,
 * and every other lane gets a zero adjustment factor, with count being the
 * truncated quotient excess / distance.
 *
 * NOTE(review): a quotient that does not fit in int32 converts to 0x80000000
 * under _mm_cvttps_epi32, and distance == 0 produces inf/NaN quotients;
 * confirm callers never reach either case.
 */
171 const float* inputVector,
172 const float lower_bound,
173 const float upper_bound,
174 unsigned int num_points)
// Broadcast the bounds and the interval width to all four lanes.
176 const __m128 lower = _mm_set_ps1(lower_bound);
177 const __m128 upper = _mm_set_ps1(upper_bound);
178 const __m128 distance = _mm_sub_ps(upper, lower);
179 __m128 input, output;
180 __m128 is_smaller, is_bigger;
183 const float* inPtr = inputVector;
184 float* outPtr = outputVector;
// Number of complete four-float groups; the leftover count goes to the call at the end.
185 const size_t quarter_points = num_points / 4;
186 for (
size_t counter = 0; counter < quarter_points; counter++) {
// Unaligned load of four input samples.
187 input = _mm_loadu_ps(inPtr);
// All-ones lane masks for input < lower and input > upper.
189 is_smaller = _mm_cmplt_ps(input, lower);
190 is_bigger = _mm_cmpgt_ps(input, upper);
// Distance past the violated bound; the masks zero the term that does not apply.
192 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
193 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
// Number of interval widths by which each lane lies outside the range.
195 excess = _mm_div_ps(excess, distance);
// Truncate toward zero through a float -> int32 -> float round trip.
197 excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
// count + 1 widths are applied.
199 adj = _mm_set_ps1(1.0f);
200 excess = _mm_add_ps(excess, adj);
// Turn adj into the per-lane sign: +1 below the range, -1 above it, 0 otherwise.
202 adj = _mm_and_ps(adj, is_smaller);
203 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
// Signed correction (count + 1) * sign * distance, added to the input.
205 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
206 output = _mm_add_ps(input, excess);
// Unaligned store of the four results.
207 _mm_storeu_ps(outPtr, output);
// Argument list of a call whose callee name is not visible here. It receives
// the current pointers and the count left over after the four-wide groups
// (presumably a scalar tail; confirm). NOTE(review): no inPtr/outPtr advance
// is visible above; presumably it sits on lines missing from this listing.
213 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
/*
 * SSE2 range wrap, aligned variant (fragment). The index at the end of this
 * listing places volk_32f_s32f_s32f_mod_range_32f_a_sse2 at line 215, so
 * these lines appear to be the remainder of its parameter list and most of
 * its body. The first signature line, the declarations of adj and excess,
 * any pointer advance inside the loop, and the callee of the final call are
 * not visible here. The integer at the start of each line looks like a line
 * number left over from the rendered listing, not a C token.
 *
 * Each loop iteration handles four floats: lanes below lower_bound get
 * (count + 1) * distance added, lanes above upper_bound get it subtracted,
 * and every other lane gets a zero adjustment factor, with count being the
 * truncated quotient excess / distance. The loads and stores use
 * _mm_load_ps / _mm_store_ps, which require 16-byte aligned addresses.
 *
 * NOTE(review): a quotient that does not fit in int32 converts to 0x80000000
 * under _mm_cvttps_epi32, and distance == 0 produces inf/NaN quotients;
 * confirm callers never reach either case.
 */
216 const float* inputVector,
217 const float lower_bound,
218 const float upper_bound,
219 unsigned int num_points)
// Broadcast the bounds and the interval width to all four lanes.
221 const __m128 lower = _mm_set_ps1(lower_bound);
222 const __m128 upper = _mm_set_ps1(upper_bound);
223 const __m128 distance = _mm_sub_ps(upper, lower);
224 __m128 input, output;
225 __m128 is_smaller, is_bigger;
228 const float* inPtr = inputVector;
229 float* outPtr = outputVector;
// Number of complete four-float groups; the leftover count goes to the call at the end.
230 const size_t quarter_points = num_points / 4;
231 for (
size_t counter = 0; counter < quarter_points; counter++) {
// Aligned load of four input samples.
232 input = _mm_load_ps(inPtr);
// All-ones lane masks for input < lower and input > upper.
234 is_smaller = _mm_cmplt_ps(input, lower);
235 is_bigger = _mm_cmpgt_ps(input, upper);
// Distance past the violated bound; the masks zero the term that does not apply.
237 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
238 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
// Number of interval widths by which each lane lies outside the range.
240 excess = _mm_div_ps(excess, distance);
// Truncate toward zero through a float -> int32 -> float round trip.
243 excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
// count + 1 widths are applied.
245 adj = _mm_set_ps1(1.0f);
246 excess = _mm_add_ps(excess, adj);
// Turn adj into the per-lane sign: +1 below the range, -1 above it, 0 otherwise.
248 adj = _mm_and_ps(adj, is_smaller);
249 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
// Signed correction (count + 1) * sign * distance, added to the input.
251 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
252 output = _mm_add_ps(input, excess);
// Aligned store of the four results.
253 _mm_store_ps(outPtr, output);
// Argument list of a call whose callee name is not visible here. It receives
// the current pointers and the count left over after the four-wide groups
// (presumably a scalar tail; confirm). NOTE(review): no inPtr/outPtr advance
// is visible above; presumably it sits on lines missing from this listing.
259 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
264#include <xmmintrin.h>
/*
 * SSE range wrap, unaligned variant (fragment). The index at the end of this
 * listing places volk_32f_s32f_s32f_mod_range_32f_u_sse at line 266, so these
 * lines appear to be the remainder of its parameter list and most of its
 * body. The first signature line, the declarations of adj, excess and
 * rounddown, any pointer advance inside the loop, and the callee of the
 * final call are not visible here. The integer at the start of each line
 * looks like a line number left over from the rendered listing, not a C
 * token.
 *
 * Each loop iteration handles four floats: lanes below lower_bound get
 * (count + 1) * distance added, lanes above upper_bound get it subtracted,
 * and every other lane gets a zero adjustment factor, with count being the
 * truncated quotient excess / distance.
 *
 * NOTE(review): _mm_cvttps_epi32 and _mm_cvtepi32_ps are SSE2 intrinsics
 * although the index labels this function "_sse"; confirm that the build
 * guard and header for this section provide SSE2.
 * NOTE(review): a quotient that does not fit in int32 converts to 0x80000000
 * under _mm_cvttps_epi32, and distance == 0 produces inf/NaN quotients;
 * confirm callers never reach either case.
 */
267 const float* inputVector,
268 const float lower_bound,
269 const float upper_bound,
270 unsigned int num_points)
// Broadcast the bounds and the interval width to all four lanes.
272 const __m128 lower = _mm_set_ps1(lower_bound);
273 const __m128 upper = _mm_set_ps1(upper_bound);
274 const __m128 distance = _mm_sub_ps(upper, lower);
275 __m128 input, output;
276 __m128 is_smaller, is_bigger;
280 const float* inPtr = inputVector;
281 float* outPtr = outputVector;
// Number of complete four-float groups; the leftover count goes to the call at the end.
282 const size_t quarter_points = num_points / 4;
283 for (
size_t counter = 0; counter < quarter_points; counter++) {
// Unaligned load of four input samples.
284 input = _mm_loadu_ps(inPtr);
// All-ones lane masks for input < lower and input > upper.
286 is_smaller = _mm_cmplt_ps(input, lower);
287 is_bigger = _mm_cmpgt_ps(input, upper);
// Distance past the violated bound; the masks zero the term that does not apply.
289 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
290 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
// Number of interval widths by which each lane lies outside the range.
292 excess = _mm_div_ps(excess, distance);
// Truncate toward zero: convert to int32, then back to float.
294 rounddown = _mm_cvttps_epi32(excess);
295 excess = _mm_cvtepi32_ps(rounddown);
// count + 1 widths are applied.
297 adj = _mm_set_ps1(1.0f);
298 excess = _mm_add_ps(excess, adj);
// Turn adj into the per-lane sign: +1 below the range, -1 above it, 0 otherwise.
300 adj = _mm_and_ps(adj, is_smaller);
301 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
// Signed correction (count + 1) * sign * distance, added to the input.
303 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
304 output = _mm_add_ps(input, excess);
// Unaligned store of the four results.
305 _mm_storeu_ps(outPtr, output);
// Argument list of a call whose callee name is not visible here. It receives
// the current pointers and the count left over after the four-wide groups
// (presumably a scalar tail; confirm). NOTE(review): no inPtr/outPtr advance
// is visible above; presumably it sits on lines missing from this listing.
311 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
/*
 * SSE range wrap, aligned variant (fragment). The index at the end of this
 * listing places volk_32f_s32f_s32f_mod_range_32f_a_sse at line 313, so these
 * lines appear to be the remainder of its parameter list and most of its
 * body. The first signature line, the declarations of adj, excess and
 * rounddown, any pointer advance inside the loop, and the callee of the
 * final call are not visible here. The integer at the start of each line
 * looks like a line number left over from the rendered listing, not a C
 * token.
 *
 * Each loop iteration handles four floats: lanes below lower_bound get
 * (count + 1) * distance added, lanes above upper_bound get it subtracted,
 * and every other lane gets a zero adjustment factor, with count being the
 * truncated quotient excess / distance. The loads and stores use
 * _mm_load_ps / _mm_store_ps, which require 16-byte aligned addresses.
 *
 * NOTE(review): _mm_cvttps_epi32 and _mm_cvtepi32_ps are SSE2 intrinsics
 * although the index labels this function "_sse"; confirm that the build
 * guard and header for this section provide SSE2.
 * NOTE(review): a quotient that does not fit in int32 converts to 0x80000000
 * under _mm_cvttps_epi32, and distance == 0 produces inf/NaN quotients;
 * confirm callers never reach either case.
 */
314 const float* inputVector,
315 const float lower_bound,
316 const float upper_bound,
317 unsigned int num_points)
// Broadcast the bounds and the interval width to all four lanes.
319 const __m128 lower = _mm_set_ps1(lower_bound);
320 const __m128 upper = _mm_set_ps1(upper_bound);
321 const __m128 distance = _mm_sub_ps(upper, lower);
322 __m128 input, output;
323 __m128 is_smaller, is_bigger;
327 const float* inPtr = inputVector;
328 float* outPtr = outputVector;
// Number of complete four-float groups; the leftover count goes to the call at the end.
329 const size_t quarter_points = num_points / 4;
330 for (
size_t counter = 0; counter < quarter_points; counter++) {
// Aligned load of four input samples.
331 input = _mm_load_ps(inPtr);
// All-ones lane masks for input < lower and input > upper.
333 is_smaller = _mm_cmplt_ps(input, lower);
334 is_bigger = _mm_cmpgt_ps(input, upper);
// Distance past the violated bound; the masks zero the term that does not apply.
336 excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
337 excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
// Number of interval widths by which each lane lies outside the range.
339 excess = _mm_div_ps(excess, distance);
// Truncate toward zero: convert to int32, then back to float.
341 rounddown = _mm_cvttps_epi32(excess);
342 excess = _mm_cvtepi32_ps(rounddown);
// count + 1 widths are applied.
344 adj = _mm_set_ps1(1.0f);
345 excess = _mm_add_ps(excess, adj);
// Turn adj into the per-lane sign: +1 below the range, -1 above it, 0 otherwise.
347 adj = _mm_and_ps(adj, is_smaller);
348 adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
// Signed correction (count + 1) * sign * distance, added to the input.
350 excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
351 output = _mm_add_ps(input, excess);
// Aligned store of the four results.
352 _mm_store_ps(outPtr, output);
// Argument list of a call whose callee name is not visible here. It receives
// the current pointers and the count left over after the four-wide groups
// (presumably a scalar tail; confirm). NOTE(review): no inPtr/outPtr advance
// is visible above; presumably it sits on lines missing from this listing.
358 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
363#include <riscv_vector.h>
/*
 * RISC-V Vector implementation of the range wrap.
 *
 * Every element below lower_bound is moved up, and every element above
 * upper_bound is moved down, by (trunc(excess / width) + 1) * width, where
 * width = upper_bound - lower_bound and excess is the distance from the
 * element to the bound it violates. Elements for which neither comparison
 * holds are stored unchanged.
 *
 * outputVector  destination for num_points floats
 * inputVector   source of num_points floats
 * lower_bound   lower edge of the target interval
 * upper_bound   upper edge of the target interval
 * num_points    number of elements to process
 */
static inline void volk_32f_s32f_s32f_mod_range_32f_rvv(float* outputVector,
                                                        const float* inputVector,
                                                        const float lower_bound,
                                                        const float upper_bound,
                                                        unsigned int num_points)
{
    const float width = upper_bound - lower_bound;
    const size_t max_vl = __riscv_vsetvlmax_e32m4();

    /* Loop-invariant broadcasts of the bounds and of +/- the interval width. */
    vfloat32m4_t pos_width = __riscv_vfmv_v_f_f32m4(width, max_vl);
    vfloat32m4_t neg_width = __riscv_vfmv_v_f_f32m4(-width, max_vl);
    vfloat32m4_t hi_bcast = __riscv_vfmv_v_f_f32m4(upper_bound, max_vl);
    vfloat32m4_t lo_bcast = __riscv_vfmv_v_f_f32m4(lower_bound, max_vl);

    size_t remaining = num_points;
    while (remaining > 0) {
        /* The hardware picks how many elements this pass covers. */
        const size_t vl = __riscv_vsetvl_e32m4(remaining);
        vfloat32m4_t data = __riscv_vle32_v_f32m4(inputVector, vl);

        /* Which elements fall outside the interval, and on which side. */
        vbool8_t below = __riscv_vmflt(data, lo_bcast, vl);
        vbool8_t above = __riscv_vmfgt(data, hi_bcast, vl);
        vbool8_t outside = __riscv_vmor(below, above, vl);

        /* Distance to the violated bound; the "above" form is the fallback. */
        vfloat32m4_t under_by = __riscv_vfsub(lo_bcast, data, vl);
        vfloat32m4_t over_by = __riscv_vfsub(data, hi_bcast, vl);
        vfloat32m4_t excess = __riscv_vmerge(over_by, under_by, below, vl);

        /* Signed step: +width for elements below the range, -width otherwise. */
        vfloat32m4_t step = __riscv_vmerge(neg_width, pos_width, below, vl);

        /* Whole widths outside the range (truncated toward zero), plus one. */
        vfloat32m4_t ratio = __riscv_vfdiv(excess, pos_width, vl);
        vint32m4_t whole = __riscv_vfcvt_rtz_x(ratio, vl);
        vfloat32m4_t wraps = __riscv_vfcvt_f(__riscv_vadd(whole, 1, vl), vl);

        /* data += wraps * step, only where the element was out of range. */
        data = __riscv_vfmacc_mu(outside, data, wraps, step, vl);

        __riscv_vse32(outputVector, data, vl);

        remaining -= vl;
        outputVector += vl;
        inputVector += vl;
    }
}
static void volk_32f_s32f_s32f_mod_range_32f_u_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition volk_32f_s32f_s32f_mod_range_32f.h:266
static void volk_32f_s32f_s32f_mod_range_32f_a_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition volk_32f_s32f_s32f_mod_range_32f.h:116
static void volk_32f_s32f_s32f_mod_range_32f_a_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition volk_32f_s32f_s32f_mod_range_32f.h:313
static void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition volk_32f_s32f_s32f_mod_range_32f.h:170
static void volk_32f_s32f_s32f_mod_range_32f_generic(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition volk_32f_s32f_s32f_mod_range_32f.h:37
static void volk_32f_s32f_s32f_mod_range_32f_u_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition volk_32f_s32f_s32f_mod_range_32f.h:68
static void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition volk_32f_s32f_s32f_mod_range_32f.h:215