volk_16i_x4_quad_max_star_16i.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
#define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H

#include <inttypes.h>

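/*
 * Overview (summarized from the generic implementation below): for each of
 * the num_points 16-bit samples, the kernel reduces four input streams to
 * one via a pairwise select on wrapped 16-bit differences:
 *
 *   t0        = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]
 *   t1        = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i]
 *   target[i] = ((short)(t0 - t1) > 0) ? t0 : t1
 *
 * For inputs whose differences do not overflow, this is simply
 * max(max(src0, src1), max(src2, src3)) element by element.
 */
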
#ifdef LV_HAVE_SSE2

#include <emmintrin.h>

static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target,
                                                        short* src0,
                                                        short* src1,
                                                        short* src2,
                                                        short* src3,
                                                        unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

    int bound = (num_bytes >> 4);         /* number of full 8-sample (16-byte) chunks */
    int bound_copy = bound;
    int leftovers = (num_bytes >> 1) & 7; /* num_points & 7: samples beyond the last full chunk */

    __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
    p_target = (__m128i*)target;
    p_src0 = (__m128i*)src0;
    p_src1 = (__m128i*)src1;
    p_src2 = (__m128i*)src2;
    p_src3 = (__m128i*)src3;

    __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

    while (bound_copy > 0) {
        xmm1 = _mm_load_si128(p_src0);
        xmm2 = _mm_load_si128(p_src1);
        xmm3 = _mm_load_si128(p_src2);
        xmm4 = _mm_load_si128(p_src3);

        xmm5 = _mm_setzero_si128();
        xmm6 = _mm_setzero_si128();
        xmm7 = xmm1;
        xmm8 = xmm3;

        /* Wrapped differences src1 - src0 and src3 - src2. */
        xmm1 = _mm_sub_epi16(xmm2, xmm1);

        xmm3 = _mm_sub_epi16(xmm4, xmm3);

        /* All-ones lanes where the difference is positive. */
        xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
        xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);

        /* Mask-and-blend: keep src1/src3 where the mask is set, src0/src2
         * elsewhere; adding the disjoint halves merges the two selects. */
        xmm2 = _mm_and_si128(xmm5, xmm2);
        xmm4 = _mm_and_si128(xmm6, xmm4);
        xmm5 = _mm_andnot_si128(xmm5, xmm7);
        xmm6 = _mm_andnot_si128(xmm6, xmm8);

        xmm5 = _mm_add_epi16(xmm2, xmm5);
        xmm6 = _mm_add_epi16(xmm4, xmm6);

        /* Second stage: the same select between the two pairwise winners. */
        xmm1 = _mm_xor_si128(xmm1, xmm1);
        xmm2 = xmm5;
        xmm5 = _mm_sub_epi16(xmm6, xmm5);
        p_src0 += 1;
        bound_copy -= 1;

        xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
        p_src1 += 1;

        xmm6 = _mm_and_si128(xmm1, xmm6);

        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        p_src2 += 1;

        xmm1 = _mm_add_epi16(xmm6, xmm1);
        p_src3 += 1;

        _mm_store_si128(p_target, xmm1);
        p_target += 1;
    }

    /* Scalar tail for the samples that did not fill a full vector. */
    short temp0 = 0;
    short temp1 = 0;
    for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
        temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
    return;
}

#endif /*LV_HAVE_SSE2*/
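
/* For reference, the branch-free select used lane by lane in the SIMD paths
 * above, written out in scalar form. The mask is all-ones exactly where the
 * wrapped difference (b - a) is positive, so the two masked halves are
 * disjoint and adding (or OR-ing) them yields the selected value. This
 * helper is illustrative only and is not part of the VOLK API. */
static inline short volk_16i_quad_max_star_select_illustrative(short a, short b)
{
    short mask = (short)(((short)(b - a) > 0) ? 0xFFFF : 0x0000);
    return (short)((mask & b) | ((short)~mask & a));
}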

#ifdef LV_HAVE_NEON

#include <arm_neon.h>

static inline void volk_16i_x4_quad_max_star_16i_neon(short* target,
                                                      short* src0,
                                                      short* src1,
                                                      short* src2,
                                                      short* src3,
                                                      unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    unsigned i;

    int16x8_t src0_vec, src1_vec, src2_vec, src3_vec;
    int16x8_t diff12, diff34;
    int16x8_t comp0, comp1, comp2, comp3;
    int16x8_t result1_vec, result2_vec;
    int16x8_t zeros;
    zeros = vdupq_n_s16(0);
    for (i = 0; i < eighth_points; ++i) {
        src0_vec = vld1q_s16(src0);
        src1_vec = vld1q_s16(src1);
        src2_vec = vld1q_s16(src2);
        src3_vec = vld1q_s16(src3);
        /* Wrapped differences for each input pair. */
        diff12 = vsubq_s16(src0_vec, src1_vec);
        diff34 = vsubq_s16(src2_vec, src3_vec);
        /* Complementary masks: comp0/comp2 select the first operand where
         * the difference is non-negative, comp1/comp3 the second elsewhere. */
        comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
        comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
        comp2 = (int16x8_t)vcgeq_s16(diff34, zeros);
        comp3 = (int16x8_t)vcltq_s16(diff34, zeros);
        comp0 = vandq_s16(src0_vec, comp0);
        comp1 = vandq_s16(src1_vec, comp1);
        comp2 = vandq_s16(src2_vec, comp2);
        comp3 = vandq_s16(src3_vec, comp3);

        /* The masked halves are disjoint, so addition merges each select. */
        result1_vec = vaddq_s16(comp0, comp1);
        result2_vec = vaddq_s16(comp2, comp3);

        /* Second stage: the same select between the two pairwise winners. */
        diff12 = vsubq_s16(result1_vec, result2_vec);
        comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
        comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
        comp0 = vandq_s16(result1_vec, comp0);
        comp1 = vandq_s16(result2_vec, comp1);
        result1_vec = vaddq_s16(comp0, comp1);
        vst1q_s16(target, result1_vec);
        src0 += 8;
        src1 += 8;
        src2 += 8;
        src3 += 8;
        target += 8;
    }

    /* Scalar tail for the remaining samples. */
    short temp0 = 0;
    short temp1 = 0;
    for (i = eighth_points * 8; i < num_points; ++i) {
        temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
        temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
        *target++ = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
        src0++;
        src1++;
        src2++;
        src3++;
    }
}
#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_GENERIC
static inline void volk_16i_x4_quad_max_star_16i_generic(short* target,
                                                         short* src0,
                                                         short* src1,
                                                         short* src2,
                                                         short* src3,
                                                         unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

    int bound = num_bytes >> 1; /* total number of 16-bit samples, i.e. num_points */

    short temp0 = 0;
    short temp1 = 0;
    for (i = 0; i < bound; ++i) {
        temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/
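
/*
 * Usage sketch (illustrative; the include path, buffer names, and sample
 * values are assumptions, not part of this header): a minimal program that
 * exercises the generic kernel on four 8-sample buffers. In a full VOLK
 * build the library's dispatcher would normally be called instead of a
 * specific implementation.
 *
 *   #define LV_HAVE_GENERIC
 *   #include "volk_16i_x4_quad_max_star_16i.h"
 *   #include <stdio.h>
 *
 *   int main(void)
 *   {
 *       short s0[8] = { 3, -5, 7, 0, 12, -1, 6, 2 };
 *       short s1[8] = { 1, -2, 9, 0, 10, -8, 6, 4 };
 *       short s2[8] = { 4, -7, 2, 5, 11, -3, 1, 8 };
 *       short s3[8] = { 2, -6, 8, 1, 13, -4, 0, 9 };
 *       short out[8];
 *
 *       volk_16i_x4_quad_max_star_16i_generic(out, s0, s1, s2, s3, 8);
 *
 *       for (int i = 0; i < 8; ++i)
 *           printf("%hd ", out[i]);  // prints: 4 -2 9 5 13 -1 6 9
 *       printf("\n");
 *       return 0;
 *   }
 */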