Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_16i_max_star_horizontal_16i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
5 *
6 * This file is part of VOLK
7 *
8 * SPDX-License-Identifier: LGPL-3.0-or-later
9 */
10
44
45#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H
46#define INCLUDED_volk_16i_max_star_horizontal_16i_a_H
47
48#include <volk/volk_common.h>
49
50#include <inttypes.h>
51#include <stdio.h>
52
53
54#ifdef LV_HAVE_SSSE3
55
56#include <emmintrin.h>
57#include <tmmintrin.h>
58#include <xmmintrin.h>
59
/*
 * Pairwise "max star" over 16-bit samples, SSSE3 variant for aligned buffers.
 *
 * For every adjacent pair (src0[2k], src0[2k+1]) the difference
 * src0[2k] - src0[2k+1] is formed with 16-bit wrap-around. target[k] receives
 * src0[2k+1] when that wrapped difference is negative and src0[2k] otherwise.
 *
 * target     - output buffer, one sample per input pair; written with aligned
 *              128-bit stores, so it must be 16-byte aligned.
 * src0       - input samples; read with aligned 128-bit loads, so it must be
 *              16-byte aligned.
 * num_points - number of input samples in src0.
 *
 * NOTE(review): when num_points is odd the scalar tail reads src0[num_points]
 * and writes one extra output, exactly like the other implementations of this
 * kernel. Callers presumably pass an even count - confirm before relying on it.
 */
static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target,
                                                            int16_t* src0,
                                                            unsigned int num_points)
{
    /* pshufb selectors: gather the first sample of each pair into the low
     * (shufmask0) or high (shufmask1) 8 bytes; 0xff selector bytes yield zero. */
    static const uint8_t shufmask0[16] = { 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d,
                                           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
    static const uint8_t shufmask1[16] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
                                           0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d };
    /* Added to a selector byte (+2) to pick the second sample of a pair instead. */
    static const uint8_t andmask0[16] = { 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
                                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
    static const uint8_t andmask1[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                                          0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 };

    /* The tables are byte arrays without a guaranteed 16-byte alignment, so
     * they must not be read with the aligned load. */
    const __m128i shuf_lo = _mm_loadu_si128((const __m128i*)shufmask0);
    const __m128i shuf_hi = _mm_loadu_si128((const __m128i*)shufmask1);
    const __m128i step_lo = _mm_loadu_si128((const __m128i*)andmask0);
    const __m128i step_hi = _mm_loadu_si128((const __m128i*)andmask1);
    const __m128i zero = _mm_setzero_si128();

    const __m128i* p_src0 = (const __m128i*)src0;
    __m128i* p_target = (__m128i*)target;

    /* Split the input into 16-sample blocks, at most one 8-sample block, and a
     * scalar tail of fewer than 8 samples. */
    const unsigned int bound = num_points >> 4;
    const unsigned int intermediate = (num_points >> 3) & 1;
    const unsigned int leftovers = num_points & 7;
    const unsigned int tail_start = (bound << 4) + (intermediate << 3);
    unsigned int i;

    for (i = 0; i < bound; ++i) {
        const __m128i in0 = _mm_load_si128(p_src0);
        const __m128i in1 = _mm_load_si128(p_src0 + 1);

        /* Lanes 0-3: pair differences of in0, lanes 4-7: those of in1. */
        const __m128i diff = _mm_hsub_epi16(in0, in1);
        /* All-ones in every lane whose pair difference is negative. */
        const __m128i is_neg = _mm_cmpgt_epi16(zero, diff);

        const __m128i sel0 = _mm_add_epi8(_mm_and_si128(is_neg, step_lo), shuf_lo);
        const __m128i sel1 = _mm_add_epi8(_mm_and_si128(is_neg, step_hi), shuf_hi);

        /* Each shuffle fills one half and zeroes the other, so the add merges them. */
        const __m128i picked =
            _mm_add_epi16(_mm_shuffle_epi8(in0, sel0), _mm_shuffle_epi8(in1, sel1));

        _mm_store_si128(p_target, picked);

        p_src0 += 2;
        p_target += 1;
    }

    if (intermediate) {
        const __m128i in0 = _mm_load_si128(p_src0);

        /* Only the low four lanes (differences of in0) are used below. */
        const __m128i diff = _mm_hsub_epi16(in0, in0);
        const __m128i is_neg = _mm_cmpgt_epi16(zero, diff);
        const __m128i sel0 = _mm_add_epi8(_mm_and_si128(is_neg, step_lo), shuf_lo);

        /* Four results live in the low 64 bits; store just those. */
        _mm_storel_epi64(p_target, _mm_shuffle_epi8(in0, sel0));
    }

    for (i = tail_start; i < tail_start + leftovers; i += 2) {
        target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
    }
}
158
159#endif /*LV_HAVE_SSSE3*/
160
161#ifdef LV_HAVE_NEON
162
163#include <arm_neon.h>
/*
 * Pairwise "max star" over 16-bit samples, NEON variant.
 *
 * For every adjacent pair (src0[2k], src0[2k+1]) the difference is taken with
 * 16-bit wrap-around; target[k] receives src0[2k] when that difference is
 * non-negative and src0[2k+1] otherwise.
 *
 * target     - output buffer, one sample per input pair.
 * src0       - input samples.
 * num_points - number of input samples in src0.
 *
 * NOTE(review): with an odd num_points the scalar tail reads src0[num_points];
 * callers presumably pass an even count - confirm.
 */
static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target,
                                                         int16_t* src0,
                                                         unsigned int num_points)
{
    /* Each vector iteration consumes 16 input samples and emits 8 outputs. */
    const unsigned int block_count = num_points / 16;
    const unsigned int tail_count = num_points % 16;
    const int16x8_t zero = vdupq_n_s16(0);
    unsigned int n;

    for (n = 0; n < block_count; ++n) {
        /* De-interleaving load: val[0] gets the first sample of every pair,
         * val[1] the second. */
        const int16x8x2_t pairs = vld2q_s16(src0);
        const int16x8_t delta = vsubq_s16(pairs.val[0], pairs.val[1]);
        /* All-ones lanes where first - second >= 0, i.e. keep the first sample. */
        const uint16x8_t keep_first = vcgeq_s16(delta, zero);

        vst1q_s16(target, vbslq_s16(keep_first, pairs.val[0], pairs.val[1]));

        src0 += 16;
        target += 8;
    }

    /* Scalar tail; src0 and target already point past the vectorised part. */
    for (n = 0; n < tail_count; n += 2) {
        const int16_t first = src0[n];
        const int16_t second = src0[n + 1];
        const int16_t wrapped = (int16_t)(first - second);

        if (wrapped > 0) {
            target[n >> 1] = first;
        } else {
            target[n >> 1] = second;
        }
    }
}
195#endif /* LV_HAVE_NEON */
196
197#ifdef LV_HAVE_NEONV7
198extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target,
199 int16_t* src0,
200 unsigned int num_points);
201#endif /* LV_HAVE_NEONV7 */
202
203#ifdef LV_HAVE_GENERIC
/*
 * Pairwise "max star" over 16-bit samples, portable reference implementation.
 *
 * For every adjacent pair (src0[2k], src0[2k+1]) the difference is narrowed
 * back to int16_t; target[k] receives src0[2k] when that narrowed difference
 * is strictly positive and src0[2k+1] otherwise.
 *
 * target     - output buffer, one sample per input pair.
 * src0       - input samples.
 * num_points - number of input samples in src0.
 *
 * NOTE(review): with an odd num_points the last iteration reads
 * src0[num_points]; callers presumably pass an even count - confirm.
 */
static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target,
                                                            int16_t* src0,
                                                            unsigned int num_points)
{
    /* Byte count of the input, halved again to get the sample count. */
    const unsigned int byte_count = num_points * 2;
    const int sample_limit = byte_count >> 1;
    int idx;

    for (idx = 0; idx < sample_limit; idx += 2) {
        const int16_t first = src0[idx];
        const int16_t second = src0[idx + 1];
        /* The subtraction happens in int; the cast folds it back to 16 bits. */
        const int16_t wrapped = (int16_t)(first - second);

        if (wrapped > 0) {
            target[idx >> 1] = first;
        } else {
            target[idx >> 1] = second;
        }
    }
}
218
219#endif /*LV_HAVE_GENERIC*/
220
221#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a_H*/
static void volk_16i_max_star_horizontal_16i_neon(int16_t *target, int16_t *src0, unsigned int num_points)
Definition volk_16i_max_star_horizontal_16i.h:164
static void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t *target, int16_t *src0, unsigned int num_points)
Definition volk_16i_max_star_horizontal_16i.h:60
static void volk_16i_max_star_horizontal_16i_generic(int16_t *target, int16_t *src0, unsigned int num_points)
Definition volk_16i_max_star_horizontal_16i.h:204
#define bit128_p(x)
Definition volk_common.h:147
for i
Definition volk_config_fixed.tmpl.h:13