Vector Optimized Library of Kernels 3.2.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_16i_x5_add_quad_16i_x4.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
50
51#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
52#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
53
54#include <inttypes.h>
55#include <stdio.h>
56
57#ifdef LV_HAVE_SSE2
58#include <emmintrin.h>
59#include <xmmintrin.h>
60
/*
 * SSE2 kernel: adds src0 element-wise to each of src1..src4.
 *
 * For every index i in [0, num_points):
 *   target0[i] = src0[i] + src1[i]
 *   target1[i] = src0[i] + src2[i]
 *   target2[i] = src0[i] + src3[i]
 *   target3[i] = src0[i] + src4[i]
 *
 * Points are processed eight at a time with 128-bit loads/stores
 * (_mm_load_si128 / _mm_store_si128), so all nine buffers must be
 * 16-byte aligned. The remaining (num_points % 8) points are handled by a
 * scalar loop.
 *
 * The loop bounds are derived directly from num_points with unsigned
 * counters; going through a byte count (num_points * 2) would wrap for
 * num_points > UINT_MAX / 2 and silently drop points.
 */
static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0,
                                                      short* target1,
                                                      short* target2,
                                                      short* target3,
                                                      short* src0,
                                                      short* src1,
                                                      short* src2,
                                                      short* src3,
                                                      short* src4,
                                                      unsigned int num_points)
{
    /* Number of full 8-lane (128-bit) vectors. */
    const unsigned int eighth_points = num_points / 8;
    unsigned int number;

    __m128i* p_target0 = (__m128i*)target0;
    __m128i* p_target1 = (__m128i*)target1;
    __m128i* p_target2 = (__m128i*)target2;
    __m128i* p_target3 = (__m128i*)target3;

    const __m128i* p_src0 = (const __m128i*)src0;
    const __m128i* p_src1 = (const __m128i*)src1;
    const __m128i* p_src2 = (const __m128i*)src2;
    const __m128i* p_src3 = (const __m128i*)src3;
    const __m128i* p_src4 = (const __m128i*)src4;

    for (number = 0; number < eighth_points; ++number) {
        /* All five inputs are loaded before any output is stored. */
        const __m128i base = _mm_load_si128(p_src0 + number);
        const __m128i in1 = _mm_load_si128(p_src1 + number);
        const __m128i in2 = _mm_load_si128(p_src2 + number);
        const __m128i in3 = _mm_load_si128(p_src3 + number);
        const __m128i in4 = _mm_load_si128(p_src4 + number);

        _mm_store_si128(p_target0 + number, _mm_add_epi16(base, in1));
        _mm_store_si128(p_target1 + number, _mm_add_epi16(base, in2));
        _mm_store_si128(p_target2 + number, _mm_add_epi16(base, in3));
        _mm_store_si128(p_target3 + number, _mm_add_epi16(base, in4));
    }

    /* Scalar tail for the points that do not fill a whole vector. */
    for (number = eighth_points * 8; number < num_points; ++number) {
        target0[number] = src0[number] + src1[number];
        target1[number] = src0[number] + src2[number];
        target2[number] = src0[number] + src3[number];
        target3[number] = src0[number] + src4[number];
    }
}
131#endif /*LV_HAVE_SSE2*/
132
133#ifdef LV_HAVE_NEON
134#include <arm_neon.h>
135
/*
 * NEON kernel: adds src0 element-wise to each of src1..src4.
 *
 * For every index i in [0, num_points):
 *   target0[i] = src0[i] + src1[i]
 *   target1[i] = src0[i] + src2[i]
 *   target2[i] = src0[i] + src3[i]
 *   target3[i] = src0[i] + src4[i]
 *
 * Eight points are handled per iteration with vld1q_s16 / vaddq_s16 /
 * vst1q_s16; the remaining (num_points % 8) points use a scalar loop.
 */
static inline void volk_16i_x5_add_quad_16i_x4_neon(short* target0,
                                                    short* target1,
                                                    short* target2,
                                                    short* target3,
                                                    short* src0,
                                                    short* src1,
                                                    short* src2,
                                                    short* src3,
                                                    short* src4,
                                                    unsigned int num_points)
{
    /* Count of points covered by whole 8-lane vectors. */
    const unsigned int vector_points = (num_points / 8) * 8;
    unsigned int idx;

    for (idx = 0; idx < vector_points; idx += 8) {
        /* Load every input first, then add, then store. */
        const int16x8_t base = vld1q_s16(src0 + idx);
        const int16x8_t in1 = vld1q_s16(src1 + idx);
        const int16x8_t in2 = vld1q_s16(src2 + idx);
        const int16x8_t in3 = vld1q_s16(src3 + idx);
        const int16x8_t in4 = vld1q_s16(src4 + idx);

        const int16x8_t sum0 = vaddq_s16(base, in1);
        const int16x8_t sum1 = vaddq_s16(base, in2);
        const int16x8_t sum2 = vaddq_s16(base, in3);
        const int16x8_t sum3 = vaddq_s16(base, in4);

        vst1q_s16(target0 + idx, sum0);
        vst1q_s16(target1 + idx, sum1);
        vst1q_s16(target2 + idx, sum2);
        vst1q_s16(target3 + idx, sum3);
    }

    /* Scalar tail: src0 is re-read for each output, one output at a time. */
    for (idx = vector_points; idx < num_points; ++idx) {
        target0[idx] = src0[idx] + src1[idx];
        target1[idx] = src0[idx] + src2[idx];
        target2[idx] = src0[idx] + src3[idx];
        target3[idx] = src0[idx] + src4[idx];
    }
}
186
187#endif /* LV_HAVE_NEON */
188
189#ifdef LV_HAVE_GENERIC
190
/*
 * Portable kernel: adds src0 element-wise to each of src1..src4.
 *
 * For every index i in [0, num_points):
 *   target0[i] = src0[i] + src1[i]
 *   target1[i] = src0[i] + src2[i]
 *   target2[i] = src0[i] + src3[i]
 *   target3[i] = src0[i] + src4[i]
 *
 * The loop runs directly over num_points with an unsigned index. Deriving
 * the bound from a byte count (num_points * 2, shifted back down) would
 * wrap for num_points > UINT_MAX / 2 and silently drop points.
 */
static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0,
                                                       short* target1,
                                                       short* target2,
                                                       short* target3,
                                                       short* src0,
                                                       short* src1,
                                                       short* src2,
                                                       short* src3,
                                                       short* src4,
                                                       unsigned int num_points)
{
    unsigned int number;

    for (number = 0; number < num_points; ++number) {
        target0[number] = src0[number] + src1[number];
        target1[number] = src0[number] + src2[number];
        target2[number] = src0[number] + src3[number];
        target3[number] = src0[number] + src4[number];
    }
}
215
216#endif /* LV_HAVE_GENERIC */
217
218#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H*/
static void volk_16i_x5_add_quad_16i_x4_a_sse2(short *target0, short *target1, short *target2, short *target3, short *src0, short *src1, short *src2, short *src3, short *src4, unsigned int num_points)
Definition volk_16i_x5_add_quad_16i_x4.h:61
static void volk_16i_x5_add_quad_16i_x4_neon(short *target0, short *target1, short *target2, short *target3, short *src0, short *src1, short *src2, short *src3, short *src4, unsigned int num_points)
Definition volk_16i_x5_add_quad_16i_x4.h:136
static void volk_16i_x5_add_quad_16i_x4_generic(short *target0, short *target1, short *target2, short *target3, short *src0, short *src1, short *src2, short *src3, short *src4, unsigned int num_points)
Definition volk_16i_x5_add_quad_16i_x4.h:191
for i
Definition volk_config_fixed.tmpl.h:13