#ifndef CRYPTOPP_ARM_SIMD_H
#define CRYPTOPP_ARM_SIMD_H

#include "config.h"
#if (CRYPTOPP_ARM_NEON_HEADER)
# include <stdint.h>
# include <arm_neon.h>
#endif
#if (CRYPTOPP_ARM_ACLE_HEADER)
# include <stdint.h>
# include <arm_acle.h>
#endif
#if (CRYPTOPP_ARM_CRC32_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
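
/// \brief CRC32 checksum
/// \param crc the starting crc value
/// \param val the 8-bit value to checksum
/// \return the updated crc value
/// \details CRC32B folds one byte into the checksum using the
///  Armv8 <tt>crc32b</tt> instruction.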
inline uint32_t CRC32B (uint32_t crc, uint8_t val)
{
#if defined(_MSC_VER)
    return __crc32b(crc, val);
#else
    __asm__ ("crc32b   %w0, %w0, %w1   \n\t"
            :"+r" (crc) : "r" (val) );
    return crc;
#endif
}
inline uint32_t CRC32W (uint32_t crc, uint32_t val)
{
#if defined(_MSC_VER)
    return __crc32w(crc, val);
#else
    __asm__ ("crc32w   %w0, %w0, %w1   \n\t"
            :"+r" (crc) : "r" (val) );
    return crc;
#endif
}
inline uint32_t CRC32Wx4 (uint32_t crc, const uint32_t vals[4])
{
#if defined(_MSC_VER)
    return __crc32w(__crc32w(__crc32w(__crc32w(
             crc, vals[0]), vals[1]), vals[2]), vals[3]);
#else
    __asm__ ("crc32w   %w0, %w0, %w1   \n\t"
             "crc32w   %w0, %w0, %w2   \n\t"
             "crc32w   %w0, %w0, %w3   \n\t"
             "crc32w   %w0, %w0, %w4   \n\t"
            :"+r" (crc) : "r" (vals[0]), "r" (vals[1]),
                          "r" (vals[2]), "r" (vals[3]));
    return crc;
#endif
}
inline uint32_t CRC32CB (uint32_t crc, uint8_t val)
{
#if defined(_MSC_VER)
    return __crc32cb(crc, val);
#else
    __asm__ ("crc32cb   %w0, %w0, %w1   \n\t"
            :"+r" (crc) : "r" (val) );
    return crc;
#endif
}
inline uint32_t CRC32CW (uint32_t crc, uint32_t val)
{
#if defined(_MSC_VER)
    return __crc32cw(crc, val);
#else
    __asm__ ("crc32cw   %w0, %w0, %w1   \n\t"
            :"+r" (crc) : "r" (val) );
    return crc;
#endif
}
inline uint32_t CRC32CWx4 (uint32_t crc, const uint32_t vals[4])
{
#if defined(_MSC_VER)
    return __crc32cw(__crc32cw(__crc32cw(__crc32cw(
             crc, vals[0]), vals[1]), vals[2]), vals[3]);
#else
    __asm__ ("crc32cw   %w0, %w0, %w1   \n\t"
             "crc32cw   %w0, %w0, %w2   \n\t"
             "crc32cw   %w0, %w0, %w3   \n\t"
             "crc32cw   %w0, %w0, %w4   \n\t"
            :"+r" (crc) : "r" (vals[0]), "r" (vals[1]),
                          "r" (vals[2]), "r" (vals[3]));
    return crc;
#endif
}
#endif  // CRYPTOPP_ARM_CRC32_AVAILABLE
#if (CRYPTOPP_ARM_PMULL_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
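
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_00 performs carryless multiplication of the low 64 bits
///  of a and the low 64 bits of b, analogous to Intel's
///  <tt>_mm_clmulepi64_si128</tt> with the low qwords selected.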
inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull    %0.1q, %1.1d, %2.1d \n\t"
            :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0)));
#endif
}
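
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_01 performs carryless multiplication of the low 64 bits
///  of a and the high 64 bits of b.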
inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull    %0.1q, %1.1d, %2.1d \n\t"
            :"=w" (r) : "w" (a), "w" (vget_high_u64(b)) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 1)));
#endif
}
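
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_10 performs carryless multiplication of the high 64 bits
///  of a and the low 64 bits of b.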
inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull    %0.1q, %1.1d, %2.1d \n\t"
            :"=w" (r) : "w" (vget_high_u64(a)), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 0)));
#endif
}
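
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_11 performs carryless multiplication of the high 64 bits
///  of a and the high 64 bits of b using a single <tt>pmull2</tt>.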
inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull2   %0.1q, %1.2d, %2.2d \n\t"
            :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)));
#endif
}
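
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL is a convenience function equivalent to PMULL_00: it
///  multiplies the low 64 bits of a and b.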
inline uint64x2_t PMULL(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull    %0.1q, %1.1d, %2.1d \n\t"
            :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0)));
#endif
}
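
/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_HIGH is a convenience function equivalent to PMULL_11: it
///  multiplies the high 64 bits of a and b.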
inline uint64x2_t PMULL_HIGH(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull2   %0.1q, %1.2d, %2.2d \n\t"
            :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)));
#endif
}
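
/// \brief Vector extraction
/// \param a the first vector
/// \param b the second vector
/// \param c the byte count
/// \return vector
/// \details VEXT_U8 returns bytes c..15 of a followed by bytes 0..c-1
///  of b. c must resolve to a compile-time constant because the
///  <tt>ext</tt> instruction takes an immediate byte index.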
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
{
#if defined(_MSC_VER)
    return vreinterpretq_u64_u8(vextq_u8(
        vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), c));
#else
    uint64x2_t r;
    __asm__ ("ext   %0.16b, %1.16b, %2.16b, %3   \n\t"
            :"=w" (r) : "w" (a), "w" (b), "I" (c) );
    return r;
#endif
}
template <unsigned int C>
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
{
#if defined(_MSC_VER)
    return vreinterpretq_u64_u8(vextq_u8(
        vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C));
#else
    uint64x2_t r;
    __asm__ ("ext   %0.16b, %1.16b, %2.16b, %3   \n\t"
            :"=w" (r) : "w" (a), "w" (b), "I" (C) );
    return r;
#endif
}
#endif  // CRYPTOPP_ARM_PMULL_AVAILABLE
#if CRYPTOPP_ARM_SHA3_AVAILABLE || defined(CRYPTOPP_DOXYGEN_PROCESSING)
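
/// \brief Three-way XOR
/// \param a the first value
/// \param b the second value
/// \param c the third value
/// \return <tt>a ^ b ^ c</tt>
/// \details VEOR3 XORs three vectors with a single Armv8.2 <tt>eor3</tt>
///  instruction from the SHA-3 extensions.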
inline uint64x2_t VEOR3(uint64x2_t a, uint64x2_t b, uint64x2_t c)
{
#if defined(_MSC_VER)
    return veor3q_u64(a, b, c);
#else
    uint64x2_t r;
    __asm__ ("eor3   %0.16b, %1.16b, %2.16b, %3.16b   \n\t"
            :"=w" (r) : "w" (a), "w" (b), "w" (c));
    return r;
#endif
}
inline uint64x2_t VXAR(uint64x2_t a, uint64x2_t b, const int imm6)
{
#if defined(_MSC_VER)
    return vxarq_u64(a, b, imm6);
#else
    uint64x2_t r;
    __asm__ ("xar   %0.2d, %1.2d, %2.2d, %3   \n\t"
            :"=w" (r) : "w" (a), "w" (b), "I" (imm6));
    return r;
#endif
}
template <unsigned int C>
inline uint64x2_t VXAR(uint64x2_t a, uint64x2_t b)
{
#if defined(_MSC_VER)
    return vxarq_u64(a, b, C);
#else
    uint64x2_t r;
    __asm__ ("xar   %0.2d, %1.2d, %2.2d, %3   \n\t"
            :"=w" (r) : "w" (a), "w" (b), "I" (C));
    return r;
#endif
}
inline uint64x2_t VRAX1(uint64x2_t a, uint64x2_t b)
{
#if defined(_MSC_VER)
    return vrax1q_u64(a, b);
#else
    uint64x2_t r;
    __asm__ ("rax1   %0.2d, %1.2d, %2.2d   \n\t"
            :"=w" (r) : "w" (a), "w" (b));
    return r;
#endif
}
#endif  // CRYPTOPP_ARM_SHA3_AVAILABLE

#endif  // CRYPTOPP_ARM_SIMD_H