sse2neon.h

1 #ifndef SSE2NEON_H
2 #define SSE2NEON_H
3 
4 // This header file provides a simple API translation layer
5 // from SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
6 //
7 // This header file does not yet translate all of the SSE intrinsics.
8 //
9 // Contributors to this work are:
10 // John W. Ratcliff <jratcliffscarab@gmail.com>
11 // Brandon Rowlett <browlett@nvidia.com>
12 // Ken Fast <kfast@gdeb.com>
13 // Eric van Beurden <evanbeurden@nvidia.com>
14 // Alexander Potylitsin <apotylitsin@nvidia.com>
15 // Hasindu Gamaarachchi <hasindu2008@gmail.com>
16 // Jim Huang <jserv@biilabs.io>
17 // Mark Cheng <marktwtn@biilabs.io>
18 // Malcolm James MacLeod <malcolm@gulden.com>
19 // Devin Hussey (easyaspi314) <husseydevin@gmail.com>
20 // Sebastian Pop <spop@amazon.com>
21 // Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
22 // Danila Kutenin <danilak@google.com>
23 // François Turban (JishinMaster) <francois.turban@gmail.com>
24 // Pei-Hsuan Hung <afcidk@gmail.com>
25 // Yang-Hao Yuan <yanghau@biilabs.io>
26 // Syoyo Fujita <syoyo@lighttransport.com>
27 // Brecht Van Lommel <brecht@blender.org>
28 
29 /*
30  * sse2neon is freely redistributable under the MIT License.
31  *
32  * Permission is hereby granted, free of charge, to any person obtaining a copy
33  * of this software and associated documentation files (the "Software"), to deal
34  * in the Software without restriction, including without limitation the rights
35  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
36  * copies of the Software, and to permit persons to whom the Software is
37  * furnished to do so, subject to the following conditions:
38  *
39  * The above copyright notice and this permission notice shall be included in
40  * all copies or substantial portions of the Software.
41  *
42  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
43  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
44  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
45  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
47  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
48  * SOFTWARE.
49  */
50 
51 /* Tunable configurations */
52 
53 /* Enable precise implementation of math operations
54  * This would slow down the computation a bit, but gives consistent result with
55  * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result)
56  */
57 /* _mm_min|max_ps|ss|pd|sd */
58 #ifndef SSE2NEON_PRECISE_MINMAX
59 #define SSE2NEON_PRECISE_MINMAX (0)
60 #endif
61 /* _mm_rcp_ps and _mm_div_ps */
62 #ifndef SSE2NEON_PRECISE_DIV
63 #define SSE2NEON_PRECISE_DIV (0)
64 #endif
65 /* _mm_sqrt_ps and _mm_rsqrt_ps */
66 #ifndef SSE2NEON_PRECISE_SQRT
67 #define SSE2NEON_PRECISE_SQRT (0)
68 #endif
69 /* _mm_dp_pd */
70 #ifndef SSE2NEON_PRECISE_DP
71 #define SSE2NEON_PRECISE_DP (0)
72 #endif
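// Usage note (an illustrative sketch added for this documentation page, not
// part of the upstream header): because each option above is wrapped in an
// #ifndef guard, a precise code path can be enabled by defining the matching
// macro to 1 before this header is included, e.g.
//
//     #define SSE2NEON_PRECISE_MINMAX 1
//     #define SSE2NEON_PRECISE_DIV 1
//     #include "sse2neon.h"
//
// or by passing -DSSE2NEON_PRECISE_MINMAX=1 on the compiler command line.
// Leaving the macros at 0 keeps the faster, less strict NEON approximations.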
73 
74 /* compiler specific definitions */
75 #if defined(__GNUC__) || defined(__clang__)
76 #pragma push_macro("FORCE_INLINE")
77 #pragma push_macro("ALIGN_STRUCT")
78 #define FORCE_INLINE static inline __attribute__((always_inline))
79 #define ALIGN_STRUCT(x) __attribute__((aligned(x)))
80 #define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
81 #define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
82 #else /* non-GNU / non-clang compilers */
83 #warning "Macro name collisions may happen with unsupported compiler."
84 #ifndef FORCE_INLINE
85 #define FORCE_INLINE static inline
86 #endif
87 #ifndef ALIGN_STRUCT
88 #define ALIGN_STRUCT(x) __declspec(align(x))
89 #endif
90 #define _sse2neon_likely(x) (x)
91 #define _sse2neon_unlikely(x) (x)
92 #endif
93 
94 #include <stdint.h>
95 #include <stdlib.h>
96 
97 /* Architecture-specific build options */
98 /* FIXME: #pragma GCC push_options is only available on GCC */
99 #if defined(__GNUC__)
100 #if defined(__arm__) && __ARM_ARCH == 7
101 /* According to ARM C Language Extensions Architecture specification,
102  * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
103  * architecture supported.
104  */
105 #if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
106 #error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
107 #endif
108 #if !defined(__clang__)
109 #pragma GCC push_options
110 #pragma GCC target("fpu=neon")
111 #endif
112 #elif defined(__aarch64__)
113 #if !defined(__clang__)
114 #pragma GCC push_options
115 #pragma GCC target("+simd")
116 #endif
117 #else
118 #error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
119 #endif
120 #endif
121 
122 #include <arm_neon.h>
123 
124 /* Rounding functions require either Aarch64 instructions or a libm fallback */
125 #if !defined(__aarch64__)
126 #include <math.h>
127 #endif
128 
129 /* "__has_builtin" can be used to query support for built-in functions
130  * provided by gcc/clang and other compilers that support it.
131  */
132 #ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
133 /* Compatibility with gcc <= 9 */
134 #if defined(__GNUC__) && (__GNUC__ <= 9)
135 #define __has_builtin(x) HAS##x
136 #define HAS__builtin_popcount 1
137 #define HAS__builtin_popcountll 1
138 #else
139 #define __has_builtin(x) 0
140 #endif
141 #endif
142 
143 /**
144  * MACRO for shuffle parameter for _mm_shuffle_ps().
145  * Argument fp3 is a digit[0123] that represents the fp from argument "b"
146  * of _mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
147  * for fp2 in result. fp1 is a digit[0123] that represents the fp from
148  * argument "a" of _mm_shuffle_ps that will be placed in fp1 of result.
149  * fp0 is the same for fp0 of result.
150  */
151 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
152  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
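// Illustrative example (not part of the upstream header): _MM_SHUFFLE packs
// four 2-bit lane selectors into a single immediate. _MM_SHUFFLE(3, 2, 1, 0)
// evaluates to 0xE4; used with the _mm_shuffle_ps translation provided later
// in this header, it selects a[0], a[1] for the low half of the result and
// b[2], b[3] for the high half:
//
//     __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
//     // r = { a[0], a[1], b[2], b[3] }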
153 
154 /* Rounding mode macros. */
155 #define _MM_FROUND_TO_NEAREST_INT 0x00
156 #define _MM_FROUND_TO_NEG_INF 0x01
157 #define _MM_FROUND_TO_POS_INF 0x02
158 #define _MM_FROUND_TO_ZERO 0x03
159 #define _MM_FROUND_CUR_DIRECTION 0x04
160 #define _MM_FROUND_NO_EXC 0x08
161 #define _MM_FROUND_RAISE_EXC 0x00
162 #define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
163 #define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
164 #define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
165 #define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
166 #define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
167 #define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
168 #define _MM_ROUND_NEAREST 0x0000
169 #define _MM_ROUND_DOWN 0x2000
170 #define _MM_ROUND_UP 0x4000
171 #define _MM_ROUND_TOWARD_ZERO 0x6000
172 /* Flush zero mode macros. */
173 #define _MM_FLUSH_ZERO_MASK 0x8000
174 #define _MM_FLUSH_ZERO_ON 0x8000
175 #define _MM_FLUSH_ZERO_OFF 0x0000
176 /* Denormals are zeros mode macros. */
177 #define _MM_DENORMALS_ZERO_MASK 0x0040
178 #define _MM_DENORMALS_ZERO_ON 0x0040
179 #define _MM_DENORMALS_ZERO_OFF 0x0000
180 
181 /* indicate immediate constant argument in a given range */
182 #define __constrange(a, b) const
183 
184 /* A few intrinsics accept traditional data types like ints or floats, but
185  * most operate on data types that are specific to SSE.
186  * If a vector type ends in d, it contains doubles, and if it does not have
187  * a suffix, it contains floats. An integer vector type can contain any type
188  * of integer, from chars to shorts to unsigned long longs.
189  */
190 typedef int64x1_t __m64;
191 typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
192 // On ARM 32-bit architecture, the float64x2_t is not supported.
193 // The data type __m128d should be represented in a different way for related
194 // intrinsic conversion.
195 #if defined(__aarch64__)
196 typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
197 #else
198 typedef float32x4_t __m128d;
199 #endif
200 typedef int64x2_t __m128i; /* 128-bit vector containing integers */
201 
202 // __int64 is defined in the Intrinsics Guide, where it maps to a different
203 // data type depending on the data model
204 #if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
205 #if (defined(__x86_64__) || defined(__i386__))
206 #define __int64 long long
207 #else
208 #define __int64 int64_t
209 #endif
210 #endif
211 
212 /* type-safe casting between types */
213 
214 #define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
215 #define vreinterpretq_m128_f32(x) (x)
216 #define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
217 
218 #define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
219 #define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
220 #define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
221 #define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
222 
223 #define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
224 #define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
225 #define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
226 #define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
227 
228 #define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
229 #define vreinterpretq_f32_m128(x) (x)
230 #define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
231 
232 #define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
233 #define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
234 #define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
235 #define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
236 
237 #define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
238 #define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
239 #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
240 #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
241 
242 #define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
243 #define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
244 #define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
245 #define vreinterpretq_m128i_s64(x) (x)
246 
247 #define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
248 #define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
249 #define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
250 #define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
251 
252 #define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
253 #define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
254 
255 #define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
256 #define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
257 #define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
258 #define vreinterpretq_s64_m128i(x) (x)
259 
260 #define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
261 #define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
262 #define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
263 #define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
264 
265 #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
266 #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
267 #define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
268 #define vreinterpret_m64_s64(x) (x)
269 
270 #define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
271 #define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
272 #define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
273 #define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
274 
275 #define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
276 #define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
277 #define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
278 
279 #define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
280 #define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
281 #define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
282 #define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
283 
284 #define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
285 #define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
286 #define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
287 #define vreinterpret_s64_m64(x) (x)
288 
289 #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
290 
291 #if defined(__aarch64__)
292 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
293 #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
294 
295 #define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
296 
297 #define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
298 #define vreinterpretq_m128d_f64(x) (x)
299 
300 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
301 
302 #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
303 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
304 
305 #define vreinterpretq_f64_m128d(x) (x)
306 #define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
307 #else
308 #define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
309 #define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
310 
311 #define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
312 #define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
313 
314 #define vreinterpretq_m128d_f32(x) (x)
315 
316 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
317 
318 #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
319 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
320 
321 #define vreinterpretq_f32_m128d(x) (x)
322 #endif
323 
324 // This header defines a union called 'SIMDVec' which can be used by
325 // applications that attempt to access the contents of an __m128 struct
326 // directly. Note that accessing the __m128 struct directly is considered
327 // bad coding practice by Microsoft: @see:
328 // https://docs.microsoft.com/en-us/cpp/cpp/m128
329 //
330 // However, some legacy source code may try to access the contents of an __m128
331 // struct directly so the developer can use the SIMDVec as an alias for it. Any
332 // casting must be done manually by the developer, as you cannot cast or
333 // otherwise alias the base NEON data type for intrinsic operations.
334 //
335 // This union is intended to allow direct access to an __m128 variable using the
336 // names that the MSVC compiler provides. It should really only be used when
337 // trying to access the members of the vector as integer values. GCC/clang
338 // allow native access to the float members through a simple array access
339 // operator (in C since 4.6, in C++ since 4.8).
340 //
341 // Ideally, direct accesses to SIMD vectors should be avoided since they can
342 // cause a performance hit. If such access really is needed, the original __m128
343 // variable can be aliased with a pointer to this union and used to access
344 // individual components. The use of this union should be hidden behind a macro
345 // that is used throughout the codebase to access the members instead of always
346 // declaring this type of variable.
347 typedef union ALIGN_STRUCT(16) SIMDVec {
348  float m128_f32[4]; // as floats - DON'T USE. Added for convenience.
349  int8_t m128_i8[16]; // as signed 8-bit integers.
350  int16_t m128_i16[8]; // as signed 16-bit integers.
351  int32_t m128_i32[4]; // as signed 32-bit integers.
352  int64_t m128_i64[2]; // as signed 64-bit integers.
353  uint8_t m128_u8[16]; // as unsigned 8-bit integers.
354  uint16_t m128_u16[8]; // as unsigned 16-bit integers.
355  uint32_t m128_u32[4]; // as unsigned 32-bit integers.
356  uint64_t m128_u64[2]; // as unsigned 64-bit integers.
357 } SIMDVec;
358 
359 // casting using SIMDVec
360 #define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
361 #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
362 #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
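// Illustrative example (not part of the upstream header): aliasing an __m128i
// through the macros above, as described in the SIMDVec comment. The names v
// and lane0 are hypothetical.
//
//     __m128i v = _mm_set_epi32(4, 3, 2, 1);
//     uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0); /* lane0 == 1 */
//
// Prefer the regular extract/store intrinsics in new code; the union exists
// only as a compatibility aid for legacy sources.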
363 
364 /* SSE macros */
365 #define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
366 #define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
367 #define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
368 #define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
369 
370 // Function declaration
371 // SSE
372 FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
377 // SSE2
384 FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
386 FORCE_INLINE __m128d _mm_set_pd(double, double);
389 // SSE4.1
396 // SSE4.2
398 
399 /* Backwards compatibility for compilers with lack of specific type support */
400 
401 // Older gcc does not define vld1q_u8_x4 type
402 #if defined(__GNUC__) && !defined(__clang__) && \
403  ((__GNUC__ <= 10 && defined(__arm__)) || \
404  (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
405  (__GNUC__ <= 9 && defined(__aarch64__)))
406 FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
407 {
408  uint8x16x4_t ret;
409  ret.val[0] = vld1q_u8(p + 0);
410  ret.val[1] = vld1q_u8(p + 16);
411  ret.val[2] = vld1q_u8(p + 32);
412  ret.val[3] = vld1q_u8(p + 48);
413  return ret;
414 }
415 #else
416 // Wraps vld1q_u8_x4
417 FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
418 {
419  return vld1q_u8_x4(p);
420 }
421 #endif
422 
423 /* Function Naming Conventions
424  * The naming convention of SSE intrinsics is straightforward. A generic SSE
425  * intrinsic function is given as follows:
426  * _mm_<name>_<data_type>
427  *
428  * The parts of this format are given as follows:
429  * 1. <name> describes the operation performed by the intrinsic
430  * 2. <data_type> identifies the data type of the function's primary arguments
431  *
432  * This last part, <data_type>, is a little complicated. It identifies the
433  * content of the input values, and can be set to any of the following values:
434  * + ps - vectors contain floats (ps stands for packed single-precision)
435  * + pd - vectors contain doubles (pd stands for packed double-precision)
436  * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
437  * signed integers
438  * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
439  * unsigned integers
440  * + si128 - unspecified 128-bit vector or 256-bit vector
441  * + m128/m128i/m128d - identifies input vector types when they are different
442  * than the type of the returned vector
443  *
444  * For example, _mm_setzero_ps. The _mm implies that the function returns
445  * a 128-bit vector. The _ps at the end implies that the argument vectors
446  * contain floats.
447  *
448  * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
449  * // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
450  * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
451  * // Set packed 8-bit integers
452  * // 128 bits, 16 chars, per 8 bits
453  * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
454  * 4, 5, 12, 13, 6, 7, 14, 15);
455  * // Shuffle packed 8-bit integers
456  * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
457  *
458  * Data (Number, Binary, Byte Index):
459  +------+------+------+------+------+------+------+------+
460  |      1      |      2      |      3      |      4      | Number
461  +------+------+------+------+------+------+------+------+
462  | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
463  +------+------+------+------+------+------+------+------+
464  |   0  |   1  |   2  |   3  |   4  |   5  |   6  |   7  | Index
465  +------+------+------+------+------+------+------+------+
466 
467  +------+------+------+------+------+------+------+------+
468  |      5      |      6      |      7      |      8      | Number
469  +------+------+------+------+------+------+------+------+
470  | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
471  +------+------+------+------+------+------+------+------+
472  |   8  |   9  |  10  |  11  |  12  |  13  |  14  |  15  | Index
473  +------+------+------+------+------+------+------+------+
474  * Index (Byte Index):
475  +------+------+------+------+------+------+------+------+
476  |   1  |   0  |   2  |   3  |   8  |   9  |  10  |  11  |
477  +------+------+------+------+------+------+------+------+
478 
479  +------+------+------+------+------+------+------+------+
480  |   4  |   5  |  12  |  13  |   6  |   7  |  14  |  15  |
481  +------+------+------+------+------+------+------+------+
482  * Result:
483  +------+------+------+------+------+------+------+------+
484  |   1  |   0  |   2  |   3  |   8  |   9  |  10  |  11  | Index
485  +------+------+------+------+------+------+------+------+
486  | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
487  +------+------+------+------+------+------+------+------+
488  |     256     |      2      |      5      |      6      | Number
489  +------+------+------+------+------+------+------+------+
490 
491  +------+------+------+------+------+------+------+------+
492  |   4  |   5  |  12  |  13  |   6  |   7  |  14  |  15  | Index
493  +------+------+------+------+------+------+------+------+
494  | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
495  +------+------+------+------+------+------+------+------+
496  |      3      |      7      |      4      |      8      | Number
497  +------+------+------+------+------+------+------+------+
498  */
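// Compilable form of the byte-shuffle walkthrough above (an illustrative
// sketch, not part of the upstream header); it assumes the _mm_setr_epi16,
// _mm_setr_epi8 and _mm_shuffle_epi8 translations provided later in this
// header:
//
//     __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
//     __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
//                                    4, 5, 12, 13, 6, 7, 14, 15);
//     __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
//     // v_out, read as 16-bit lanes: { 256, 2, 5, 6, 3, 7, 4, 8 }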
499 
500 /* Constants for use with _mm_prefetch. */
501 enum _mm_hint {
502  _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
503  _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */
504  _MM_HINT_T1 = 2, /* load data to L2 cache only */
505  _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */
506  _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
507  _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */
508  _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */
509  _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */
510 };
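// Illustrative usage (not part of the upstream header), assuming the
// _mm_prefetch translation provided later in this header:
//
//     _mm_prefetch((const char *) ptr, _MM_HINT_T0); /* ptr is hypothetical */
//
// On Arm this is typically lowered to a PLD/PRFM prefetch hint; it does not
// fault and has no effect other than warming the cache.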
511 
512 // The bit field mapping to the FPCR(floating-point control register)
513 typedef struct {
520 #if defined(__aarch64__)
521  uint32_t res3;
522 #endif
523 } fpcr_bitfield;
524 
525 // Takes the upper 64 bits of a and places it in the low end of the result
526 // Takes the lower 64 bits of b and places it into the high end of the result.
528 {
529  float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
530  float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
531  return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
532 }
533 
534 // Takes the lower two 32-bit values from a, swaps them, and places them in
535 // the low end of the result; takes the upper two 32-bit values from b, swaps
536 // them, and places them in the high end of the result.
538 {
539  float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
540  float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
541  return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
542 }
543 
545 {
546  float32x2_t a21 = vget_high_f32(
548  float32x2_t b03 = vget_low_f32(
550  return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
551 }
552 
554 {
555  float32x2_t a03 = vget_low_f32(
557  float32x2_t b21 = vget_high_f32(
559  return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
560 }
561 
563 {
564  float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
565  float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
566  return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
567 }
568 
570 {
571  float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
572  float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
573  return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
574 }
575 
577 {
578  float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
579  float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
580  return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
581 }
582 
583 // Keeps the low 64 bits of a in the low half of the result and puts the high
584 // 64 bits of b in the high half.
586 {
587  float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
588  float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
589  return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
590 }
591 
593 {
594  float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
595  float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
596  return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
597 }
598 
600 {
601  float32x2_t a22 =
602  vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
603  float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
604  return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
605 }
606 
608 {
609  float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
610  float32x2_t b22 =
611  vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
612  return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
613 }
614 
616 {
617  float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
618  float32x2_t a22 =
619  vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
620  float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
621  float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
622  return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
623 }
624 
626 {
627  float32x2_t a33 =
628  vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
629  float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
630  return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
631 }
632 
634 {
635  float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
636  float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
637  float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
638  float32x2_t b20 = vset_lane_f32(b2, b00, 1);
639  return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
640 }
641 
643 {
644  float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
645  float32_t b2 = vgetq_lane_f32(b, 2);
646  float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
647  float32x2_t b20 = vset_lane_f32(b2, b00, 1);
648  return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
649 }
650 
652 {
653  float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
654  float32_t b2 = vgetq_lane_f32(b, 2);
655  float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
656  float32x2_t b20 = vset_lane_f32(b2, b00, 1);
657  return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
658 }
659 
660 // Kahan summation for accurate summation of floating-point numbers.
661 // http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
662 FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
663 {
664  y -= *c;
665  float t = *sum + y;
666  *c = (t - *sum) - y;
667  *sum = t;
668 }
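// Illustrative usage (not part of the upstream header): accumulating an array
// with the compensated-summation helper above. The names data, n, sum and c
// are hypothetical.
//
//     float sum = 0.0f, c = 0.0f;
//     for (size_t i = 0; i < n; i++)
//         _sse2neon_kadd_f32(&sum, &c, data[i]);
//     /* sum holds the compensated total; c holds the running error term */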
669 
670 #if defined(__ARM_FEATURE_CRYPTO)
671 // Wraps vmull_p64
672 FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
673 {
674  poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
675  poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
676  return vreinterpretq_u64_p128(vmull_p64(a, b));
677 }
678 #else // ARMv7 polyfill
679 // ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
680 //
681 // vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
682 // 64-bit->128-bit polynomial multiply.
683 //
684 // It needs some work and is somewhat slow, but it is still faster than all
685 // known scalar methods.
686 //
687 // Algorithm adapted to C from
688 // https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
689 // from "Fast Software Polynomial Multiplication on ARM Processors Using the
690 // NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
691 // (https://hal.inria.fr/hal-01506572)
692 static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
693 {
694  poly8x8_t a = vreinterpret_p8_u64(_a);
695  poly8x8_t b = vreinterpret_p8_u64(_b);
696 
697  // Masks
698  uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
699  vcreate_u8(0x00000000ffffffff));
700  uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
701  vcreate_u8(0x0000000000000000));
702 
703  // Do the multiplies, rotating with vext to get all combinations
704  uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
705  uint8x16_t e =
706  vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
707  uint8x16_t f =
708  vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
709  uint8x16_t g =
710  vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
711  uint8x16_t h =
712  vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
713  uint8x16_t i =
714  vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
715  uint8x16_t j =
716  vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
717  uint8x16_t k =
718  vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // K = A0 * B4
719 
720  // Add cross products
721  uint8x16_t l = veorq_u8(e, f); // L = E + F
722  uint8x16_t m = veorq_u8(g, h); // M = G + H
723  uint8x16_t n = veorq_u8(i, j); // N = I + J
724 
725  // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
726  // instructions.
727 #if defined(__aarch64__)
728  uint8x16_t lm_p0 = vreinterpretq_u8_u64(
729  vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
730  uint8x16_t lm_p1 = vreinterpretq_u8_u64(
731  vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
732  uint8x16_t nk_p0 = vreinterpretq_u8_u64(
733  vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
734  uint8x16_t nk_p1 = vreinterpretq_u8_u64(
735  vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
736 #else
737  uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
738  uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
739  uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
740  uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
741 #endif
742  // t0 = (L) (P0 + P1) << 8
743  // t1 = (M) (P2 + P3) << 16
744  uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
745  uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
746  uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
747 
748  // t2 = (N) (P4 + P5) << 24
749  // t3 = (K) (P6 + P7) << 32
750  uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
751  uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
752  uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
753 
754  // De-interleave
755 #if defined(__aarch64__)
756  uint8x16_t t0 = vreinterpretq_u8_u64(
757  vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
758  uint8x16_t t1 = vreinterpretq_u8_u64(
759  vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
760  uint8x16_t t2 = vreinterpretq_u8_u64(
761  vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
762  uint8x16_t t3 = vreinterpretq_u8_u64(
763  vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
764 #else
765  uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
766  uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
767  uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
768  uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
769 #endif
770  // Shift the cross products
771  uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
772  uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
773  uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
774  uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
775 
776  // Accumulate the products
777  uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
778  uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
779  uint8x16_t mix = veorq_u8(d, cross1);
780  uint8x16_t r = veorq_u8(mix, cross2);
781  return vreinterpretq_u64_u8(r);
782 }
783 #endif // ARMv7 polyfill
784 
785 // C equivalent:
786 // __m128i _mm_shuffle_epi32_default(__m128i a,
787 // __constrange(0, 255) int imm) {
788 // __m128i ret;
789 // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
790 // ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
791 // return ret;
792 // }
793 #define _mm_shuffle_epi32_default(a, imm) \
794  __extension__({ \
795  int32x4_t ret; \
796  ret = vmovq_n_s32( \
797  vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \
798  ret = vsetq_lane_s32( \
799  vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
800  ret, 1); \
801  ret = vsetq_lane_s32( \
802  vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
803  ret, 2); \
804  ret = vsetq_lane_s32( \
805  vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
806  ret, 3); \
807  vreinterpretq_m128i_s32(ret); \
808  })
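// Illustrative expansion (not part of the upstream header): with
// imm == _MM_SHUFFLE(0, 1, 2, 3), i.e. 0x1B, the macro above selects lane 3
// into result lane 0, lane 2 into lane 1, and so on, reversing the four
// 32-bit lanes:
//
//     __m128i r = _mm_shuffle_epi32_default(v, _MM_SHUFFLE(0, 1, 2, 3));
//     // r = { v[3], v[2], v[1], v[0] }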
809 
810 // Takes the upper 64 bits of a and places it in the low end of the result
811 // Takes the lower 64 bits of a and places it into the high end of the result.
813 {
814  int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
815  int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
816  return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
817 }
818 
819 // Takes the lower two 32-bit values from a, swaps them, and places them in the
820 // low end of the result; takes the upper two 32-bit values from a, swaps them,
821 // and places them in the high end of the result.
823 {
824  int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
825  int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
826  return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
827 }
828 
829 // rotates the least significant 32 bits into the most significant 32 bits, and
830 // shifts the rest down
832 {
835 }
836 
837 // rotates the most significant 32 bits into the least significant 32 bits, and
838 // shifts the rest up
840 {
843 }
844 
845 // gets the lower 64 bits of a, and places it in the upper 64 bits
846 // gets the lower 64 bits of a and places it in the lower 64 bits
848 {
849  int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
850  return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
851 }
852 
853 // Gets the lower 64 bits of a, swaps elements 0 and 1, and places them in the
854 // low half of the result; places the unswapped low 64 bits of a in the high half.
856 {
857  int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
858  int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
859  return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
860 }
861 
862 // Gets the lower 64 bits of a, swaps elements 0 and 1, and places the swapped
863 // pair in both the lower and upper 64 bits of the result.
866 {
867  int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
868  return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
869 }
870 
872 {
873  int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
874  int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
875  return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
876 }
877 
879 {
880  int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
881  int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
882  return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
883 }
884 
886 {
887  int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
888  int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
889  return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
890 }
891 
892 // FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
893 // int imm)
894 #if defined(__aarch64__)
895 #define _mm_shuffle_epi32_splat(a, imm) \
896  __extension__({ \
897  vreinterpretq_m128i_s32( \
898  vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
899  })
900 #else
901 #define _mm_shuffle_epi32_splat(a, imm) \
902  __extension__({ \
903  vreinterpretq_m128i_s32( \
904  vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
905  })
906 #endif
907 
908 // NEON does not support a general purpose permute intrinsic
909 // Selects four specific single-precision, floating-point values from a and b,
910 // based on the mask i.
911 //
912 // C equivalent:
913 // __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
914 // __constrange(0, 255) int imm) {
915 // __m128 ret;
916 // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
917 // ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
918 // return ret;
919 // }
920 //
921 // https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
922 #define _mm_shuffle_ps_default(a, b, imm) \
923  __extension__({ \
924  float32x4_t ret; \
925  ret = vmovq_n_f32( \
926  vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \
927  ret = vsetq_lane_f32( \
928  vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
929  ret, 1); \
930  ret = vsetq_lane_f32( \
931  vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
932  ret, 2); \
933  ret = vsetq_lane_f32( \
934  vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
935  ret, 3); \
936  vreinterpretq_m128_f32(ret); \
937  })
938 
939 // Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
940 // by imm.
941 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
942 // FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
943 // __constrange(0,255) int
944 // imm)
945 #define _mm_shufflelo_epi16_function(a, imm) \
946  __extension__({ \
947  int16x8_t ret = vreinterpretq_s16_m128i(a); \
948  int16x4_t lowBits = vget_low_s16(ret); \
949  ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
950  ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
951  1); \
952  ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
953  2); \
954  ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
955  3); \
956  vreinterpretq_m128i_s16(ret); \
957  })
958 
959 // Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
960 // by imm.
961 // https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
962 // FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
963 // __constrange(0,255) int
964 // imm)
965 #define _mm_shufflehi_epi16_function(a, imm) \
966  __extension__({ \
967  int16x8_t ret = vreinterpretq_s16_m128i(a); \
968  int16x4_t highBits = vget_high_s16(ret); \
969  ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
970  ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
971  5); \
972  ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
973  6); \
974  ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
975  7); \
976  vreinterpretq_m128i_s16(ret); \
977  })
978 
979 /* MMX */
980 
981 //_mm_empty is a no-op on arm
982 FORCE_INLINE void _mm_empty(void) {}
983 
984 /* SSE */
985 
986 // Adds the four single-precision, floating-point values of a and b.
987 //
988 // r0 := a0 + b0
989 // r1 := a1 + b1
990 // r2 := a2 + b2
991 // r3 := a3 + b3
992 //
993 // https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
995 {
996  return vreinterpretq_m128_f32(
998 }
999 
1000 // adds the scalar single-precision floating point values of a and b.
1001 // https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
1003 {
1004  float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
1005  float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
1006  // the upper values in the result must be the remnants of <a>.
1007  return vreinterpretq_m128_f32(vaddq_f32(a, value));
1008 }
1009 
1010 // Computes the bitwise AND of the four single-precision, floating-point values
1011 // of a and b.
1012 //
1013 // r0 := a0 & b0
1014 // r1 := a1 & b1
1015 // r2 := a2 & b2
1016 // r3 := a3 & b3
1017 //
1018 // https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
1020 {
1021  return vreinterpretq_m128_s32(
1023 }
1024 
1025 // Computes the bitwise AND-NOT of the four single-precision, floating-point
1026 // values of a and b.
1027 //
1028 // r0 := ~a0 & b0
1029 // r1 := ~a1 & b1
1030 // r2 := ~a2 & b2
1031 // r3 := ~a3 & b3
1032 //
1033 // https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
1035 {
1036  return vreinterpretq_m128_s32(
1037  vbicq_s32(vreinterpretq_s32_m128(b),
1038  vreinterpretq_s32_m128(a))); // *NOTE* argument swap
1039 }
1040 
1041 // Average packed unsigned 16-bit integers in a and b, and store the results in
1042 // dst.
1043 //
1044 // FOR j := 0 to 3
1045 // i := j*16
1046 // dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
1047 // ENDFOR
1048 //
1049 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
1051 {
1052  return vreinterpret_m64_u16(
1053  vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
1054 }
1055 
1056 // Average packed unsigned 8-bit integers in a and b, and store the results in
1057 // dst.
1058 //
1059 // FOR j := 0 to 7
1060 // i := j*8
1061 // dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
1062 // ENDFOR
1063 //
1064 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
1066 {
1067  return vreinterpret_m64_u8(
1068  vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
1069 }
1070 
1071 // Compares for equality.
1072 // https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
1074 {
1075  return vreinterpretq_m128_u32(
1077 }
1078 
1079 // Compares for equality.
1080 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
1082 {
1083  return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
1084 }
1085 
1086 // Compares for greater than or equal.
1087 // https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
1089 {
1090  return vreinterpretq_m128_u32(
1092 }
1093 
1094 // Compares for greater than or equal.
1095 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
1097 {
1098  return _mm_move_ss(a, _mm_cmpge_ps(a, b));
1099 }
1100 
1101 // Compares for greater than.
1102 //
1103 // r0 := (a0 > b0) ? 0xffffffff : 0x0
1104 // r1 := (a1 > b1) ? 0xffffffff : 0x0
1105 // r2 := (a2 > b2) ? 0xffffffff : 0x0
1106 // r3 := (a3 > b3) ? 0xffffffff : 0x0
1107 //
1108 // https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
1110 {
1111  return vreinterpretq_m128_u32(
1113 }
1114 
1115 // Compares for greater than.
1116 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
1118 {
1119  return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
1120 }
1121 
1122 // Compares for less than or equal.
1123 //
1124 // r0 := (a0 <= b0) ? 0xffffffff : 0x0
1125 // r1 := (a1 <= b1) ? 0xffffffff : 0x0
1126 // r2 := (a2 <= b2) ? 0xffffffff : 0x0
1127 // r3 := (a3 <= b3) ? 0xffffffff : 0x0
1128 //
1129 // https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
1131 {
1132  return vreinterpretq_m128_u32(
1134 }
1135 
1136 // Compares for less than or equal.
1137 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
1139 {
1140  return _mm_move_ss(a, _mm_cmple_ps(a, b));
1141 }
1142 
1143 // Compares for less than
1144 // https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
1146 {
1147  return vreinterpretq_m128_u32(
1149 }
1150 
1151 // Compares for less than
1152 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
1154 {
1155  return _mm_move_ss(a, _mm_cmplt_ps(a, b));
1156 }
1157 
1158 // Compares for inequality.
1159 // https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
1161 {
1162  return vreinterpretq_m128_u32(vmvnq_u32(
1164 }
1165 
1166 // Compares for inequality.
1167 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
1169 {
1170  return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
1171 }
1172 
1173 // Compares for not greater than or equal.
1174 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
1176 {
1177  return vreinterpretq_m128_u32(vmvnq_u32(
1179 }
1180 
1181 // Compares for not greater than or equal.
1182 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
1184 {
1185  return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
1186 }
1187 
1188 // Compares for not greater than.
1189 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
1191 {
1192  return vreinterpretq_m128_u32(vmvnq_u32(
1194 }
1195 
1196 // Compares for not greater than.
1197 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
1199 {
1200  return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
1201 }
1202 
1203 // Compares for not less than or equal.
1204 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
1206 {
1207  return vreinterpretq_m128_u32(vmvnq_u32(
1209 }
1210 
1211 // Compares for not less than or equal.
1212 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
1214 {
1215  return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
1216 }
1217 
1218 // Compares for not less than.
1219 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
1221 {
1222  return vreinterpretq_m128_u32(vmvnq_u32(
1224 }
1225 
1226 // Compares for not less than.
1227 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
1229 {
1230  return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
1231 }
1232 
1233 // Compares the four 32-bit floats in a and b to check if any values are NaN.
1234 // Ordered compare between each value returns true for "orderable" and false for
1235 // "not orderable" (NaN).
1236 // https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
1237 // also:
1238 // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
1239 // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
1241 {
1242  // Note: NEON does not have ordered compare builtin
1243  // Need to compare a eq a and b eq b to check for NaN
1244  // Do AND of results to get final
1245  uint32x4_t ceqaa =
1247  uint32x4_t ceqbb =
1249  return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
1250 }
1251 
1252 // Compares for ordered.
1253 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
1255 {
1256  return _mm_move_ss(a, _mm_cmpord_ps(a, b));
1257 }
1258 
1259 // Compares for unordered.
1260 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
1262 {
1263  uint32x4_t f32a =
1265  uint32x4_t f32b =
1267  return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
1268 }
1269 
1270 // Compares for unordered.
1271 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
1273 {
1274  return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
1275 }
1276 
1277 // Compares the lower single-precision floating point scalar values of a and b
1278 // using an equality operation. :
1279 // https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
1281 {
1282  uint32x4_t a_eq_b =
1284  return vgetq_lane_u32(a_eq_b, 0) & 0x1;
1285 }
1286 
1287 // Compares the lower single-precision floating point scalar values of a and b
1288 // using a greater than or equal operation. :
1289 // https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
1291 {
1292  uint32x4_t a_ge_b =
1294  return vgetq_lane_u32(a_ge_b, 0) & 0x1;
1295 }
1296 
1297 // Compares the lower single-precision floating point scalar values of a and b
1298 // using a greater than operation. :
1299 // https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
1301 {
1302  uint32x4_t a_gt_b =
1304  return vgetq_lane_u32(a_gt_b, 0) & 0x1;
1305 }
1306 
1307 // Compares the lower single-precision floating point scalar values of a and b
1308 // using a less than or equal operation. :
1309 // https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
1311 {
1312  uint32x4_t a_le_b =
1314  return vgetq_lane_u32(a_le_b, 0) & 0x1;
1315 }
1316 
1317 // Compares the lower single-precision floating point scalar values of a and b
1318 // using a less than operation. :
1319 // https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
1320 // note!! The documentation on MSDN is incorrect! If either of the values is a
1321 // NAN the docs say you will get a one, but in fact, it will return a zero!!
1323 {
1324  uint32x4_t a_lt_b =
1326  return vgetq_lane_u32(a_lt_b, 0) & 0x1;
1327 }
1328 
1329 // Compares the lower single-precision floating point scalar values of a and b
1330 // using an inequality operation. :
1331 // https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
1333 {
1334  return !_mm_comieq_ss(a, b);
1335 }
1336 
1337 // Convert packed signed 32-bit integers in b to packed single-precision
1338 // (32-bit) floating-point elements, store the results in the lower 2 elements
1339 // of dst, and copy the upper 2 packed elements from a to the upper elements of
1340 // dst.
1341 //
1342 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1343 // dst[63:32] := Convert_Int32_To_FP32(b[63:32])
1344 // dst[95:64] := a[95:64]
1345 // dst[127:96] := a[127:96]
1346 //
1347 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
1349 {
1350  return vreinterpretq_m128_f32(
1351  vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1352  vget_high_f32(vreinterpretq_f32_m128(a))));
1353 }
1354 
1355 // Convert packed single-precision (32-bit) floating-point elements in a to
1356 // packed 32-bit integers, and store the results in dst.
1357 //
1358 // FOR j := 0 to 1
1359 // i := 32*j
1360 // dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
1361 // ENDFOR
1362 //
1363 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
1365 {
1366 #if defined(__aarch64__)
1367  return vreinterpret_m64_s32(
1368  vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
1369 #else
1370  return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
1372 #endif
1373 }
1374 
1375 // Convert the signed 32-bit integer b to a single-precision (32-bit)
1376 // floating-point element, store the result in the lower element of dst, and
1377 // copy the upper 3 packed elements from a to the upper elements of dst.
1378 //
1379 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1380 // dst[127:32] := a[127:32]
1381 //
1382 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
1384 {
1385  return vreinterpretq_m128_f32(
1386  vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1387 }
1388 
1389 // Convert the lower single-precision (32-bit) floating-point element in a to a
1390 // 32-bit integer, and store the result in dst.
1391 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
1393 {
1394 #if defined(__aarch64__)
1395  return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
1396  0);
1397 #else
1398  float32_t data = vgetq_lane_f32(
1400  return (int32_t) data;
1401 #endif
1402 }
1403 
1404 // Convert packed 16-bit integers in a to packed single-precision (32-bit)
1405 // floating-point elements, and store the results in dst.
1406 //
1407 // FOR j := 0 to 3
1408 // i := j*16
1409 // m := j*32
1410 // dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
1411 // ENDFOR
1412 //
1413 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
1415 {
1416  return vreinterpretq_m128_f32(
1417  vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
1418 }
1419 
1420 // Convert packed 32-bit integers in b to packed single-precision (32-bit)
1421 // floating-point elements, store the results in the lower 2 elements of dst,
1422 // and copy the upper 2 packed elements from a to the upper elements of dst.
1423 //
1424 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1425 // dst[63:32] := Convert_Int32_To_FP32(b[63:32])
1426 // dst[95:64] := a[95:64]
1427 // dst[127:96] := a[127:96]
1428 //
1429 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
1431 {
1432  return vreinterpretq_m128_f32(
1433  vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1434  vget_high_f32(vreinterpretq_f32_m128(a))));
1435 }
1436 
1437 // Convert packed signed 32-bit integers in a to packed single-precision
1438 // (32-bit) floating-point elements, store the results in the lower 2 elements
1439 // of dst, then convert the packed signed 32-bit integers in b to
1440 // single-precision (32-bit) floating-point element, and store the results in
1441 // the upper 2 elements of dst.
1442 //
1443 // dst[31:0] := Convert_Int32_To_FP32(a[31:0])
1444 // dst[63:32] := Convert_Int32_To_FP32(a[63:32])
1445 // dst[95:64] := Convert_Int32_To_FP32(b[31:0])
1446 // dst[127:96] := Convert_Int32_To_FP32(b[63:32])
1447 //
1448 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
1450 {
1451  return vreinterpretq_m128_f32(vcvtq_f32_s32(
1452  vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
1453 }
1454 
1455 // Convert the lower packed 8-bit integers in a to packed single-precision
1456 // (32-bit) floating-point elements, and store the results in dst.
1457 //
1458 // FOR j := 0 to 3
1459 // i := j*8
1460 // m := j*32
1461 // dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
1462 // ENDFOR
1463 //
1464 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
1466 {
1467  return vreinterpretq_m128_f32(vcvtq_f32_s32(
1468  vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
1469 }
1470 
1471 // Convert packed single-precision (32-bit) floating-point elements in a to
1472 // packed 16-bit integers, and store the results in dst. Note: this intrinsic
1473 // will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
1474 // 0x7FFFFFFF.
1475 //
1476 // FOR j := 0 to 3
1477 // i := 16*j
1478 // k := 32*j
1479 // IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF)
1480 // dst[i+15:i] := 0x7FFF
1481 // ELSE
1482 // dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k])
1483 // FI
1484 // ENDFOR
1485 //
1486 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16
1488 {
1489  const __m128 i16Min = _mm_set_ps1((float) INT16_MIN);
1490  const __m128 i16Max = _mm_set_ps1((float) INT16_MAX);
1491  const __m128 i32Max = _mm_set_ps1((float) INT32_MAX);
1492  const __m128i maxMask = _mm_castps_si128(
1493  _mm_and_ps(_mm_cmpge_ps(a, i16Max), _mm_cmple_ps(a, i32Max)));
1494  const __m128i betweenMask = _mm_castps_si128(
1495  _mm_and_ps(_mm_cmpgt_ps(a, i16Min), _mm_cmplt_ps(a, i16Max)));
1496  const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask),
1497  _mm_setzero_si128());
1498  __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT16_MAX));
1499  __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT16_MIN));
1500  __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a));
1501  __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt);
1502  return vreinterpret_m64_s16(vmovn_s32(vreinterpretq_s32_m128i(res32)));
1503 }
1504 
1505 // Convert packed single-precision (32-bit) floating-point elements in a to
1506 // packed 32-bit integers, and store the results in dst.
1507 //
1508 // FOR j := 0 to 1
1509 // i := 32*j
1510 // dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
1511 // ENDFOR
1512 //
1513 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
1514 #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
1515 
1516 // Convert packed single-precision (32-bit) floating-point elements in a to
1517 // packed 8-bit integers, and store the results in lower 4 elements of dst.
1518 // Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
1519 // between 0x7F and 0x7FFFFFFF.
1520 //
1521 // FOR j := 0 to 3
1522 // i := 8*j
1523 // k := 32*j
1524 // IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF)
1525 // dst[i+7:i] := 0x7F
1526 // ELSE
1527 // dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k])
1528 // FI
1529 // ENDFOR
1530 //
1531 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi8
1533 {
1534  const __m128 i8Min = _mm_set_ps1((float) INT8_MIN);
1535  const __m128 i8Max = _mm_set_ps1((float) INT8_MAX);
1536  const __m128 i32Max = _mm_set_ps1((float) INT32_MAX);
1537  const __m128i maxMask = _mm_castps_si128(
1538  _mm_and_ps(_mm_cmpge_ps(a, i8Max), _mm_cmple_ps(a, i32Max)));
1539  const __m128i betweenMask = _mm_castps_si128(
1540  _mm_and_ps(_mm_cmpgt_ps(a, i8Min), _mm_cmplt_ps(a, i8Max)));
1541  const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask),
1542  _mm_setzero_si128());
1543  __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT8_MAX));
1544  __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT8_MIN));
1545  __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a));
1546  __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt);
1547  int16x4_t res16 = vmovn_s32(vreinterpretq_s32_m128i(res32));
1548  int8x8_t res8 = vmovn_s16(vcombine_s16(res16, res16));
1549  static const uint32_t bitMask[2] = {0xFFFFFFFF, 0};
1550  int8x8_t mask = vreinterpret_s8_u32(vld1_u32(bitMask));
1551 
1552  return vreinterpret_m64_s8(vorr_s8(vand_s8(mask, res8), vdup_n_s8(0)));
1553 }
1554 
1555 // Convert packed unsigned 16-bit integers in a to packed single-precision
1556 // (32-bit) floating-point elements, and store the results in dst.
1557 //
1558 // FOR j := 0 to 3
1559 // i := j*16
1560 // m := j*32
1561 // dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
1562 // ENDFOR
1563 //
1564 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
1565 FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
1566 {
1567  return vreinterpretq_m128_f32(
1568  vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
1569 }
1570 
1571 // Convert the lower packed unsigned 8-bit integers in a to packed
1572 // single-precision (32-bit) floating-point elements, and store the results in
1573 // dst.
1574 //
1575 // FOR j := 0 to 3
1576 // i := j*8
1577 // m := j*32
1578 // dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
1579 // ENDFOR
1580 //
1581 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
1582 FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
1583 {
1584  return vreinterpretq_m128_f32(vcvtq_f32_u32(
1585  vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
1586 }
1587 
1588 // Convert the signed 32-bit integer b to a single-precision (32-bit)
1589 // floating-point element, store the result in the lower element of dst, and
1590 // copy the upper 3 packed elements from a to the upper elements of dst.
1591 //
1592 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1593 // dst[127:32] := a[127:32]
1594 //
1595 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
1596 #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
1597 
1598 // Convert the signed 64-bit integer b to a single-precision (32-bit)
1599 // floating-point element, store the result in the lower element of dst, and
1600 // copy the upper 3 packed elements from a to the upper elements of dst.
1601 //
1602 // dst[31:0] := Convert_Int64_To_FP32(b[63:0])
1603 // dst[127:32] := a[127:32]
1604 //
1605 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
1606 FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
1607 {
1608  return vreinterpretq_m128_f32(
1609  vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1610 }
1611 
1612 // Copy the lower single-precision (32-bit) floating-point element of a to dst.
1613 //
1614 // dst[31:0] := a[31:0]
1615 //
1616 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
1617 FORCE_INLINE float _mm_cvtss_f32(__m128 a)
1618 {
1619  return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1620 }
1621 
1622 // Convert the lower single-precision (32-bit) floating-point element in a to a
1623 // 32-bit integer, and store the result in dst.
1624 //
1625 // dst[31:0] := Convert_FP32_To_Int32(a[31:0])
1626 //
1627 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
1628 #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
1629 
1630 // Convert the lower single-precision (32-bit) floating-point element in a to a
1631 // 64-bit integer, and store the result in dst.
1632 //
1633 // dst[63:0] := Convert_FP32_To_Int64(a[31:0])
1634 //
1635 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
1636 FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
1637 {
1638 #if defined(__aarch64__)
1639  return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
1640 #else
1641  float32_t data = vgetq_lane_f32(
1642  vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1643  return (int64_t) data;
1644 #endif
1645 }
1646 
1647 // Convert packed single-precision (32-bit) floating-point elements in a to
1648 // packed 32-bit integers with truncation, and store the results in dst.
1649 //
1650 // FOR j := 0 to 1
1651 // i := 32*j
1652 // dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
1653 // ENDFOR
1654 //
1655 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
1656 FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
1657 {
1658  return vreinterpret_m64_s32(
1659  vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
1660 }
1661 
1662 // Convert the lower single-precision (32-bit) floating-point element in a to a
1663 // 32-bit integer with truncation, and store the result in dst.
1664 //
1665 // dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
1666 //
1667 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
1668 FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
1669 {
1670  return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
1671 }
1672 
1673 // Convert packed single-precision (32-bit) floating-point elements in a to
1674 // packed 32-bit integers with truncation, and store the results in dst.
1675 //
1676 // FOR j := 0 to 1
1677 // i := 32*j
1678 // dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
1679 // ENDFOR
1680 //
1681 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
1682 #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
1683 
1684 // Convert the lower single-precision (32-bit) floating-point element in a to a
1685 // 32-bit integer with truncation, and store the result in dst.
1686 //
1687 // dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
1688 //
1689 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
1690 #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
1691 
1692 // Convert the lower single-precision (32-bit) floating-point element in a to a
1693 // 64-bit integer with truncation, and store the result in dst.
1694 //
1695 // dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
1696 //
1697 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
1698 FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
1699 {
1700  return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1701 }
1702 
1703 // Divides the four single-precision, floating-point values of a and b.
1704 //
1705 // r0 := a0 / b0
1706 // r1 := a1 / b1
1707 // r2 := a2 / b2
1708 // r3 := a3 / b3
1709 //
1710 // https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
1711 FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
1712 {
1713 #if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
1714  return vreinterpretq_m128_f32(
1715  vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1716 #else
1717  float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
1718  recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1719 #if SSE2NEON_PRECISE_DIV
1720  // Additional Newton-Raphson iteration for accuracy
1721  recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1722 #endif
1723  return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
1724 #endif
1725 }
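// Illustrative caller-side sketch (not part of the upstream header; assumes
// sse2neon.h has been fully included): with SSE2NEON_PRECISE_DIV disabled,
// AArch64 uses a true vector divide; otherwise a reciprocal estimate is
// refined with Newton-Raphson steps, so results can differ from x86 in the
// last few ulps.
static inline __m128 sse2neon_example_div_ps(void)
{
    __m128 num = _mm_set_ps(8.0f, 6.0f, 4.0f, 2.0f);
    __m128 den = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
    return _mm_div_ps(num, den); // approximately {2.0f, 2.0f, 2.0f, 2.0f}
}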
1726 
1727 // Divides the scalar single-precision floating point value of a by b.
1728 // https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
1729 FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
1730 {
1731  float32_t value =
1732  vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
1733  return vreinterpretq_m128_f32(
1734  vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
1735 }
1736 
1737 // Extract a 16-bit integer from a, selected with imm8, and store the result in
1738 // the lower element of dst.
1739 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16
1740 #define _mm_extract_pi16(a, imm) \
1741  (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
1742 
1743 // Free aligned memory that was allocated with _mm_malloc.
1744 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free
1745 FORCE_INLINE void _mm_free(void *addr)
1746 {
1747  free(addr);
1748 }
1749 
1750 // Macro: Get the flush zero bits from the MXCSR control and status register.
1751 // The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
1752 // _MM_FLUSH_ZERO_OFF
1753 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE
1754 FORCE_INLINE unsigned int _MM_GET_FLUSH_ZERO_MODE()
1755 {
1756  union {
1757  fpcr_bitfield field;
1758 #if defined(__aarch64__)
1759  uint64_t value;
1760 #else
1761  uint32_t value;
1762 #endif
1763  } r;
1764 
1765 #if defined(__aarch64__)
1766  asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
1767 #else
1768  asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
1769 #endif
1770 
1771  return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
1772 }
1773 
1774 // Macro: Get the rounding mode bits from the MXCSR control and status register.
1775 // The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
1776 // _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
1777 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE
1778 FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
1779 {
1780  union {
1781  fpcr_bitfield field;
1782 #if defined(__aarch64__)
1783  uint64_t value;
1784 #else
1785  uint32_t value;
1786 #endif
1787  } r;
1788 
1789 #if defined(__aarch64__)
1790  asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
1791 #else
1792  asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
1793 #endif
1794 
1795  if (r.field.bit22) {
1796  return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
1797  } else {
1798  return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
1799  }
1800 }
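// Illustrative caller-side sketch (not part of the upstream header): the mode
// is decoded from FPCR/FPSCR bits 22-23, so it reflects whatever was last
// programmed through _MM_SET_ROUNDING_MODE() or _mm_setcsr().
static inline int sse2neon_example_rounds_to_nearest(void)
{
    return _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST;
}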
1801 
1802 // Copy a to dst, and insert the 16-bit integer i into dst at the location
1803 // specified by imm8.
1804 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
1805 #define _mm_insert_pi16(a, b, imm) \
1806  __extension__({ \
1807  vreinterpret_m64_s16( \
1808  vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
1809  })
1810 
1811 // Loads four single-precision, floating-point values.
1812 // https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
1813 FORCE_INLINE __m128 _mm_load_ps(const float *p)
1814 {
1815  return vreinterpretq_m128_f32(vld1q_f32(p));
1816 }
1817 
1818 // Load a single-precision (32-bit) floating-point element from memory into all
1819 // elements of dst.
1820 //
1821 // dst[31:0] := MEM[mem_addr+31:mem_addr]
1822 // dst[63:32] := MEM[mem_addr+31:mem_addr]
1823 // dst[95:64] := MEM[mem_addr+31:mem_addr]
1824 // dst[127:96] := MEM[mem_addr+31:mem_addr]
1825 //
1826 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
1827 #define _mm_load_ps1 _mm_load1_ps
1828 
1829 // Loads a single-precision, floating-point value into the low word and
1830 // clears the upper three words.
1831 // https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
1832 FORCE_INLINE __m128 _mm_load_ss(const float *p)
1833 {
1834  return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
1835 }
1836 
1837 // Loads a single single-precision, floating-point value, copying it into all
1838 // four words
1839 // https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
1840 FORCE_INLINE __m128 _mm_load1_ps(const float *p)
1841 {
1842  return vreinterpretq_m128_f32(vld1q_dup_f32(p));
1843 }
1844 
1845 // Sets the upper two single-precision, floating-point values with 64
1846 // bits of data loaded from the address p; the lower two values are passed
1847 // through from a.
1848 //
1849 // r0 := a0
1850 // r1 := a1
1851 // r2 := *p0
1852 // r3 := *p1
1853 //
1854 // https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
1855 FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
1856 {
1857  return vreinterpretq_m128_f32(
1858  vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
1859 }
1860 
1861 // Sets the lower two single-precision, floating-point values with 64
1862 // bits of data loaded from the address p; the upper two values are passed
1863 // through from a.
1864 //
1865 // Return Value
1866 // r0 := *p0
1867 // r1 := *p1
1868 // r2 := a2
1869 // r3 := a3
1870 //
1871 // https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
1872 FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
1873 {
1874  return vreinterpretq_m128_f32(
1875  vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
1876 }
1877 
1878 // Load 4 single-precision (32-bit) floating-point elements from memory into dst
1879 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
1880 // general-protection exception may be generated.
1881 //
1882 // dst[31:0] := MEM[mem_addr+127:mem_addr+96]
1883 // dst[63:32] := MEM[mem_addr+95:mem_addr+64]
1884 // dst[95:64] := MEM[mem_addr+63:mem_addr+32]
1885 // dst[127:96] := MEM[mem_addr+31:mem_addr]
1886 //
1887 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
1888 FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
1889 {
1890  float32x4_t v = vrev64q_f32(vld1q_f32(p));
1891  return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
1892 }
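// Illustrative caller-side sketch (not part of the upstream header): the NEON
// sequence (64-bit lane reverse plus a two-element rotate) yields the fully
// reversed order the SSE intrinsic documents.
static inline __m128 sse2neon_example_loadr_ps(void)
{
    float ALIGN_STRUCT(16) buf[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    return _mm_loadr_ps(buf); // lane 0 = 4.0f, lane 3 = 1.0f
}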
1893 
1894 // Loads four single-precision, floating-point values.
1895 // https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
1896 FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
1897 {
1898  // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
1899  // equivalent for neon
1900  return vreinterpretq_m128_f32(vld1q_f32(p));
1901 }
1902 
1903 // Load unaligned 16-bit integer from memory into the first element of dst.
1904 //
1905 // dst[15:0] := MEM[mem_addr+15:mem_addr]
1906 // dst[MAX:16] := 0
1907 //
1908 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
1909 FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
1910 {
1911  return vreinterpretq_m128i_s16(
1912  vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
1913 }
1914 
1915 // Load unaligned 64-bit integer from memory into the first element of dst.
1916 //
1917 // dst[63:0] := MEM[mem_addr+63:mem_addr]
1918 // dst[MAX:64] := 0
1919 //
1920 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
1921 FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
1922 {
1923  return vreinterpretq_m128i_s64(
1924  vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
1925 }
1926 
1927 // Allocate aligned blocks of memory.
1928 // https://software.intel.com/en-us/
1929 // cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
1930 FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
1931 {
1932  void *ptr;
1933  if (align == 1)
1934  return malloc(size);
1935  if (align == 2 || (sizeof(void *) == 8 && align == 4))
1936  align = sizeof(void *);
1937  if (!posix_memalign(&ptr, align, size))
1938  return ptr;
1939  return NULL;
1940 }
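// Illustrative caller-side sketch (not part of the upstream header): memory
// obtained from _mm_malloc must be released with _mm_free; the alignment
// request is serviced by posix_memalign (or plain malloc for align == 1).
static inline float *sse2neon_example_alloc_vec4(void)
{
    // May return NULL on allocation failure, exactly like malloc.
    return (float *) _mm_malloc(4 * sizeof(float), 16);
}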
1941 
1942 // Conditionally store 8-bit integer elements from a into memory using mask
1943 // (elements are not stored when the highest bit is not set in the corresponding
1944 // element) and a non-temporal memory hint.
1945 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64
1946 FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
1947 {
1948  int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
1949  __m128 b = _mm_load_ps((const float *) mem_addr);
1950  int8x8_t masked =
1951  vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
1952  vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
1953  vst1_s8((int8_t *) mem_addr, masked);
1954 }
1955 
1956 // Conditionally store 8-bit integer elements from a into memory using mask
1957 // (elements are not stored when the highest bit is not set in the corresponding
1958 // element) and a non-temporal memory hint.
1959 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq
1960 #define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
1961 
1962 // Compare packed signed 16-bit integers in a and b, and store packed maximum
1963 // values in dst.
1964 //
1965 // FOR j := 0 to 3
1966 // i := j*16
1967 // dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
1968 // ENDFOR
1969 //
1970 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
1971 FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
1972 {
1973  return vreinterpret_m64_s16(
1974  vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
1975 }
1976 
1977 // Computes the maximums of the four single-precision, floating-point values of
1978 // a and b.
1979 // https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
1980 FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
1981 {
1982 #if SSE2NEON_PRECISE_MINMAX
1983  float32x4_t _a = vreinterpretq_f32_m128(a);
1984  float32x4_t _b = vreinterpretq_f32_m128(b);
1985  return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
1986 #else
1987  return vreinterpretq_m128_f32(
1988  vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1989 #endif
1990 }
1991 
1992 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
1993 // values in dst.
1994 //
1995 // FOR j := 0 to 7
1996 // i := j*8
1997 // dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
1998 // ENDFOR
1999 //
2000 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
2001 FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
2002 {
2003  return vreinterpret_m64_u8(
2004  vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2005 }
2006 
2007 // Computes the maximum of the two lower scalar single-precision floating point
2008 // values of a and b.
2009 // https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
2010 FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
2011 {
2012  float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
2013  return vreinterpretq_m128_f32(
2014  vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2015 }
2016 
2017 // Compare packed signed 16-bit integers in a and b, and store packed minimum
2018 // values in dst.
2019 //
2020 // FOR j := 0 to 3
2021 // i := j*16
2022 // dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
2023 // ENDFOR
2024 //
2025 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
2026 FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
2027 {
2028  return vreinterpret_m64_s16(
2029  vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
2030 }
2031 
2032 // Computes the minima of the four single-precision, floating-point values of a
2033 // and b.
2034 // https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
2035 FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
2036 {
2037 #if SSE2NEON_PRECISE_MINMAX
2038  float32x4_t _a = vreinterpretq_f32_m128(a);
2039  float32x4_t _b = vreinterpretq_f32_m128(b);
2040  return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
2041 #else
2042  return vreinterpretq_m128_f32(
2043  vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2044 #endif
2045 }
2046 
2047 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
2048 // values in dst.
2049 //
2050 // FOR j := 0 to 7
2051 // i := j*8
2052 // dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
2053 // ENDFOR
2054 //
2055 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
2056 FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
2057 {
2058  return vreinterpret_m64_u8(
2059  vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2060 }
2061 
2062 // Computes the minimum of the two lower scalar single-precision floating point
2063 // values of a and b.
2064 // https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
2065 FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
2066 {
2067  float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
2068  return vreinterpretq_m128_f32(
2069  vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2070 }
2071 
2072 // Sets the low word to the single-precision, floating-point value of b
2073 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
2074 FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
2075 {
2076  return vreinterpretq_m128_f32(
2077  vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
2078  vreinterpretq_f32_m128(a), 0));
2079 }
2080 
2081 // Moves the upper two values of B into the lower two values of A.
2082 //
2083 // r3 := a3
2084 // r2 := a2
2085 // r1 := b3
2086 // r0 := b2
2087 FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
2088 {
2089  float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
2090  float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
2091  return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
2092 }
2093 
2094 // Moves the lower two values of B into the upper two values of A.
2095 //
2096 // r3 := b1
2097 // r2 := b0
2098 // r1 := a1
2099 // r0 := a0
2100 FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
2101 {
2102  float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
2103  float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
2104  return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
2105 }
2106 
2107 // Create mask from the most significant bit of each 8-bit element in a, and
2108 // store the result in dst.
2109 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8
2110 FORCE_INLINE int _mm_movemask_pi8(__m64 a)
2111 {
2112  uint8x8_t input = vreinterpret_u8_m64(a);
2113 #if defined(__aarch64__)
2114  static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
2115  uint8x8_t tmp = vshr_n_u8(input, 7);
2116  return vaddv_u8(vshl_u8(tmp, shift));
2117 #else
2118  // Refer the implementation of `_mm_movemask_epi8`
2119  uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
2120  uint32x2_t paired16 =
2121  vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
2122  uint8x8_t paired32 =
2123  vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
2124  return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
2125 #endif
2126 }
2127 
2128 // NEON does not provide this method
2129 // Creates a 4-bit mask from the most significant bits of the four
2130 // single-precision, floating-point values.
2131 // https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
2132 FORCE_INLINE int _mm_movemask_ps(__m128 a)
2133 {
2134  uint32x4_t input = vreinterpretq_u32_m128(a);
2135 #if defined(__aarch64__)
2136  static const int32x4_t shift = {0, 1, 2, 3};
2137  uint32x4_t tmp = vshrq_n_u32(input, 31);
2138  return vaddvq_u32(vshlq_u32(tmp, shift));
2139 #else
2140  // Uses the exact same method as _mm_movemask_epi8, see that for details.
2141  // Shift out everything but the sign bits with a 32-bit unsigned shift
2142  // right.
2143  uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2144  // Merge the two pairs together with a 64-bit unsigned shift right + add.
2145  uint8x16_t paired =
2146  vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2147  // Extract the result.
2148  return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2149 #endif
2150 }
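// Illustrative caller-side sketch (not part of the upstream header): a vector
// compare or sign test can be reduced to a scalar bitmask and branched on,
// just as on x86.
static inline int sse2neon_example_any_sign_set(__m128 v)
{
    return _mm_movemask_ps(v) != 0; // bit i of the mask is the sign bit of lane i
}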
2151 
2152 // Multiplies the four single-precision, floating-point values of a and b.
2153 //
2154 // r0 := a0 * b0
2155 // r1 := a1 * b1
2156 // r2 := a2 * b2
2157 // r3 := a3 * b3
2158 //
2159 // https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
2160 FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
2161 {
2162  return vreinterpretq_m128_f32(
2163  vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2164 }
2165 
2166 // Multiply the lower single-precision (32-bit) floating-point element in a and
2167 // b, store the result in the lower element of dst, and copy the upper 3 packed
2168 // elements from a to the upper elements of dst.
2169 //
2170 // dst[31:0] := a[31:0] * b[31:0]
2171 // dst[127:32] := a[127:32]
2172 //
2173 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
2174 FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
2175 {
2176  return _mm_move_ss(a, _mm_mul_ps(a, b));
2177 }
2178 
2179 // Multiply the packed unsigned 16-bit integers in a and b, producing
2180 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
2181 // integers in dst.
2182 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
2183 FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
2184 {
2185  return vreinterpret_m64_u16(vshrn_n_u32(
2186  vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
2187 }
2188 
2189 // Computes the bitwise OR of the four single-precision, floating-point values
2190 // of a and b.
2191 // https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
2192 FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
2193 {
2194  return vreinterpretq_m128_s32(
2195  vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2196 }
2197 
2198 // Average packed unsigned 8-bit integers in a and b, and store the results in
2199 // dst.
2200 //
2201 // FOR j := 0 to 7
2202 // i := j*8
2203 // dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
2204 // ENDFOR
2205 //
2206 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
2207 #define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2208 
2209 // Average packed unsigned 16-bit integers in a and b, and store the results in
2210 // dst.
2211 //
2212 // FOR j := 0 to 3
2213 // i := j*16
2214 // dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
2215 // ENDFOR
2216 //
2217 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
2218 #define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2219 
2220 // Extract a 16-bit integer from a, selected with imm8, and store the result in
2221 // the lower element of dst.
2222 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
2223 #define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
2224 
2225 // Copy a to dst, and insert the 16-bit integer i into dst at the location
2226 // specified by imm8.
2227 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
2228 #define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
2229 
2230 // Compare packed signed 16-bit integers in a and b, and store packed maximum
2231 // values in dst.
2232 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
2233 #define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
2234 
2235 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
2236 // values in dst.
2237 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
2238 #define _m_pmaxub(a, b) _mm_max_pu8(a, b)
2239 
2240 // Compare packed signed 16-bit integers in a and b, and store packed minimum
2241 // values in dst.
2242 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
2243 #define _m_pminsw(a, b) _mm_min_pi16(a, b)
2244 
2245 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
2246 // values in dst.
2247 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
2248 #define _m_pminub(a, b) _mm_min_pu8(a, b)
2249 
2250 // Create mask from the most significant bit of each 8-bit element in a, and
2251 // store the result in dst.
2252 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb
2253 #define _m_pmovmskb(a) _mm_movemask_pi8(a)
2254 
2255 // Multiply the packed unsigned 16-bit integers in a and b, producing
2256 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
2257 // integers in dst.
2258 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
2259 #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2260 
2261 // Loads one cache line of data from address p to a location closer to the
2262 // processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
2263 FORCE_INLINE void _mm_prefetch(const void *p, int i)
2264 {
2265  (void) i;
2266  __builtin_prefetch(p);
2267 }
2268 
2269 // Compute the absolute differences of packed unsigned 8-bit integers in a and
2270 // b, then horizontally sum each consecutive 8 differences to produce four
2271 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
2272 // 16 bits of dst.
2273 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw
2274 #define _m_psadbw(a, b) _mm_sad_pu8(a, b)
2275 
2276 // Shuffle 16-bit integers in a using the control in imm8, and store the results
2277 // in dst.
2278 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw
2279 #define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
2280 
2281 // Compute the approximate reciprocal of packed single-precision (32-bit)
2282 // floating-point elements in a, and store the results in dst. The maximum
2283 // relative error for this approximation is less than 1.5*2^-12.
2284 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
2285 FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
2286 {
2287  float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
2288  recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2289 #if SSE2NEON_PRECISE_DIV
2290  // Additional Newton-Raphson iteration for accuracy
2291  recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2292 #endif
2293  return vreinterpretq_m128_f32(recip);
2294 }
2295 
2296 // Compute the approximate reciprocal of the lower single-precision (32-bit)
2297 // floating-point element in a, store the result in the lower element of dst,
2298 // and copy the upper 3 packed elements from a to the upper elements of dst. The
2299 // maximum relative error for this approximation is less than 1.5*2^-12.
2300 //
2301 // dst[31:0] := (1.0 / a[31:0])
2302 // dst[127:32] := a[127:32]
2303 //
2304 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
2305 FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
2306 {
2307  return _mm_move_ss(a, _mm_rcp_ps(a));
2308 }
2309 
2310 // Computes the approximations of the reciprocal square roots of the four
2311 // single-precision floating point values of in.
2312 // The current precision is 1% error.
2313 // https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
2314 FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
2315 {
2316  float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2317 #if SSE2NEON_PRECISE_SQRT
2318  // Additional Newton-Raphson iteration for accuracy
2319  out = vmulq_f32(
2320  out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2321  out = vmulq_f32(
2322  out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2323 #endif
2324  return vreinterpretq_m128_f32(out);
2325 }
2326 
2327 // Compute the approximate reciprocal square root of the lower single-precision
2328 // (32-bit) floating-point element in a, store the result in the lower element
2329 // of dst, and copy the upper 3 packed elements from a to the upper elements of
2330 // dst.
2331 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
2332 FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
2333 {
2334  return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
2335 }
2336 
2337 // Compute the absolute differences of packed unsigned 8-bit integers in a and
2338 // b, then horizontally sum each consecutive 8 differences to produce four
2339 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
2340 // 16 bits of dst.
2341 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
2342 FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
2343 {
2344  uint64x1_t t = vpaddl_u32(vpaddl_u16(
2345  vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
2346  return vreinterpret_m64_u16(
2347  vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
2348 }
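// Illustrative caller-side sketch (not part of the upstream header; assumes
// _mm_set_pi8 is available, as it is elsewhere in this file): the eight
// absolute byte differences are summed into the low 16-bit lane of the result.
static inline __m64 sse2neon_example_sad_pu8(void)
{
    __m64 a = _mm_set_pi8(9, 8, 7, 6, 5, 4, 3, 2);
    __m64 b = _mm_set_pi8(8, 8, 8, 8, 8, 8, 8, 8);
    return _mm_sad_pu8(a, b); // low lane = 1+0+1+2+3+4+5+6 = 22
}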
2349 
2350 // Macro: Set the flush zero bits of the MXCSR control and status register to
2351 // the value in unsigned 32-bit integer a. The flush zero may contain any of the
2352 // following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
2353 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE
2354 FORCE_INLINE void _MM_SET_FLUSH_ZERO_MODE(unsigned int flag)
2355 {
2356  // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
2357  // regardless of the value of the FZ bit.
2358  union {
2359  fpcr_bitfield field;
2360 #if defined(__aarch64__)
2361  uint64_t value;
2362 #else
2363  uint32_t value;
2364 #endif
2365  } r;
2366 
2367 #if defined(__aarch64__)
2368  asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
2369 #else
2370  asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
2371 #endif
2372 
2373  r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
2374 
2375 #if defined(__aarch64__)
2376  asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
2377 #else
2378  asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */
2379 #endif
2380 }
2381 
2382 // Sets the four single-precision, floating-point values to the four inputs.
2383 // https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
2384 FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
2385 {
2386  float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
2387  return vreinterpretq_m128_f32(vld1q_f32(data));
2388 }
2389 
2390 // Sets the four single-precision, floating-point values to w.
2391 // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
2392 FORCE_INLINE __m128 _mm_set_ps1(float _w)
2393 {
2394  return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2395 }
2396 
2397 // Macro: Set the rounding mode bits of the MXCSR control and status register to
2398 // the value in unsigned 32-bit integer a. The rounding mode may contain any of
2399 // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
2400 // _MM_ROUND_TOWARD_ZERO
2401 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE
2402 FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
2403 {
2404  union {
2405  fpcr_bitfield field;
2406 #if defined(__aarch64__)
2407  uint64_t value;
2408 #else
2409  uint32_t value;
2410 #endif
2411  } r;
2412 
2413 #if defined(__aarch64__)
2414  asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
2415 #else
2416  asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
2417 #endif
2418 
2419  switch (rounding) {
2420  case _MM_ROUND_TOWARD_ZERO:
2421  r.field.bit22 = 1;
2422  r.field.bit23 = 1;
2423  break;
2424  case _MM_ROUND_DOWN:
2425  r.field.bit22 = 0;
2426  r.field.bit23 = 1;
2427  break;
2428  case _MM_ROUND_UP:
2429  r.field.bit22 = 1;
2430  r.field.bit23 = 0;
2431  break;
2432  default: //_MM_ROUND_NEAREST
2433  r.field.bit22 = 0;
2434  r.field.bit23 = 0;
2435  }
2436 
2437 #if defined(__aarch64__)
2438  asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
2439 #else
2440  asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */
2441 #endif
2442 }
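// Illustrative caller-side sketch (not part of the upstream header): switch to
// truncation for one conversion, then restore the previous mode so later code
// is unaffected.
static inline int64_t sse2neon_example_cvt_truncated(__m128 a)
{
    unsigned int saved = _MM_GET_ROUNDING_MODE();
    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    int64_t r = _mm_cvtss_si64(a); // honours the rounding mode just installed
    _MM_SET_ROUNDING_MODE(saved);
    return r;
}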
2443 
2444 // Copy single-precision (32-bit) floating-point element a to the lower element
2445 // of dst, and zero the upper 3 elements.
2446 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
2447 FORCE_INLINE __m128 _mm_set_ss(float a)
2448 {
2449  float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
2450  return vreinterpretq_m128_f32(vld1q_f32(data));
2451 }
2452 
2453 // Sets the four single-precision, floating-point values to w.
2454 //
2455 // r0 := r1 := r2 := r3 := w
2456 //
2457 // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
2458 FORCE_INLINE __m128 _mm_set1_ps(float _w)
2459 {
2460  return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2461 }
2462 
2463 // FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
2464 FORCE_INLINE void _mm_setcsr(unsigned int a)
2465 {
2466  _MM_SET_ROUNDING_MODE(a);
2467 }
2468 
2469 // FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
2470 FORCE_INLINE unsigned int _mm_getcsr()
2471 {
2472  return _MM_GET_ROUNDING_MODE();
2473 }
2474 
2475 // Sets the four single-precision, floating-point values to the four inputs in
2476 // reverse order.
2477 // https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
2478 FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
2479 {
2480  float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
2481  return vreinterpretq_m128_f32(vld1q_f32(data));
2482 }
2483 
2484 // Clears the four single-precision, floating-point values.
2485 // https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
2486 FORCE_INLINE __m128 _mm_setzero_ps(void)
2487 {
2488  return vreinterpretq_m128_f32(vdupq_n_f32(0));
2489 }
2490 
2491 // Shuffle 16-bit integers in a using the control in imm8, and store the results
2492 // in dst.
2493 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16
2494 #if __has_builtin(__builtin_shufflevector)
2495 #define _mm_shuffle_pi16(a, imm) \
2496  __extension__({ \
2497  vreinterpret_m64_s16(__builtin_shufflevector( \
2498  vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
2499  ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))); \
2500  })
2501 #else
2502 #define _mm_shuffle_pi16(a, imm) \
2503  __extension__({ \
2504  int16x4_t ret; \
2505  ret = \
2506  vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
2507  ret = vset_lane_s16( \
2508  vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \
2509  1); \
2510  ret = vset_lane_s16( \
2511  vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \
2512  2); \
2513  ret = vset_lane_s16( \
2514  vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \
2515  3); \
2516  vreinterpret_m64_s16(ret); \
2517  })
2518 #endif
2519 
2520 // Guarantees that every preceding store is globally visible before any
2521 // subsequent store.
2522 // https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
2523 FORCE_INLINE void _mm_sfence(void)
2524 {
2525  __sync_synchronize();
2526 }
2527 
2528 // FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
2529 // int imm)
2530 #if __has_builtin(__builtin_shufflevector)
2531 #define _mm_shuffle_ps(a, b, imm) \
2532  __extension__({ \
2533  float32x4_t _input1 = vreinterpretq_f32_m128(a); \
2534  float32x4_t _input2 = vreinterpretq_f32_m128(b); \
2535  float32x4_t _shuf = __builtin_shufflevector( \
2536  _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
2537  (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
2538  vreinterpretq_m128_f32(_shuf); \
2539  })
2540 #else // generic
2541 #define _mm_shuffle_ps(a, b, imm) \
2542  __extension__({ \
2543  __m128 ret; \
2544  switch (imm) { \
2545  case _MM_SHUFFLE(1, 0, 3, 2): \
2546  ret = _mm_shuffle_ps_1032((a), (b)); \
2547  break; \
2548  case _MM_SHUFFLE(2, 3, 0, 1): \
2549  ret = _mm_shuffle_ps_2301((a), (b)); \
2550  break; \
2551  case _MM_SHUFFLE(0, 3, 2, 1): \
2552  ret = _mm_shuffle_ps_0321((a), (b)); \
2553  break; \
2554  case _MM_SHUFFLE(2, 1, 0, 3): \
2555  ret = _mm_shuffle_ps_2103((a), (b)); \
2556  break; \
2557  case _MM_SHUFFLE(1, 0, 1, 0): \
2558  ret = _mm_movelh_ps((a), (b)); \
2559  break; \
2560  case _MM_SHUFFLE(1, 0, 0, 1): \
2561  ret = _mm_shuffle_ps_1001((a), (b)); \
2562  break; \
2563  case _MM_SHUFFLE(0, 1, 0, 1): \
2564  ret = _mm_shuffle_ps_0101((a), (b)); \
2565  break; \
2566  case _MM_SHUFFLE(3, 2, 1, 0): \
2567  ret = _mm_shuffle_ps_3210((a), (b)); \
2568  break; \
2569  case _MM_SHUFFLE(0, 0, 1, 1): \
2570  ret = _mm_shuffle_ps_0011((a), (b)); \
2571  break; \
2572  case _MM_SHUFFLE(0, 0, 2, 2): \
2573  ret = _mm_shuffle_ps_0022((a), (b)); \
2574  break; \
2575  case _MM_SHUFFLE(2, 2, 0, 0): \
2576  ret = _mm_shuffle_ps_2200((a), (b)); \
2577  break; \
2578  case _MM_SHUFFLE(3, 2, 0, 2): \
2579  ret = _mm_shuffle_ps_3202((a), (b)); \
2580  break; \
2581  case _MM_SHUFFLE(3, 2, 3, 2): \
2582  ret = _mm_movehl_ps((b), (a)); \
2583  break; \
2584  case _MM_SHUFFLE(1, 1, 3, 3): \
2585  ret = _mm_shuffle_ps_1133((a), (b)); \
2586  break; \
2587  case _MM_SHUFFLE(2, 0, 1, 0): \
2588  ret = _mm_shuffle_ps_2010((a), (b)); \
2589  break; \
2590  case _MM_SHUFFLE(2, 0, 0, 1): \
2591  ret = _mm_shuffle_ps_2001((a), (b)); \
2592  break; \
2593  case _MM_SHUFFLE(2, 0, 3, 2): \
2594  ret = _mm_shuffle_ps_2032((a), (b)); \
2595  break; \
2596  default: \
2597  ret = _mm_shuffle_ps_default((a), (b), (imm)); \
2598  break; \
2599  } \
2600  ret; \
2601  })
2602 #endif
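// Illustrative caller-side sketch (not part of the upstream header):
// _MM_SHUFFLE(z, y, x, w) picks lanes z and y of b for the high half of dst
// and lanes x and w of a for the low half, exactly as on x86.
static inline __m128 sse2neon_example_shuffle_ps(__m128 a, __m128 b)
{
    // dst = { a[2], a[0], b[1], b[3] } in lane order 0..3
    return _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 0, 2));
}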
2603 
2604 // Computes the approximations of square roots of the four single-precision,
2605 // floating-point values of a. First computes reciprocal square roots and then
2606 // reciprocals of the four values.
2607 //
2608 // r0 := sqrt(a0)
2609 // r1 := sqrt(a1)
2610 // r2 := sqrt(a2)
2611 // r3 := sqrt(a3)
2612 //
2613 // https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
2614 FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
2615 {
2616 #if SSE2NEON_PRECISE_SQRT
2617  float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2618 
2619  // Test for vrsqrteq_f32(0) -> positive infinity case.
2620  // Change to zero, so that s * 1/sqrt(s) result is zero too.
2621  const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2622  const uint32x4_t div_by_zero =
2623  vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
2624  recip = vreinterpretq_f32_u32(
2625  vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
2626 
2627  // Additional Newton-Raphson iteration for accuracy
2628  recip = vmulq_f32(
2629  vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2630  recip);
2631  recip = vmulq_f32(
2632  vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2633  recip);
2634 
2635  // sqrt(s) = s * 1/sqrt(s)
2636  return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
2637 #elif defined(__aarch64__)
2638  return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
2639 #else
2640  float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2641  float32x4_t sq = vrecpeq_f32(recipsq);
2642  return vreinterpretq_m128_f32(sq);
2643 #endif
2644 }
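// Illustrative caller-side sketch (not part of the upstream header): on ARMv7
// without SSE2NEON_PRECISE_SQRT this is only an estimate (reciprocal square
// root followed by a reciprocal); AArch64 uses an exact vsqrtq_f32.
static inline __m128 sse2neon_example_sqrt_ps(void)
{
    __m128 v = _mm_set_ps(16.0f, 9.0f, 4.0f, 1.0f);
    return _mm_sqrt_ps(v); // approximately {1.0f, 2.0f, 3.0f, 4.0f}
}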
2645 
2646 // Computes the approximation of the square root of the scalar single-precision
2647 // floating point value of in.
2648 // https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
2649 FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
2650 {
2651  float32_t value =
2652  vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
2653  return vreinterpretq_m128_f32(
2654  vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
2655 }
2656 
2657 // Stores four single-precision, floating-point values.
2658 // https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
2659 FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
2660 {
2661  vst1q_f32(p, vreinterpretq_f32_m128(a));
2662 }
2663 
2664 // Store the lower single-precision (32-bit) floating-point element from a into
2665 // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2666 // boundary or a general-protection exception may be generated.
2667 //
2668 // MEM[mem_addr+31:mem_addr] := a[31:0]
2669 // MEM[mem_addr+63:mem_addr+32] := a[31:0]
2670 // MEM[mem_addr+95:mem_addr+64] := a[31:0]
2671 // MEM[mem_addr+127:mem_addr+96] := a[31:0]
2672 //
2673 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
2674 FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
2675 {
2676  float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
2677  vst1q_f32(p, vdupq_n_f32(a0));
2678 }
2679 
2680 // Stores the lower single-precision, floating-point value.
2681 // https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
2682 FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
2683 {
2684  vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
2685 }
2686 
2687 // Store the lower single-precision (32-bit) floating-point element from a into
2688 // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2689 // boundary or a general-protection exception may be generated.
2690 //
2691 // MEM[mem_addr+31:mem_addr] := a[31:0]
2692 // MEM[mem_addr+63:mem_addr+32] := a[31:0]
2693 // MEM[mem_addr+95:mem_addr+64] := a[31:0]
2694 // MEM[mem_addr+127:mem_addr+96] := a[31:0]
2695 //
2696 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
2697 #define _mm_store1_ps _mm_store_ps1
2698 
2699 // Stores the upper two single-precision, floating-point values of a to the
2700 // address p.
2701 //
2702 // *p0 := a2
2703 // *p1 := a3
2704 //
2705 // https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
2706 FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
2707 {
2708  *p = vreinterpret_m64_f32(vget_high_f32(a));
2709 }
2710 
2711 // Stores the lower two single-precision floating point values of a to the
2712 // address p.
2713 //
2714 // *p0 := a0
2715 // *p1 := a1
2716 //
2717 // https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
2718 FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
2719 {
2720  *p = vreinterpret_m64_f32(vget_low_f32(a));
2721 }
2722 
2723 // Store 4 single-precision (32-bit) floating-point elements from a into memory
2724 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
2725 // general-protection exception may be generated.
2726 //
2727 // MEM[mem_addr+31:mem_addr] := a[127:96]
2728 // MEM[mem_addr+63:mem_addr+32] := a[95:64]
2729 // MEM[mem_addr+95:mem_addr+64] := a[63:32]
2730 // MEM[mem_addr+127:mem_addr+96] := a[31:0]
2731 //
2732 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
2733 FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
2734 {
2735  float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
2736  float32x4_t rev = vextq_f32(tmp, tmp, 2);
2737  vst1q_f32(p, rev);
2738 }
2739 
2740 // Stores four single-precision, floating-point values.
2741 // https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
2742 FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
2743 {
2744  vst1q_f32(p, vreinterpretq_f32_m128(a));
2745 }
2746 
2747 // Stores 16-bits of integer data a at the address p.
2748 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16
2749 FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
2750 {
2751  vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
2752 }
2753 
2754 // Stores 64-bits of integer data a at the address p.
2755 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64
2756 FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
2757 {
2758  vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
2759 }
2760 
2761 // Store 64-bits of integer data from a into memory using a non-temporal memory
2762 // hint.
2763 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi
2764 FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
2765 {
2766  vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
2767 }
2768 
2769 // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
2770 // point elements) from a into memory using a non-temporal memory hint.
2771 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
2772 FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
2773 {
2774 #if __has_builtin(__builtin_nontemporal_store)
2775  __builtin_nontemporal_store(a, (float32x4_t *) p);
2776 #else
2777  vst1q_f32(p, vreinterpretq_f32_m128(a));
2778 #endif
2779 }
2780 
2781 // Subtracts the four single-precision, floating-point values of a and b.
2782 //
2783 // r0 := a0 - b0
2784 // r1 := a1 - b1
2785 // r2 := a2 - b2
2786 // r3 := a3 - b3
2787 //
2788 // https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
2789 FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2790 {
2791  return vreinterpretq_m128_f32(
2792  vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2793 }
2794 
2795 // Subtract the lower single-precision (32-bit) floating-point element in b from
2796 // the lower single-precision (32-bit) floating-point element in a, store the
2797 // result in the lower element of dst, and copy the upper 3 packed elements from
2798 // a to the upper elements of dst.
2799 //
2800 // dst[31:0] := a[31:0] - b[31:0]
2801 // dst[127:32] := a[127:32]
2802 //
2803 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
2804 FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2805 {
2806  return _mm_move_ss(a, _mm_sub_ps(a, b));
2807 }
2808 
2809 // Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
2810 // (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
2811 // transposed matrix in these vectors (row0 now contains column 0, etc.).
2812 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
2813 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
2814  do { \
2815  float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
2816  float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
2817  row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
2818  vget_low_f32(ROW23.val[0])); \
2819  row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
2820  vget_low_f32(ROW23.val[1])); \
2821  row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
2822  vget_high_f32(ROW23.val[0])); \
2823  row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
2824  vget_high_f32(ROW23.val[1])); \
2825  } while (0)
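// Illustrative caller-side sketch (not part of the upstream header): the macro
// rewrites its four arguments in place, so they must be plain lvalues.
static inline void sse2neon_example_transpose(void)
{
    __m128 r0 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 r1 = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 r2 = _mm_setr_ps(9.0f, 10.0f, 11.0f, 12.0f);
    __m128 r3 = _mm_setr_ps(13.0f, 14.0f, 15.0f, 16.0f);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
    // r0 now holds {1, 5, 9, 13}, r1 {2, 6, 10, 14}, and so on.
    (void) r0; (void) r1; (void) r2; (void) r3;
}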
2826 
2827 // according to the documentation, these intrinsics behave the same as the
2828 // non-'u' versions. We'll just alias them here.
2829 #define _mm_ucomieq_ss _mm_comieq_ss
2830 #define _mm_ucomige_ss _mm_comige_ss
2831 #define _mm_ucomigt_ss _mm_comigt_ss
2832 #define _mm_ucomile_ss _mm_comile_ss
2833 #define _mm_ucomilt_ss _mm_comilt_ss
2834 #define _mm_ucomineq_ss _mm_comineq_ss
2835 
2836 // Return vector of type __m128i with undefined elements.
2837 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_undefined_si128
2838 FORCE_INLINE __m128i _mm_undefined_si128(void)
2839 {
2840 #if defined(__GNUC__) || defined(__clang__)
2841 #pragma GCC diagnostic push
2842 #pragma GCC diagnostic ignored "-Wuninitialized"
2843 #endif
2844  __m128i a;
2845  return a;
2846 #if defined(__GNUC__) || defined(__clang__)
2847 #pragma GCC diagnostic pop
2848 #endif
2849 }
2850 
2851 // Return vector of type __m128 with undefined elements.
2852 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
2853 FORCE_INLINE __m128 _mm_undefined_ps(void)
2854 {
2855 #if defined(__GNUC__) || defined(__clang__)
2856 #pragma GCC diagnostic push
2857 #pragma GCC diagnostic ignored "-Wuninitialized"
2858 #endif
2859  __m128 a;
2860  return a;
2861 #if defined(__GNUC__) || defined(__clang__)
2862 #pragma GCC diagnostic pop
2863 #endif
2864 }
2865 
2866 // Selects and interleaves the upper two single-precision, floating-point values
2867 // from a and b.
2868 //
2869 // r0 := a2
2870 // r1 := b2
2871 // r2 := a3
2872 // r3 := b3
2873 //
2874 // https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
2875 FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
2876 {
2877 #if defined(__aarch64__)
2878  return vreinterpretq_m128_f32(
2879  vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2880 #else
2881  float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
2882  float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
2883  float32x2x2_t result = vzip_f32(a1, b1);
2884  return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2885 #endif
2886 }
2887 
2888 // Selects and interleaves the lower two single-precision, floating-point values
2889 // from a and b.
2890 //
2891 // r0 := a0
2892 // r1 := b0
2893 // r2 := a1
2894 // r3 := b1
2895 //
2896 // https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
2897 FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
2898 {
2899 #if defined(__aarch64__)
2900  return vreinterpretq_m128_f32(
2901  vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2902 #else
2903  float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
2904  float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
2905  float32x2x2_t result = vzip_f32(a1, b1);
2906  return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2907 #endif
2908 }
2909 
2910 // Computes bitwise EXOR (exclusive-or) of the four single-precision,
2911 // floating-point values of a and b.
2912 // https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
2913 FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
2914 {
2915  return vreinterpretq_m128_s32(
2916  veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2917 }
2918 
2919 /* SSE2 */
2920 
2921 // Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
2922 // unsigned 16-bit integers in b.
2923 // https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
2924 FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
2925 {
2926  return vreinterpretq_m128i_s16(
2927  vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2928 }
2929 
2930 // Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
2931 // unsigned 32-bit integers in b.
2932 //
2933 // r0 := a0 + b0
2934 // r1 := a1 + b1
2935 // r2 := a2 + b2
2936 // r3 := a3 + b3
2937 //
2938 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2939 FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
2940 {
2941  return vreinterpretq_m128i_s32(
2942  vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2943 }
2944 
2945 // Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
2946 // unsigned 64-bit integers in b.
2947 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2948 FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
2949 {
2950  return vreinterpretq_m128i_s64(
2951  vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2952 }
2953 
2954 // Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
2955 // unsigned 8-bit integers in b.
2956 // https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
2957 FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
2958 {
2959  return vreinterpretq_m128i_s8(
2960  vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2961 }
2962 
2963 // Add packed double-precision (64-bit) floating-point elements in a and b, and
2964 // store the results in dst.
2965 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
2966 FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
2967 {
2968 #if defined(__aarch64__)
2969  return vreinterpretq_m128d_f64(
2970  vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
2971 #else
2972  double *da = (double *) &a;
2973  double *db = (double *) &b;
2974  double c[2];
2975  c[0] = da[0] + db[0];
2976  c[1] = da[1] + db[1];
2977  return vld1q_f32((float32_t *) c);
2978 #endif
2979 }
2980 
2981 // Add the lower double-precision (64-bit) floating-point element in a and b,
2982 // store the result in the lower element of dst, and copy the upper element from
2983 // a to the upper element of dst.
2984 //
2985 // dst[63:0] := a[63:0] + b[63:0]
2986 // dst[127:64] := a[127:64]
2987 //
2988 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
2989 FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
2990 {
2991 #if defined(__aarch64__)
2992  return _mm_move_sd(a, _mm_add_pd(a, b));
2993 #else
2994  double *da = (double *) &a;
2995  double *db = (double *) &b;
2996  double c[2];
2997  c[0] = da[0] + db[0];
2998  c[1] = da[1];
2999  return vld1q_f32((float32_t *) c);
3000 #endif
3001 }
3002 
3003 // Add 64-bit integers a and b, and store the result in dst.
3004 //
3005 // dst[63:0] := a[63:0] + b[63:0]
3006 //
3007 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
3008 FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
3009 {
3010  return vreinterpret_m64_s64(
3011  vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
3012 }
3013 
3014 // Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
3015 // and saturates.
3016 //
3017 // r0 := SignedSaturate(a0 + b0)
3018 // r1 := SignedSaturate(a1 + b1)
3019 // ...
3020 // r7 := SignedSaturate(a7 + b7)
3021 //
3022 // https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
3023 FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
3024 {
3025  return vreinterpretq_m128i_s16(
3026  vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3027 }
3028 
3029 // Add packed signed 8-bit integers in a and b using saturation, and store the
3030 // results in dst.
3031 //
3032 // FOR j := 0 to 15
3033 // i := j*8
3034 // dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
3035 // ENDFOR
3036 //
3037 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
3038 FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
3039 {
3040  return vreinterpretq_m128i_s8(
3041  vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3042 }
3043 
3044 // Add packed unsigned 16-bit integers in a and b using saturation, and store
3045 // the results in dst.
3046 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
3047 FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
3048 {
3049  return vreinterpretq_m128i_u16(
3050  vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
3051 }
3052 
3053 // Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
3054 // b and saturates.
3055 // https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
3056 FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
3057 {
3058  return vreinterpretq_m128i_u8(
3059  vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3060 }
3061 
3062 // Compute the bitwise AND of packed double-precision (64-bit) floating-point
3063 // elements in a and b, and store the results in dst.
3064 //
3065 // FOR j := 0 to 1
3066 // i := j*64
3067 // dst[i+63:i] := a[i+63:i] AND b[i+63:i]
3068 // ENDFOR
3069 //
3070 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
3071 FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
3072 {
3073  return vreinterpretq_m128d_s64(
3074  vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
3075 }
3076 
3077 // Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
3078 // b.
3079 //
3080 // r := a & b
3081 //
3082 // https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
3083 FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
3084 {
3085  return vreinterpretq_m128i_s32(
3086  vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3087 }
3088 
3089 // Compute the bitwise NOT of packed double-precision (64-bit) floating-point
3090 // elements in a and then AND with b, and store the results in dst.
3091 //
3092 // FOR j := 0 to 1
3093 // i := j*64
3094 // dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
3095 // ENDFOR
3096 //
3097 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
3098 FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
3099 {
3100  // *NOTE* argument swap
3101  return vreinterpretq_m128d_s64(
3102  vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
3103 }
3104 
3105 // Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
3106 // 128-bit value in a.
3107 //
3108 // r := (~a) & b
3109 //
3110 // https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
3111 FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
3112 {
3113  return vreinterpretq_m128i_s32(
3114  vbicq_s32(vreinterpretq_s32_m128i(b),
3115  vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
3116 }
3117 
3118 // Computes the average of the 8 unsigned 16-bit integers in a and the 8
3119 // unsigned 16-bit integers in b and rounds.
3120 //
3121 // r0 := (a0 + b0) / 2
3122 // r1 := (a1 + b1) / 2
3123 // ...
3124 // r7 := (a7 + b7) / 2
3125 //
3126 // https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
3127 FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
3128 {
3129  return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
3130  vreinterpretq_u16_m128i(b));
3131 }
3132 
3133 // Computes the average of the 16 unsigned 8-bit integers in a and the 16
3134 // unsigned 8-bit integers in b and rounds.
3135 //
3136 // r0 := (a0 + b0) / 2
3137 // r1 := (a1 + b1) / 2
3138 // ...
3139 // r15 := (a15 + b15) / 2
3140 //
3141 // https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
3142 FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
3143 {
3144  return vreinterpretq_m128i_u8(
3145  vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3146 }
3147 
3148 // Shift a left by imm8 bytes while shifting in zeros, and store the results in
3149 // dst.
3150 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128
3151 #define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
3152 
3153 // Shift a right by imm8 bytes while shifting in zeros, and store the results in
3154 // dst.
3155 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128
3156 #define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
3157 
3158 // Cast vector of type __m128d to type __m128. This intrinsic is only used for
3159 // compilation and does not generate any instructions, thus it has zero latency.
3160 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
3161 FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
3162 {
3163  return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
3164 }
3165 
3166 // Cast vector of type __m128d to type __m128i. This intrinsic is only used for
3167 // compilation and does not generate any instructions, thus it has zero latency.
3168 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
3169 FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
3170 {
3171  return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
3172 }
3173 
3174 // Cast vector of type __m128 to type __m128d. This intrinsic is only used for
3175 // compilation and does not generate any instructions, thus it has zero latency.
3176 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
3177 FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
3178 {
3179  return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
3180 }
3181 
3182 // Applies a type cast to reinterpret four 32-bit floating point values passed
3183 // in as a 128-bit parameter as packed 32-bit integers.
3184 // https://msdn.microsoft.com/en-us/library/bb514099.aspx
3185 FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
3186 {
3187  return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
3188 }
3189 
3190 // Cast vector of type __m128i to type __m128d. This intrinsic is only used for
3191 // compilation and does not generate any instructions, thus it has zero latency.
3192 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
3193 FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
3194 {
3195 #if defined(__aarch64__)
3196  return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
3197 #else
3198  return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
3199 #endif
3200 }
3201 
3202 // Applies a type cast to reinterpret four 32-bit integers passed in as a
3203 // 128-bit parameter as packed 32-bit floating point values.
3204 // https://msdn.microsoft.com/en-us/library/bb514029.aspx
3205 FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
3206 {
3207  return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
3208 }
3209 
3210 // Cache line containing p is flushed and invalidated from all caches in the
3211 // coherency domain. :
3212 // https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
3213 FORCE_INLINE void _mm_clflush(void const *p)
3214 {
3215  (void) p;
3216  // no direct NEON equivalent; this is a no-op
3217 }
3218 
3219 // Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
3220 // unsigned 16-bit integers in b for equality.
3221 // https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
3222 FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
3223 {
3224  return vreinterpretq_m128i_u16(
3225  vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3226 }
3227 
3228 // Compare packed 32-bit integers in a and b for equality, and store the results
3229 // in dst
3230 FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
3231 {
3232  return vreinterpretq_m128i_u32(
3233  vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3234 }
3235 
3236 // Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
3237 // unsigned 8-bit integers in b for equality.
3238 // https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
3239 FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
3240 {
3241  return vreinterpretq_m128i_u8(
3242  vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3243 }
3244 
3245 // Compare packed double-precision (64-bit) floating-point elements in a and b
3246 // for equality, and store the results in dst.
3247 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd
3248 FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
3249 {
3250 #if defined(__aarch64__)
3251  return vreinterpretq_m128d_u64(
3252  vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3253 #else
3254  // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3255  uint32x4_t cmp =
3256  vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3257  uint32x4_t swapped = vrev64q_u32(cmp);
3258  return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
3259 #endif
3260 }
3261 
3262 // Compare the lower double-precision (64-bit) floating-point elements in a and
3263 // b for equality, store the result in the lower element of dst, and copy the
3264 // upper element from a to the upper element of dst.
3265 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd
3266 FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
3267 {
3268  return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
3269 }
3270 
3271 // Compare packed double-precision (64-bit) floating-point elements in a and b
3272 // for greater-than-or-equal, and store the results in dst.
3273 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd
3274 FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
3275 {
3276 #if defined(__aarch64__)
3277  return vreinterpretq_m128d_u64(
3278  vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3279 #else
3280  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3281  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3282  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3283  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3284  uint64_t d[2];
3285  d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3286  d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3287 
3288  return vreinterpretq_m128d_u64(vld1q_u64(d));
3289 #endif
3290 }
3291 
3292 // Compare the lower double-precision (64-bit) floating-point elements in a and
3293 // b for greater-than-or-equal, store the result in the lower element of dst,
3294 // and copy the upper element from a to the upper element of dst.
3295 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd
3296 FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
3297 {
3298 #if defined(__aarch64__)
3299  return _mm_move_sd(a, _mm_cmpge_pd(a, b));
3300 #else
3301  // expand "_mm_cmpge_pd()" to reduce unnecessary operations
3302  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3303  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3304  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3305  uint64_t d[2];
3306  d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3307  d[1] = a1;
3308 
3309  return vreinterpretq_m128d_u64(vld1q_u64(d));
3310 #endif
3311 }
3312 
3313 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
3314 // in b for greater than.
3315 //
3316 // r0 := (a0 > b0) ? 0xffff : 0x0
3317 // r1 := (a1 > b1) ? 0xffff : 0x0
3318 // ...
3319 // r7 := (a7 > b7) ? 0xffff : 0x0
3320 //
3321 // https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
3322 FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
3323 {
3324  return vreinterpretq_m128i_u16(
3325  vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3326 }
3327 
3328 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
3329 // in b for greater than.
3330 // https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
3331 FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
3332 {
3333  return vreinterpretq_m128i_u32(
3334  vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3335 }
3336 
3337 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
3338 // in b for greater than.
3339 //
3340 // r0 := (a0 > b0) ? 0xff : 0x0
3341 // r1 := (a1 > b1) ? 0xff : 0x0
3342 // ...
3343 // r15 := (a15 > b15) ? 0xff : 0x0
3344 //
3345 // https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
3346 FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
3347 {
3348  return vreinterpretq_m128i_u8(
3349  vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3350 }
3351 
3352 // Compare packed double-precision (64-bit) floating-point elements in a and b
3353 // for greater-than, and store the results in dst.
3354 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd
3355 FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
3356 {
3357 #if defined(__aarch64__)
3358  return vreinterpretq_m128d_u64(
3359  vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3360 #else
3361  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3362  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3363  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3364  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3365  uint64_t d[2];
3366  d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3367  d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3368 
3369  return vreinterpretq_m128d_u64(vld1q_u64(d));
3370 #endif
3371 }
3372 
3373 // Compare the lower double-precision (64-bit) floating-point elements in a and
3374 // b for greater-than, store the result in the lower element of dst, and copy
3375 // the upper element from a to the upper element of dst.
3376 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd
3377 FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
3378 {
3379 #if defined(__aarch64__)
3380  return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
3381 #else
3382  // expand "_mm_cmpgt_pd()" to reduce unnecessary operations
3383  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3384  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3385  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3386  uint64_t d[2];
3387  d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3388  d[1] = a1;
3389 
3390  return vreinterpretq_m128d_u64(vld1q_u64(d));
3391 #endif
3392 }
3393 
3394 // Compare packed double-precision (64-bit) floating-point elements in a and b
3395 // for less-than-or-equal, and store the results in dst.
3396 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd
3397 FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
3398 {
3399 #if defined(__aarch64__)
3400  return vreinterpretq_m128d_u64(
3401  vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3402 #else
3403  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3404  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3405  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3406  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3407  uint64_t d[2];
3408  d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3409  d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3410 
3411  return vreinterpretq_m128d_u64(vld1q_u64(d));
3412 #endif
3413 }
3414 
3415 // Compare the lower double-precision (64-bit) floating-point elements in a and
3416 // b for less-than-or-equal, store the result in the lower element of dst, and
3417 // copy the upper element from a to the upper element of dst.
3418 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd
3419 FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
3420 {
3421 #if defined(__aarch64__)
3422  return _mm_move_sd(a, _mm_cmple_pd(a, b));
3423 #else
3424  // expand "_mm_cmple_pd()" to reduce unnecessary operations
3425  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3426  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3427  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3428  uint64_t d[2];
3429  d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3430  d[1] = a1;
3431 
3432  return vreinterpretq_m128d_u64(vld1q_u64(d));
3433 #endif
3434 }
3435 
3436 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
3437 // in b for less than.
3438 //
3439 // r0 := (a0 < b0) ? 0xffff : 0x0
3440 // r1 := (a1 < b1) ? 0xffff : 0x0
3441 // ...
3442 // r7 := (a7 < b7) ? 0xffff : 0x0
3443 //
3444 // https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
3445 FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
3446 {
3447  return vreinterpretq_m128i_u16(
3448  vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3449 }
3450 
3451 
3452 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
3453 // in b for less than.
3454 // https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
3455 FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
3456 {
3457  return vreinterpretq_m128i_u32(
3458  vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3459 }
3460 
3461 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
3462 // in b for less than.
3463 // https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
3464 FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
3465 {
3466  return vreinterpretq_m128i_u8(
3467  vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3468 }
3469 
3470 // Compare packed double-precision (64-bit) floating-point elements in a and b
3471 // for less-than, and store the results in dst.
3472 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd
3473 FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
3474 {
3475 #if defined(__aarch64__)
3476  return vreinterpretq_m128d_u64(
3477  vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3478 #else
3479  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3480  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3481  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3482  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3483  uint64_t d[2];
3484  d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3485  d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3486 
3487  return vreinterpretq_m128d_u64(vld1q_u64(d));
3488 #endif
3489 }
3490 
3491 // Compare the lower double-precision (64-bit) floating-point elements in a and
3492 // b for less-than, store the result in the lower element of dst, and copy the
3493 // upper element from a to the upper element of dst.
3494 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd
3495 FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
3496 {
3497 #if defined(__aarch64__)
3498  return _mm_move_sd(a, _mm_cmplt_pd(a, b));
3499 #else
3500  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3501  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3502  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3503  uint64_t d[2];
3504  d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3505  d[1] = a1;
3506 
3507  return vreinterpretq_m128d_u64(vld1q_u64(d));
3508 #endif
3509 }
3510 
3511 // Compare packed double-precision (64-bit) floating-point elements in a and b
3512 // for not-equal, and store the results in dst.
3513 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd
3514 FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
3515 {
3516 #if defined(__aarch64__)
3517  return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
3518  vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
3519 #else
3520  // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3521  uint32x4_t cmp =
3522  vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3523  uint32x4_t swapped = vrev64q_u32(cmp);
3524  return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
3525 #endif
3526 }
3527 
3528 // Compare the lower double-precision (64-bit) floating-point elements in a and
3529 // b for not-equal, store the result in the lower element of dst, and copy the
3530 // upper element from a to the upper element of dst.
3531 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd
3532 FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
3533 {
3534  return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
3535 }
3536 
3537 // Compare packed double-precision (64-bit) floating-point elements in a and b
3538 // for not-greater-than-or-equal, and store the results in dst.
3539 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd
3540 FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
3541 {
3542 #if defined(__aarch64__)
3543  return vreinterpretq_m128d_u64(veorq_u64(
3544  vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3545  vdupq_n_u64(UINT64_MAX)));
3546 #else
3547  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3548  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3549  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3550  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3551  uint64_t d[2];
3552  d[0] =
3553  !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3554  d[1] =
3555  !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3556 
3557  return vreinterpretq_m128d_u64(vld1q_u64(d));
3558 #endif
3559 }
3560 
3561 // Compare the lower double-precision (64-bit) floating-point elements in a and
3562 // b for not-greater-than-or-equal, store the result in the lower element of
3563 // dst, and copy the upper element from a to the upper element of dst.
3564 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd
3565 FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
3566 {
3567  return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
3568 }
3569 
3570 // Compare packed double-precision (64-bit) floating-point elements in a and b
3571 // for not-greater-than, and store the results in dst.
3572 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cmpngt_pd
3573 FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
3574 {
3575 #if defined(__aarch64__)
3576  return vreinterpretq_m128d_u64(veorq_u64(
3577  vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3578  vdupq_n_u64(UINT64_MAX)));
3579 #else
3580  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3581  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3582  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3583  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3584  uint64_t d[2];
3585  d[0] =
3586  !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3587  d[1] =
3588  !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3589 
3590  return vreinterpretq_m128d_u64(vld1q_u64(d));
3591 #endif
3592 }
3593 
3594 // Compare the lower double-precision (64-bit) floating-point elements in a and
3595 // b for not-greater-than, store the result in the lower element of dst, and
3596 // copy the upper element from a to the upper element of dst.
3597 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd
3598 FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
3599 {
3600  return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
3601 }
3602 
3603 // Compare packed double-precision (64-bit) floating-point elements in a and b
3604 // for not-less-than-or-equal, and store the results in dst.
3605 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd
3606 FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
3607 {
3608 #if defined(__aarch64__)
3609  return vreinterpretq_m128d_u64(veorq_u64(
3610  vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3611  vdupq_n_u64(UINT64_MAX)));
3612 #else
3613  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3614  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3615  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3616  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3617  uint64_t d[2];
3618  d[0] =
3619  !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3620  d[1] =
3621  !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3622 
3623  return vreinterpretq_m128d_u64(vld1q_u64(d));
3624 #endif
3625 }
3626 
3627 // Compare the lower double-precision (64-bit) floating-point elements in a and
3628 // b for not-less-than-or-equal, store the result in the lower element of dst,
3629 // and copy the upper element from a to the upper element of dst.
3630 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd
3631 FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
3632 {
3633  return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
3634 }
3635 
3636 // Compare packed double-precision (64-bit) floating-point elements in a and b
3637 // for not-less-than, and store the results in dst.
3638 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd
3639 FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
3640 {
3641 #if defined(__aarch64__)
3642  return vreinterpretq_m128d_u64(veorq_u64(
3643  vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3644  vdupq_n_u64(UINT64_MAX)));
3645 #else
3646  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3647  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3648  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3649  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3650  uint64_t d[2];
3651  d[0] =
3652  !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3653  d[1] =
3654  !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3655 
3656  return vreinterpretq_m128d_u64(vld1q_u64(d));
3657 #endif
3658 }
3659 
3660 // Compare the lower double-precision (64-bit) floating-point elements in a and
3661 // b for not-less-than, store the result in the lower element of dst, and copy
3662 // the upper element from a to the upper element of dst.
3663 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd
3664 FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
3665 {
3666  return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
3667 }
3668 
3669 // Compare packed double-precision (64-bit) floating-point elements in a and b
3670 // to see if neither is NaN, and store the results in dst.
3671 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd
3672 FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
3673 {
3674 #if defined(__aarch64__)
3675  // Excluding NaNs, any two floating point numbers can be compared.
3676  uint64x2_t not_nan_a =
3677  vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3678  uint64x2_t not_nan_b =
3679  vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3680  return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
3681 #else
3682  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3683  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3684  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3685  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3686  uint64_t d[2];
3687  d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3688  (*(double *) &b0) == (*(double *) &b0))
3689  ? ~UINT64_C(0)
3690  : UINT64_C(0);
3691  d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3692  (*(double *) &b1) == (*(double *) &b1))
3693  ? ~UINT64_C(0)
3694  : UINT64_C(0);
3695 
3696  return vreinterpretq_m128d_u64(vld1q_u64(d));
3697 #endif
3698 }
3699 
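// Illustrative sketch (added for this documentation page, not part of the
// upstream sse2neon.h): the ordered/unordered predicates above rely on the
// IEEE-754 rule that a NaN never compares equal to anything, including
// itself, so "x == x" is false exactly when x is NaN. The helper name is
// hypothetical.
static inline int sse2neon_example_is_nan_f64(double x)
{
    return !(x == x); // true only for NaN inputs
}
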
3700 // Compare the lower double-precision (64-bit) floating-point elements in a and
3701 // b to see if neither is NaN, store the result in the lower element of dst, and
3702 // copy the upper element from a to the upper element of dst.
3703 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd
3704 FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
3705 {
3706 #if defined(__aarch64__)
3707  return _mm_move_sd(a, _mm_cmpord_pd(a, b));
3708 #else
3709  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3710  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3711  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3712  uint64_t d[2];
3713  d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3714  (*(double *) &b0) == (*(double *) &b0))
3715  ? ~UINT64_C(0)
3716  : UINT64_C(0);
3717  d[1] = a1;
3718 
3719  return vreinterpretq_m128d_u64(vld1q_u64(d));
3720 #endif
3721 }
3722 
3723 // Compare packed double-precision (64-bit) floating-point elements in a and b
3724 // to see if either is NaN, and store the results in dst.
3725 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd
3726 FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
3727 {
3728 #if defined(__aarch64__)
3729  // A NaN never compares equal to anything, including itself.
3730  uint64x2_t not_nan_a =
3731  vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3732  uint64x2_t not_nan_b =
3733  vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3734  return vreinterpretq_m128d_s32(
3735  vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
3736 #else
3737  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3738  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3739  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3740  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3741  uint64_t d[2];
3742  d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3743  (*(double *) &b0) == (*(double *) &b0))
3744  ? UINT64_C(0)
3745  : ~UINT64_C(0);
3746  d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3747  (*(double *) &b1) == (*(double *) &b1))
3748  ? UINT64_C(0)
3749  : ~UINT64_C(0);
3750 
3751  return vreinterpretq_m128d_u64(vld1q_u64(d));
3752 #endif
3753 }
3754 
3755 // Compare the lower double-precision (64-bit) floating-point elements in a and
3756 // b to see if either is NaN, store the result in the lower element of dst, and
3757 // copy the upper element from a to the upper element of dst.
3758 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd
3759 FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
3760 {
3761 #if defined(__aarch64__)
3762  return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
3763 #else
3764  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3765  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3766  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3767  uint64_t d[2];
3768  d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3769  (*(double *) &b0) == (*(double *) &b0))
3770  ? UINT64_C(0)
3771  : ~UINT64_C(0);
3772  d[1] = a1;
3773 
3774  return vreinterpretq_m128d_u64(vld1q_u64(d));
3775 #endif
3776 }
3777 
3778 // Compare the lower double-precision (64-bit) floating-point element in a and b
3779 // for greater-than-or-equal, and return the boolean result (0 or 1).
3780 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd
3781 FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
3782 {
3783 #if defined(__aarch64__)
3784  return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
3785 #else
3786  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3787  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3788 
3789  return (*(double *) &a0 >= *(double *) &b0);
3790 #endif
3791 }
3792 
3793 // Compare the lower double-precision (64-bit) floating-point element in a and b
3794 // for greater-than, and return the boolean result (0 or 1).
3795 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd
3796 FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
3797 {
3798 #if defined(__aarch64__)
3799  return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
3800 #else
3801  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3802  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3803 
3804  return (*(double *) &a0 > *(double *) &b0);
3805 #endif
3806 }
3807 
3808 // Compare the lower double-precision (64-bit) floating-point element in a and b
3809 // for less-than-or-equal, and return the boolean result (0 or 1).
3810 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd
3811 FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
3812 {
3813 #if defined(__aarch64__)
3814  return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
3815 #else
3816  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3817  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3818 
3819  return (*(double *) &a0 <= *(double *) &b0);
3820 #endif
3821 }
3822 
3823 // Compare the lower double-precision (64-bit) floating-point element in a and b
3824 // for less-than, and return the boolean result (0 or 1).
3825 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd
3826 FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
3827 {
3828 #if defined(__aarch64__)
3829  return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
3830 #else
3831  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3832  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3833 
3834  return (*(double *) &a0 < *(double *) &b0);
3835 #endif
3836 }
3837 
3838 // Compare the lower double-precision (64-bit) floating-point element in a and b
3839 // for equality, and return the boolean result (0 or 1).
3840 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd
3841 FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
3842 {
3843 #if defined(__aarch64__)
3844  return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
3845 #else
3846  uint32x4_t a_not_nan =
3847  vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
3848  uint32x4_t b_not_nan =
3849  vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
3850  uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3851  uint32x4_t a_eq_b =
3852  vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3853  uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
3854  vreinterpretq_u64_u32(a_eq_b));
3855  return vgetq_lane_u64(and_results, 0) & 0x1;
3856 #endif
3857 }
3858 
3859 // Compare the lower double-precision (64-bit) floating-point element in a and b
3860 // for not-equal, and return the boolean result (0 or 1).
3861 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd
3862 FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
3863 {
3864  return !_mm_comieq_sd(a, b);
3865 }
3866 
3867 // Convert packed signed 32-bit integers in a to packed double-precision
3868 // (64-bit) floating-point elements, and store the results in dst.
3869 //
3870 // FOR j := 0 to 1
3871 // i := j*32
3872 // m := j*64
3873 // dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
3874 // ENDFOR
3875 //
3876 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd
3877 FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
3878 {
3879 #if defined(__aarch64__)
3880  return vreinterpretq_m128d_f64(
3881  vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
3882 #else
3883  double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
3884  double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
3885  return _mm_set_pd(a1, a0);
3886 #endif
3887 }
3888 
3889 // Converts the four signed 32-bit integer values of a to single-precision,
3890 // floating-point values
3891 // https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
3892 FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
3893 {
3894  return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
3895 }
3896 
3897 // Convert packed double-precision (64-bit) floating-point elements in a to
3898 // packed 32-bit integers, and store the results in dst.
3899 //
3900 // FOR j := 0 to 1
3901 // i := 32*j
3902 // k := 64*j
3903 // dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
3904 // ENDFOR
3905 //
3906 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32
3907 FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
3908 {
3909  __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3910  double d0 = ((double *) &rnd)[0];
3911  double d1 = ((double *) &rnd)[1];
3912  return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
3913 }
3914 
3915 // Convert packed double-precision (64-bit) floating-point elements in a to
3916 // packed 32-bit integers, and store the results in dst.
3917 //
3918 // FOR j := 0 to 1
3919 // i := 32*j
3920 // k := 64*j
3921 // dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
3922 // ENDFOR
3923 //
3924 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_pi32
3925 FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
3926 {
3927  __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3928  double d0 = ((double *) &rnd)[0];
3929  double d1 = ((double *) &rnd)[1];
3930  int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
3931  return vreinterpret_m64_s32(vld1_s32(data));
3932 }
3933 
3934 // Convert packed double-precision (64-bit) floating-point elements in a to
3935 // packed single-precision (32-bit) floating-point elements, and store the
3936 // results in dst.
3937 //
3938 // FOR j := 0 to 1
3939 // i := 32*j
3940 // k := 64*j
3941 // dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
3942 // ENDFOR
3943 // dst[127:64] := 0
3944 //
3945 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
3946 FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
3947 {
3948 #if defined(__aarch64__)
3949  float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
3950  return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
3951 #else
3952  float a0 = (float) ((double *) &a)[0];
3953  float a1 = (float) ((double *) &a)[1];
3954  return _mm_set_ps(0, 0, a1, a0);
3955 #endif
3956 }
3957 
3958 // Convert packed signed 32-bit integers in a to packed double-precision
3959 // (64-bit) floating-point elements, and store the results in dst.
3960 //
3961 // FOR j := 0 to 1
3962 // i := j*32
3963 // m := j*64
3964 // dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
3965 // ENDFOR
3966 //
3967 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd
3968 FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
3969 {
3970 #if defined(__aarch64__)
3971  return vreinterpretq_m128d_f64(
3972  vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
3973 #else
3974  double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
3975  double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
3976  return _mm_set_pd(a1, a0);
3977 #endif
3978 }
3979 
3980 // Converts the four single-precision, floating-point values of a to signed
3981 // 32-bit integer values.
3982 //
3983 // r0 := (int) a0
3984 // r1 := (int) a1
3985 // r2 := (int) a2
3986 // r3 := (int) a3
3987 //
3988 // https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
3989 // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
3990 // does not support! It is supported on ARMv8-A however.
3991 FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
3992 {
3993 #if defined(__aarch64__)
3994  switch (_MM_GET_ROUNDING_MODE()) {
3995  case _MM_ROUND_NEAREST:
3996  return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
3997  case _MM_ROUND_DOWN:
3998  return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
3999  case _MM_ROUND_UP:
4000  return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
4001  default: // _MM_ROUND_TOWARD_ZERO
4002  return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
4003  }
4004 #else
4005  float *f = (float *) &a;
4006  switch (_MM_GET_ROUNDING_MODE()) {
4007  case _MM_ROUND_NEAREST: {
4008  uint32x4_t signmask = vdupq_n_u32(0x80000000);
4009  float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
4010  vdupq_n_f32(0.5f)); /* +/- 0.5 */
4011  int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
4012  vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
4013  int32x4_t r_trunc = vcvtq_s32_f32(
4014  vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
4015  int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
4016  vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
4017  int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
4018  vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
4019  float32x4_t delta = vsubq_f32(
4020  vreinterpretq_f32_m128(a),
4021  vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
4022  uint32x4_t is_delta_half =
4023  vceqq_f32(delta, half); /* delta == +/- 0.5 */
4024  return vreinterpretq_m128i_s32(
4025  vbslq_s32(is_delta_half, r_even, r_normal));
4026  }
4027  case _MM_ROUND_DOWN:
4028  return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
4029  floorf(f[0]));
4030  case _MM_ROUND_UP:
4031  return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
4032  ceilf(f[0]));
4033  default: // _MM_ROUND_TOWARD_ZERO
4034  return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
4035  (int32_t) f[0]);
4036  }
4037 #endif
4038 }
4039 
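// Illustrative sketch (added for this documentation page, not part of the
// upstream sse2neon.h): a scalar model of the round-half-to-even emulation
// used by the ARMv7-A path of _mm_cvtps_epi32 above, e.g. 0.5 -> 0,
// 1.5 -> 2, 2.5 -> 2. The helper name is hypothetical.
static inline int32_t sse2neon_example_round_half_even(float x)
{
    float half = (x < 0.0f) ? -0.5f : 0.5f;  // +/- 0.5 with the sign of x
    int32_t r_normal = (int32_t) (x + half); // rounds halves away from zero
    int32_t r_trunc = (int32_t) x;           // truncates toward zero
    int32_t r_even = (r_trunc + (r_trunc > 0 ? 1 : 0)) & ~1; // even candidate
    float delta = x - (float) r_trunc;
    return (delta == half) ? r_even : r_normal; // ties go to the even value
}
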
4040 // Convert packed single-precision (32-bit) floating-point elements in a to
4041 // packed double-precision (64-bit) floating-point elements, and store the
4042 // results in dst.
4043 //
4044 // FOR j := 0 to 1
4045 // i := 64*j
4046 // k := 32*j
4047 // dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
4048 // ENDFOR
4049 //
4050 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
4051 FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
4052 {
4053 #if defined(__aarch64__)
4054  return vreinterpretq_m128d_f64(
4055  vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
4056 #else
4057  double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
4058  double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
4059  return _mm_set_pd(a1, a0);
4060 #endif
4061 }
4062 
4063 // Copy the lower double-precision (64-bit) floating-point element of a to dst.
4064 //
4065 // dst[63:0] := a[63:0]
4066 //
4067 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
4068 FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
4069 {
4070 #if defined(__aarch64__)
4071  return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
4072 #else
4073  return ((double *) &a)[0];
4074 #endif
4075 }
4076 
4077 // Convert the lower double-precision (64-bit) floating-point element in a to a
4078 // 32-bit integer, and store the result in dst.
4079 //
4080 // dst[31:0] := Convert_FP64_To_Int32(a[63:0])
4081 //
4082 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32
4083 FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
4084 {
4085 #if defined(__aarch64__)
4086  return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4087 #else
4088  __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
4089  double ret = ((double *) &rnd)[0];
4090  return (int32_t) ret;
4091 #endif
4092 }
4093 
4094 // Convert the lower double-precision (64-bit) floating-point element in a to a
4095 // 64-bit integer, and store the result in dst.
4096 //
4097 // dst[63:0] := Convert_FP64_To_Int64(a[63:0])
4098 //
4099 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64
4100 FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
4101 {
4102 #if defined(__aarch64__)
4103  return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
4104 #else
4105  __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
4106  double ret = ((double *) &rnd)[0];
4107  return (int64_t) ret;
4108 #endif
4109 }
4110 
4111 // Convert the lower double-precision (64-bit) floating-point element in a to a
4112 // 64-bit integer, and store the result in dst.
4113 //
4114 // dst[63:0] := Convert_FP64_To_Int64(a[63:0])
4115 //
4116 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x
4117 #define _mm_cvtsd_si64x _mm_cvtsd_si64
4118 
4119 // Convert the lower double-precision (64-bit) floating-point element in b to a
4120 // single-precision (32-bit) floating-point element, store the result in the
4121 // lower element of dst, and copy the upper 3 packed elements from a to the
4122 // upper elements of dst.
4123 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss
4124 FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
4125 {
4126 #if defined(__aarch64__)
4127  return vreinterpretq_m128_f32(vsetq_lane_f32(
4128  vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
4129  vreinterpretq_f32_m128(a), 0));
4130 #else
4131  return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
4132  vreinterpretq_f32_m128(a), 0));
4133 #endif
4134 }
4135 
4136 // Copy the lower 32-bit integer in a to dst.
4137 //
4138 // dst[31:0] := a[31:0]
4139 //
4140 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
4141 FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4142 {
4143  return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4144 }
4145 
4146 // Copy the lower 64-bit integer in a to dst.
4147 //
4148 // dst[63:0] := a[63:0]
4149 //
4150 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
4151 FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4152 {
4153  return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4154 }
4155 
4156 // Copy the lower 64-bit integer in a to dst.
4157 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
4158 #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4159 
4160 // Convert the signed 32-bit integer b to a double-precision (64-bit)
4161 // floating-point element, store the result in the lower element of dst, and
4162 // copy the upper element from a to the upper element of dst.
4163 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd
4164 FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
4165 {
4166 #if defined(__aarch64__)
4167  return vreinterpretq_m128d_f64(
4168  vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4169 #else
4170  double bf = (double) b;
4171  return vreinterpretq_m128d_s64(
4172  vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4173 #endif
4174 }
4175 
4176 // Copy the lower 64-bit integer in a to dst.
4177 //
4178 // dst[63:0] := a[63:0]
4179 //
4180 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
4181 #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4182 
4183 // Moves 32-bit integer a to the least significant 32 bits of an __m128i object,
4184 // zero extending the upper bits.
4185 //
4186 // r0 := a
4187 // r1 := 0x0
4188 // r2 := 0x0
4189 // r3 := 0x0
4190 //
4191 // https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
4192 FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4193 {
4194  return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
4195 }
4196 
4197 // Convert the signed 64-bit integer b to a double-precision (64-bit)
4198 // floating-point element, store the result in the lower element of dst, and
4199 // copy the upper element from a to the upper element of dst.
4200 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd
4201 FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
4202 {
4203 #if defined(__aarch64__)
4204  return vreinterpretq_m128d_f64(
4205  vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4206 #else
4207  double bf = (double) b;
4208  return vreinterpretq_m128d_s64(
4209  vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4210 #endif
4211 }
4212 
4213 // Moves 64-bit integer a to the least significant 64 bits of an __m128i object,
4214 // zero extending the upper bits.
4215 //
4216 // r0 := a
4217 // r1 := 0x0
4218 FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4219 {
4220  return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4221 }
4222 
4223 // Copy 64-bit integer a to the lower element of dst, and zero the upper
4224 // element.
4225 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128
4226 #define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
4227 
4228 // Convert the signed 64-bit integer b to a double-precision (64-bit)
4229 // floating-point element, store the result in the lower element of dst, and
4230 // copy the upper element from a to the upper element of dst.
4231 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd
4232 #define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
4233 
4234 // Convert the lower single-precision (32-bit) floating-point element in b to a
4235 // double-precision (64-bit) floating-point element, store the result in the
4236 // lower element of dst, and copy the upper element from a to the upper element
4237 // of dst.
4238 //
4239 // dst[63:0] := Convert_FP32_To_FP64(b[31:0])
4240 // dst[127:64] := a[127:64]
4241 //
4242 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
4243 FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
4244 {
4245  double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
4246 #if defined(__aarch64__)
4247  return vreinterpretq_m128d_f64(
4248  vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
4249 #else
4250  return vreinterpretq_m128d_s64(
4251  vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
4252 #endif
4253 }
4254 
4255 // Convert packed double-precision (64-bit) floating-point elements in a to
4256 // packed 32-bit integers with truncation, and store the results in dst.
4257 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32
4258 FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
4259 {
4260  double a0 = ((double *) &a)[0];
4261  double a1 = ((double *) &a)[1];
4262  return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
4263 }
4264 
4265 // Convert packed double-precision (64-bit) floating-point elements in a to
4266 // packed 32-bit integers with truncation, and store the results in dst.
4267 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_pi32
4268 FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
4269 {
4270  double a0 = ((double *) &a)[0];
4271  double a1 = ((double *) &a)[1];
4272  int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
4273  return vreinterpret_m64_s32(vld1_s32(data));
4274 }
4275 
4276 // Converts the four single-precision, floating-point values of a to signed
4277 // 32-bit integer values using truncate.
4278 // https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
4279 FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4280 {
4281  return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4282 }
4283 
4284 // Convert the lower double-precision (64-bit) floating-point element in a to a
4285 // 32-bit integer with truncation, and store the result in dst.
4286 //
4287 // dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
4288 //
4289 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32
4290 FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
4291 {
4292  double ret = *((double *) &a);
4293  return (int32_t) ret;
4294 }
4295 
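// Illustrative sketch (added for this documentation page, not part of the
// upstream sse2neon.h): the _mm_cvtsd_* conversions above honour the current
// rounding mode, while the _mm_cvttsd_* forms always truncate toward zero;
// for a lower lane of 1.7 that gives 2 versus 1 under the default
// round-to-nearest mode. _mm_set_pd is provided later in this header; the
// helper name is hypothetical.
static inline void sse2neon_example_cvt_vs_cvtt(void)
{
    __m128d v = _mm_set_pd(0.0, 1.7);   // lower lane = 1.7
    int rounded = _mm_cvtsd_si32(v);    // 2 under round-to-nearest
    int truncated = _mm_cvttsd_si32(v); // always 1
    (void) rounded;
    (void) truncated;
}
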
4296 // Convert the lower double-precision (64-bit) floating-point element in a to a
4297 // 64-bit integer with truncation, and store the result in dst.
4298 //
4299 // dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
4300 //
4301 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
4302 FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
4303 {
4304 #if defined(__aarch64__)
4305  return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
4306 #else
4307  double ret = *((double *) &a);
4308  return (int64_t) ret;
4309 #endif
4310 }
4311 
4312 // Convert the lower double-precision (64-bit) floating-point element in a to a
4313 // 64-bit integer with truncation, and store the result in dst.
4314 //
4315 // dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
4316 //
4317 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
4318 #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4319 
4320 // Divide packed double-precision (64-bit) floating-point elements in a by
4321 // packed elements in b, and store the results in dst.
4322 //
4323 // FOR j := 0 to 1
4324 // i := 64*j
4325 // dst[i+63:i] := a[i+63:i] / b[i+63:i]
4326 // ENDFOR
4327 //
4328 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd
4329 FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
4330 {
4331 #if defined(__aarch64__)
4332  return vreinterpretq_m128d_f64(
4333  vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4334 #else
4335  double *da = (double *) &a;
4336  double *db = (double *) &b;
4337  double c[2];
4338  c[0] = da[0] / db[0];
4339  c[1] = da[1] / db[1];
4340  return vld1q_f32((float32_t *) c);
4341 #endif
4342 }
4343 
4344 // Divide the lower double-precision (64-bit) floating-point element in a by the
4345 // lower double-precision (64-bit) floating-point element in b, store the result
4346 // in the lower element of dst, and copy the upper element from a to the upper
4347 // element of dst.
4348 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd
4349 FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
4350 {
4351 #if defined(__aarch64__)
4352  float64x2_t tmp =
4353  vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
4354  return vreinterpretq_m128d_f64(
4355  vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
4356 #else
4357  return _mm_move_sd(a, _mm_div_pd(a, b));
4358 #endif
4359 }
4360 
4361 // Extracts the selected signed or unsigned 16-bit integer from a and zero
4362 // extends.
4363 // https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
4364 // FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
4365 #define _mm_extract_epi16(a, imm) \
4366  vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
4367 
4368 // Inserts the least significant 16 bits of b into the selected 16-bit integer
4369 // of a.
4370 // https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
4371 // FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
4372 // __constrange(0,8) int imm)
4373 #define _mm_insert_epi16(a, b, imm) \
4374  __extension__({ \
4375  vreinterpretq_m128i_s16( \
4376  vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
4377  })
4378 
4379 // Loads two double-precision (64-bit) floating-point values from 16-byte
4380 // aligned memory.
4381 //
4382 // dst[127:0] := MEM[mem_addr+127:mem_addr]
4383 //
4384 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
4385 FORCE_INLINE __m128d _mm_load_pd(const double *p)
4386 {
4387 #if defined(__aarch64__)
4388  return vreinterpretq_m128d_f64(vld1q_f64(p));
4389 #else
4390  const float *fp = (const float *) p;
4391  float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
4392  return vreinterpretq_m128d_f32(vld1q_f32(data));
4393 #endif
4394 }
4395 
4396 // Load a double-precision (64-bit) floating-point element from memory into both
4397 // elements of dst.
4398 //
4399 // dst[63:0] := MEM[mem_addr+63:mem_addr]
4400 // dst[127:64] := MEM[mem_addr+63:mem_addr]
4401 //
4402 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
4403 #define _mm_load_pd1 _mm_load1_pd
4404 
4405 // Load a double-precision (64-bit) floating-point element from memory into the
4406 // lower of dst, and zero the upper element. mem_addr does not need to be
4407 // aligned on any particular boundary.
4408 //
4409 // dst[63:0] := MEM[mem_addr+63:mem_addr]
4410 // dst[127:64] := 0
4411 //
4412 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
4413 FORCE_INLINE __m128d _mm_load_sd(const double *p)
4414 {
4415 #if defined(__aarch64__)
4416  return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
4417 #else
4418  const float *fp = (const float *) p;
4419  float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
4420  return vreinterpretq_m128d_f32(vld1q_f32(data));
4421 #endif
4422 }
4423 
4424 // Loads 128-bit value. :
4425 // https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
4426 FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4427 {
4428  return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4429 }
4430 
4431 // Load a double-precision (64-bit) floating-point element from memory into both
4432 // elements of dst.
4433 //
4434 // dst[63:0] := MEM[mem_addr+63:mem_addr]
4435 // dst[127:64] := MEM[mem_addr+63:mem_addr]
4436 //
4437 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
4438 FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4439 {
4440 #if defined(__aarch64__)
4441  return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4442 #else
4443  return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4444 #endif
4445 }
4446 
4447 // Load a double-precision (64-bit) floating-point element from memory into the
4448 // upper element of dst, and copy the lower element from a to dst. mem_addr does
4449 // not need to be aligned on any particular boundary.
4450 //
4451 // dst[63:0] := a[63:0]
4452 // dst[127:64] := MEM[mem_addr+63:mem_addr]
4453 //
4454 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
4455 FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4456 {
4457 #if defined(__aarch64__)
4458  return vreinterpretq_m128d_f64(
4459  vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4460 #else
4461  return vreinterpretq_m128d_f32(vcombine_f32(
4462  vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4463 #endif
4464 }
4465 
4466 // Load 64-bit integer from memory into the first element of dst.
4467 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
4468 FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
4469 {
4470  /* Load the lower 64 bits of the value pointed to by p into the
4471  * lower 64 bits of the result, zeroing the upper 64 bits of the result.
4472  */
4473  return vreinterpretq_m128i_s32(
4474  vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
4475 }
4476 
4477 // Load a double-precision (64-bit) floating-point element from memory into the
4478 // lower element of dst, and copy the upper element from a to dst. mem_addr does
4479 // not need to be aligned on any particular boundary.
4480 //
4481 // dst[63:0] := MEM[mem_addr+63:mem_addr]
4482 // dst[127:64] := a[127:64]
4483 //
4484 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
4485 FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
4486 {
4487 #if defined(__aarch64__)
4488  return vreinterpretq_m128d_f64(
4489  vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
4490 #else
4491  return vreinterpretq_m128d_f32(
4492  vcombine_f32(vld1_f32((const float *) p),
4493  vget_high_f32(vreinterpretq_f32_m128d(a))));
4494 #endif
4495 }
4496 
4497 // Load 2 double-precision (64-bit) floating-point elements from memory into dst
4498 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
4499 // general-protection exception may be generated.
4500 //
4501 // dst[63:0] := MEM[mem_addr+127:mem_addr+64]
4502 // dst[127:64] := MEM[mem_addr+63:mem_addr]
4503 //
4504 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
4505 FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
4506 {
4507 #if defined(__aarch64__)
4508  float64x2_t v = vld1q_f64(p);
4509  return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
4510 #else
4511  int64x2_t v = vld1q_s64((const int64_t *) p);
4512  return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
4513 #endif
4514 }
4515 
4516 // Loads two double-precision (64-bit) floating-point values from unaligned memory.
4517 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
4518 FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
4519 {
4520  return _mm_load_pd(p);
4521 }
4522 
4523 // Loads 128-bit value. :
4524 // https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
4525 FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4526 {
4527  return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4528 }
4529 
4530 // Load unaligned 32-bit integer from memory into the first element of dst.
4531 //
4532 // dst[31:0] := MEM[mem_addr+31:mem_addr]
4533 // dst[MAX:32] := 0
4534 //
4535 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
4536 FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4537 {
4538  return vreinterpretq_m128i_s32(
4539  vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4540 }
4541 
4542 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
4543 // integers from b.
4544 //
4545 // r0 := (a0 * b0) + (a1 * b1)
4546 // r1 := (a2 * b2) + (a3 * b3)
4547 // r2 := (a4 * b4) + (a5 * b5)
4548 // r3 := (a6 * b6) + (a7 * b7)
4549 // https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
4550 FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
4551 {
4552  int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
4553  vget_low_s16(vreinterpretq_s16_m128i(b)));
4554  int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
4555  vget_high_s16(vreinterpretq_s16_m128i(b)));
4556 
4557  int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
4558  int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
4559 
4560  return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
4561 }
4562 
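// Illustrative sketch (added for this documentation page, not part of the
// upstream sse2neon.h): a scalar reference for _mm_madd_epi16, useful when
// checking the vmull/vpadd mapping above. The helper name is hypothetical.
static inline void sse2neon_example_madd_epi16(const int16_t a[8],
                                               const int16_t b[8],
                                               int32_t r[4])
{
    // Each 32-bit result is the sum of two adjacent 16-bit products.
    for (int j = 0; j < 4; j++)
        r[j] = (int32_t) a[2 * j] * b[2 * j] +
               (int32_t) a[2 * j + 1] * b[2 * j + 1];
}
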
4563 // Conditionally store 8-bit integer elements from a into memory using mask
4564 // (elements are not stored when the highest bit is not set in the corresponding
4565 // element) and a non-temporal memory hint. mem_addr does not need to be aligned
4566 // on any particular boundary.
4567 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128
4568 FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
4569 {
4570  int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
4571  __m128 b = _mm_load_ps((const float *) mem_addr);
4572  int8x16_t masked =
4573  vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
4574  vreinterpretq_s8_m128(b));
4575  vst1q_s8((int8_t *) mem_addr, masked);
4576 }
4577 
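// Illustrative sketch (added for this documentation page, not part of the
// upstream sse2neon.h): a scalar model of _mm_maskmoveu_si128, where each
// byte of a is stored only when the most significant bit of the
// corresponding mask byte is set. The helper name is hypothetical.
static inline void sse2neon_example_maskmove(const uint8_t a[16],
                                             const uint8_t mask[16],
                                             uint8_t *mem_addr)
{
    for (int i = 0; i < 16; i++)
        if (mask[i] & 0x80)
            mem_addr[i] = a[i]; // unselected bytes are left untouched
}
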
4578 // Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
4579 // signed 16-bit integers from b.
4580 // https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
4581 FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
4582 {
4583  return vreinterpretq_m128i_s16(
4584  vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4585 }
4586 
4587 // Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
4588 // 16 unsigned 8-bit integers from b.
4589 // https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
4590 FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
4591 {
4592  return vreinterpretq_m128i_u8(
4593  vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4594 }
4595 
4596 // Compare packed double-precision (64-bit) floating-point elements in a and b,
4597 // and store packed maximum values in dst.
4598 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd
4599 FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
4600 {
4601 #if defined(__aarch64__)
4602 #if SSE2NEON_PRECISE_MINMAX
4603  float64x2_t _a = vreinterpretq_f64_m128d(a);
4604  float64x2_t _b = vreinterpretq_f64_m128d(b);
4605  return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
4606 #else
4607  return vreinterpretq_m128d_f64(
4608  vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4609 #endif
4610 #else
4611  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4612  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4613  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4614  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4615  uint64_t d[2];
4616  d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
4617  d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
4618 
4619  return vreinterpretq_m128d_u64(vld1q_u64(d));
4620 #endif
4621 }
4622 
4623 // Compare the lower double-precision (64-bit) floating-point elements in a and
4624 // b, store the maximum value in the lower element of dst, and copy the upper
4625 // element from a to the upper element of dst.
4626 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd
4627 FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
4628 {
4629 #if defined(__aarch64__)
4630  return _mm_move_sd(a, _mm_max_pd(a, b));
4631 #else
4632  double *da = (double *) &a;
4633  double *db = (double *) &b;
4634  double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
4635  return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4636 #endif
4637 }
4638 
4639 // Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
4640 // signed 16-bit integers from b.
4641 // https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
4642 FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
4643 {
4644  return vreinterpretq_m128i_s16(
4645  vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4646 }
4647 
4648 // Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
4649 // 16 unsigned 8-bit integers from b.
4650 // https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
4651 FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
4652 {
4653  return vreinterpretq_m128i_u8(
4654  vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4655 }
4656 
4657 // Compare packed double-precision (64-bit) floating-point elements in a and b,
4658 // and store packed minimum values in dst.
4659 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd
4660 FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
4661 {
4662 #if defined(__aarch64__)
4663 #if SSE2NEON_PRECISE_MINMAX
4664  float64x2_t _a = vreinterpretq_f64_m128d(a);
4665  float64x2_t _b = vreinterpretq_f64_m128d(b);
4666  return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
4667 #else
4668  return vreinterpretq_m128d_f64(
4669  vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4670 #endif
4671 #else
4672  uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4673  uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4674  uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4675  uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4676  uint64_t d[2];
4677  d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
4678  d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
4679  return vreinterpretq_m128d_u64(vld1q_u64(d));
4680 #endif
4681 }
4682 
4683 // Compare the lower double-precision (64-bit) floating-point elements in a and
4684 // b, store the minimum value in the lower element of dst, and copy the upper
4685 // element from a to the upper element of dst.
4686 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd
4687 FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
4688 {
4689 #if defined(__aarch64__)
4690  return _mm_move_sd(a, _mm_min_pd(a, b));
4691 #else
4692  double *da = (double *) &a;
4693  double *db = (double *) &b;
4694  double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
4695  return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4696 #endif
4697 }
4698 
4699 // Copy the lower 64-bit integer in a to the lower element of dst, and zero the
4700 // upper element.
4701 //
4702 // dst[63:0] := a[63:0]
4703 // dst[127:64] := 0
4704 //
4705 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
4706 FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
4707 {
4708  return vreinterpretq_m128i_s64(
4709  vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
4710 }
4711 
4712 // Move the lower double-precision (64-bit) floating-point element from b to the
4713 // lower element of dst, and copy the upper element from a to the upper element
4714 // of dst.
4715 //
4716 // dst[63:0] := b[63:0]
4717 // dst[127:64] := a[127:64]
4718 //
4719 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
4720 FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
4721 {
4722  return vreinterpretq_m128d_f32(
4723  vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
4724  vget_high_f32(vreinterpretq_f32_m128d(a))));
4725 }
4726 
4727 // NEON does not provide a version of this function.
4728 // Creates a 16-bit mask from the most significant bits of the 16 signed or
4729 // unsigned 8-bit integers in a and zero extends the upper bits.
4730 // https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
4731 FORCE_INLINE int _mm_movemask_epi8(__m128i a)
4732 {
4733  // Use increasingly wide shifts+adds to collect the sign bits
4734  // together.
4735  // Since the widening shifts would be rather confusing to follow in little
4736  // endian, everything will be illustrated in big endian order instead. This
4737  // has a different result - the bits would actually be reversed on a big
4738  // endian machine.
4739 
4740  // Starting input (only half the elements are shown):
4741  // 89 ff 1d c0 00 10 99 33
4742  uint8x16_t input = vreinterpretq_u8_m128i(a);
4743 
4744  // Shift out everything but the sign bits with an unsigned shift right.
4745  //
4746  // Bytes of the vector:
4747  // 89 ff 1d c0 00 10 99 33
4748  // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
4749  // | | | | | | | |
4750  // 01 01 00 01 00 00 01 00
4751  //
4752  // Bits of first important lane(s):
4753  // 10001001 (89)
4754  // \______
4755  // |
4756  // 00000001 (01)
4757  uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
4758 
4759  // Merge the even lanes together with a 16-bit unsigned shift right + add.
4760  // 'xx' represents garbage data which will be ignored in the final result.
4761  // In the important bytes, the add functions like a binary OR.
4762  //
4763  // 01 01 00 01 00 00 01 00
4764  // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
4765  // \| \| \| \|
4766  // xx 03 xx 01 xx 00 xx 02
4767  //
4768  // 00000001 00000001 (01 01)
4769  // \_______ |
4770  // \|
4771  // xxxxxxxx xxxxxx11 (xx 03)
4772  uint32x4_t paired16 =
4773  vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
4774 
4775  // Repeat with a wider 32-bit shift + add.
4776  // xx 03 xx 01 xx 00 xx 02
4777  // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
4778  // 14))
4779  // \| \|
4780  // xx xx xx 0d xx xx xx 02
4781  //
4782  // 00000011 00000001 (03 01)
4783  // \\_____ ||
4784  // '----.\||
4785  // xxxxxxxx xxxx1101 (xx 0d)
4786  uint64x2_t paired32 =
4787  vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
4788 
4789  // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
4790  // lanes. xx xx xx 0d xx xx xx 02
4791  // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>
4792  // 28))
4793  // \|
4794  // xx xx xx xx xx xx xx d2
4795  //
4796  // 00001101 00000010 (0d 02)
4797  // \ \___ | |
4798  // '---. \| |
4799  // xxxxxxxx 11010010 (xx d2)
4800  uint8x16_t paired64 =
4801  vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
4802 
4803  // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
4804  // xx xx xx xx xx xx xx d2
4805  // || return paired64[0]
4806  // d2
4807  // Note: Little endian would return the correct value 4b (01001011) instead.
4808  return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
4809 }
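
/* Illustrative usage sketch (not part of the upstream header): a minimal
 * self-check of _mm_movemask_epi8, assuming _mm_set_epi8 is provided
 * elsewhere in this file; the helper name check_movemask_epi8 is
 * hypothetical. */
static inline int check_movemask_epi8(void)
{
    /* _mm_set_epi8 takes arguments from lane 15 down to lane 0; set the
     * sign bit only in lanes 15 and 0. */
    __m128i v = _mm_set_epi8((char) 0x80, 0, 0, 0, 0, 0, 0, 0,
                             0, 0, 0, 0, 0, 0, 0, (char) 0x80);
    return _mm_movemask_epi8(v) == 0x8001;  /* mask bits 15 and 0 are set */
}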
4810 
4811 // Set each bit of mask dst based on the most significant bit of the
4812 // corresponding packed double-precision (64-bit) floating-point element in a.
4813 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd
4814 FORCE_INLINE int _mm_movemask_pd(__m128d a)
4815 {
4816  uint64x2_t input = vreinterpretq_u64_m128d(a);
4817  uint64x2_t high_bits = vshrq_n_u64(input, 63);
4818  return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
4819 }
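
/* Illustrative usage sketch (not part of the upstream header): the two-bit
 * mask from _mm_movemask_pd reflects the sign bits of the low and high
 * double, assuming _mm_set_pd from this file; the helper name is
 * hypothetical. */
static inline int check_movemask_pd(void)
{
    __m128d v = _mm_set_pd(1.0, -2.0);  /* high = 1.0 (positive), low = -2.0 (negative) */
    return _mm_movemask_pd(v) == 0x1;   /* only bit 0 (low element's sign) is set */
}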
4820 
4821 // Copy the lower 64-bit integer in a to dst.
4822 //
4823 // dst[63:0] := a[63:0]
4824 //
4825 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
4826 FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
4827 {
4828  return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
4829 }
4830 
4831 // Copy the 64-bit integer a to the lower element of dst, and zero the upper
4832 // element.
4833 //
4834 // dst[63:0] := a[63:0]
4835 // dst[127:64] := 0
4836 //
4837 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
4838 FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
4839 {
4840  return vreinterpretq_m128i_s64(
4841  vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
4842 }
4843 
4844 // Multiply the low unsigned 32-bit integers from each packed 64-bit element in
4845 // a and b, and store the unsigned 64-bit results in dst.
4846 //
4847 // r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
4848 // r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
4849 FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
4850 {
4851  // vmull_u32 upcasts instead of masking, so we downcast.
4852  uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
4853  uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
4854  return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
4855 }
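
/* Illustrative usage sketch (not part of the upstream header): only the low
 * 32-bit lane of each 64-bit element feeds the widening multiply in
 * _mm_mul_epu32; _mm_set_epi32 and _mm_storeu_si128 are assumed from this
 * file, and the helper name is hypothetical. */
static inline int check_mul_epu32(void)
{
    unsigned long long out[2];
    /* Lanes (from low to high): a = {0xFFFFFFFF, 9, 0xFFFFFFFF, 7},
     * b = {3, 10, 2, 8}; only lanes 0 and 2 are used. */
    __m128i a = _mm_set_epi32(7, (int) 0xFFFFFFFF, 9, (int) 0xFFFFFFFF);
    __m128i b = _mm_set_epi32(8, 2, 10, 3);
    _mm_storeu_si128((__m128i *) out, _mm_mul_epu32(a, b));
    return out[0] == 0xFFFFFFFFULL * 3 && out[1] == 0xFFFFFFFFULL * 2;
}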
4856 
4857 // Multiply packed double-precision (64-bit) floating-point elements in a and b,
4858 // and store the results in dst.
4859 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
4860 FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
4861 {
4862 #if defined(__aarch64__)
4863  return vreinterpretq_m128d_f64(
4864  vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4865 #else
4866  double *da = (double *) &a;
4867  double *db = (double *) &b;
4868  double c[2];
4869  c[0] = da[0] * db[0];
4870  c[1] = da[1] * db[1];
4871  return vld1q_f32((float32_t *) c);
4872 #endif
4873 }
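
/* Illustrative usage sketch (not part of the upstream header): element-wise
 * double multiplication with _mm_mul_pd, assuming _mm_set_pd and
 * _mm_storeu_pd from this file; the helper name is hypothetical. */
static inline int check_mul_pd(void)
{
    double out[2];
    __m128d a = _mm_set_pd(3.0, 2.0);  /* high = 3.0, low = 2.0 */
    __m128d b = _mm_set_pd(4.0, 5.0);  /* high = 4.0, low = 5.0 */
    _mm_storeu_pd(out, _mm_mul_pd(a, b));
    return out[0] == 10.0 && out[1] == 12.0;  /* element-wise products */
}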
4874 
4875 // Multiply the lower double-precision (64-bit) floating-point element in a and
4876 // b, store the result in the lower element of dst, and copy the upper element
4877 // from a to the upper element of dst.
4878 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
4879 FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
4880 {
4881  return _mm_move_sd(a, _mm_mul_pd(a, b));
4882 }
4883 
4884 // Multiply the low unsigned 32-bit integers from a and b, and store the
4885 // unsigned 64-bit result in dst.
4886 //
4887 // dst[63:0] := a[31:0] * b[31:0]
4888 //
4889 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
4890 FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
4891 {
4892  return vreinterpret_m64_u64(vget_low_u64(
4893  vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
4894 }
4895 
4896 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
4897 // integers from b.
4898 //
4899 // r0 := (a0 * b0)[31:16]
4900 // r1 := (a1 * b1)[31:16]
4901 // ...
4902 // r7 := (a7 * b7)[31:16]
4903 //
4904 // https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
4905 FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
4906 {
4907  /* FIXME: issue with large values because of result saturation */
4908  // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
4909  // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
4910  // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
4911  int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
4912  int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
4913  int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
4914  int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
4915  int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
4916  int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
4917  uint16x8x2_t r =
4918  vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4919  return vreinterpretq_m128i_u16(r.val[1]);
4920 }
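
/* Illustrative usage sketch (not part of the upstream header): each lane of
 * the _mm_mulhi_epi16 result is the high half of the 32-bit product;
 * _mm_set1_epi16 and _mm_storeu_si128 are assumed from this file, and the
 * helper name is hypothetical. */
static inline int check_mulhi_epi16(void)
{
    short out[8];
    __m128i a = _mm_set1_epi16(0x4000);  /* 16384 in every lane */
    __m128i b = _mm_set1_epi16(0x0200);  /*   512 in every lane */
    _mm_storeu_si128((__m128i *) out, _mm_mulhi_epi16(a, b));
    /* 16384 * 512 = 0x00800000, whose high 16 bits are 0x0080 */
    return out[0] == 0x0080;
}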
4921 
4922 // Multiply the packed unsigned 16-bit integers in a and b, producing
4923 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
4924 // integers in dst.
4925 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
4926 FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
4927 {