Skip to content
Snippets Groups Projects
Commit 7eac805b authored by Chris Lattner
Browse files

fix PR6658: inline isn't a keyword in C89 mode, use __inline__ instead.

llvm-svn: 99190
parent c53a1125
No related branches found
No related tags found
No related merge requests found
...@@ -36,417 +36,417 @@ typedef long long __m128i __attribute__((__vector_size__(16))); ...@@ -36,417 +36,417 @@ typedef long long __m128i __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16))); typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16))); typedef char __v16qi __attribute__((__vector_size__(16)));
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_sd(__m128d a, __m128d b) _mm_add_sd(__m128d a, __m128d b)
{ {
a[0] += b[0]; a[0] += b[0];
return a; return a;
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_pd(__m128d a, __m128d b) _mm_add_pd(__m128d a, __m128d b)
{ {
return a + b; return a + b;
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_sd(__m128d a, __m128d b) _mm_sub_sd(__m128d a, __m128d b)
{ {
a[0] -= b[0]; a[0] -= b[0];
return a; return a;
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_pd(__m128d a, __m128d b) _mm_sub_pd(__m128d a, __m128d b)
{ {
return a - b; return a - b;
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_sd(__m128d a, __m128d b) _mm_mul_sd(__m128d a, __m128d b)
{ {
a[0] *= b[0]; a[0] *= b[0];
return a; return a;
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_pd(__m128d a, __m128d b) _mm_mul_pd(__m128d a, __m128d b)
{ {
return a * b; return a * b;
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_sd(__m128d a, __m128d b) _mm_div_sd(__m128d a, __m128d b)
{ {
a[0] /= b[0]; a[0] /= b[0];
return a; return a;
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_pd(__m128d a, __m128d b) _mm_div_pd(__m128d a, __m128d b)
{ {
return a / b; return a / b;
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_sd(__m128d a, __m128d b) _mm_sqrt_sd(__m128d a, __m128d b)
{ {
__m128d c = __builtin_ia32_sqrtsd(b); __m128d c = __builtin_ia32_sqrtsd(b);
return (__m128d) { c[0], a[1] }; return (__m128d) { c[0], a[1] };
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_pd(__m128d a) _mm_sqrt_pd(__m128d a)
{ {
return __builtin_ia32_sqrtpd(a); return __builtin_ia32_sqrtpd(a);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_sd(__m128d a, __m128d b) _mm_min_sd(__m128d a, __m128d b)
{ {
return __builtin_ia32_minsd(a, b); return __builtin_ia32_minsd(a, b);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_pd(__m128d a, __m128d b) _mm_min_pd(__m128d a, __m128d b)
{ {
return __builtin_ia32_minpd(a, b); return __builtin_ia32_minpd(a, b);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_sd(__m128d a, __m128d b) _mm_max_sd(__m128d a, __m128d b)
{ {
return __builtin_ia32_maxsd(a, b); return __builtin_ia32_maxsd(a, b);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_pd(__m128d a, __m128d b) _mm_max_pd(__m128d a, __m128d b)
{ {
return __builtin_ia32_maxpd(a, b); return __builtin_ia32_maxpd(a, b);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_and_pd(__m128d a, __m128d b) _mm_and_pd(__m128d a, __m128d b)
{ {
return (__m128d)((__v4si)a & (__v4si)b); return (__m128d)((__v4si)a & (__v4si)b);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_andnot_pd(__m128d a, __m128d b) _mm_andnot_pd(__m128d a, __m128d b)
{ {
return (__m128d)(~(__v4si)a & (__v4si)b); return (__m128d)(~(__v4si)a & (__v4si)b);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_or_pd(__m128d a, __m128d b) _mm_or_pd(__m128d a, __m128d b)
{ {
return (__m128d)((__v4si)a | (__v4si)b); return (__m128d)((__v4si)a | (__v4si)b);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_xor_pd(__m128d a, __m128d b) _mm_xor_pd(__m128d a, __m128d b)
{ {
return (__m128d)((__v4si)a ^ (__v4si)b); return (__m128d)((__v4si)a ^ (__v4si)b);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_pd(__m128d a, __m128d b) _mm_cmpeq_pd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmppd(a, b, 0); return (__m128d)__builtin_ia32_cmppd(a, b, 0);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_pd(__m128d a, __m128d b) _mm_cmplt_pd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmppd(a, b, 1); return (__m128d)__builtin_ia32_cmppd(a, b, 1);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_pd(__m128d a, __m128d b) _mm_cmple_pd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmppd(a, b, 2); return (__m128d)__builtin_ia32_cmppd(a, b, 2);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_pd(__m128d a, __m128d b) _mm_cmpgt_pd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmppd(b, a, 1); return (__m128d)__builtin_ia32_cmppd(b, a, 1);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_pd(__m128d a, __m128d b) _mm_cmpge_pd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmppd(b, a, 2); return (__m128d)__builtin_ia32_cmppd(b, a, 2);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_pd(__m128d a, __m128d b) _mm_cmpord_pd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmppd(a, b, 7); return (__m128d)__builtin_ia32_cmppd(a, b, 7);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_pd(__m128d a, __m128d b) _mm_cmpunord_pd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmppd(a, b, 3); return (__m128d)__builtin_ia32_cmppd(a, b, 3);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_pd(__m128d a, __m128d b) _mm_cmpneq_pd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmppd(a, b, 4); return (__m128d)__builtin_ia32_cmppd(a, b, 4);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_pd(__m128d a, __m128d b) _mm_cmpnlt_pd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmppd(a, b, 5); return (__m128d)__builtin_ia32_cmppd(a, b, 5);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_pd(__m128d a, __m128d b) _mm_cmpnle_pd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmppd(a, b, 6); return (__m128d)__builtin_ia32_cmppd(a, b, 6);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_pd(__m128d a, __m128d b) _mm_cmpngt_pd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmppd(b, a, 5); return (__m128d)__builtin_ia32_cmppd(b, a, 5);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_pd(__m128d a, __m128d b) _mm_cmpnge_pd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmppd(b, a, 6); return (__m128d)__builtin_ia32_cmppd(b, a, 6);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_sd(__m128d a, __m128d b) _mm_cmpeq_sd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmpsd(a, b, 0); return (__m128d)__builtin_ia32_cmpsd(a, b, 0);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_sd(__m128d a, __m128d b) _mm_cmplt_sd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmpsd(a, b, 1); return (__m128d)__builtin_ia32_cmpsd(a, b, 1);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_sd(__m128d a, __m128d b) _mm_cmple_sd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmpsd(a, b, 2); return (__m128d)__builtin_ia32_cmpsd(a, b, 2);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_sd(__m128d a, __m128d b) _mm_cmpgt_sd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmpsd(b, a, 1); return (__m128d)__builtin_ia32_cmpsd(b, a, 1);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_sd(__m128d a, __m128d b) _mm_cmpge_sd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmpsd(b, a, 2); return (__m128d)__builtin_ia32_cmpsd(b, a, 2);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_sd(__m128d a, __m128d b) _mm_cmpord_sd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmpsd(a, b, 7); return (__m128d)__builtin_ia32_cmpsd(a, b, 7);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_sd(__m128d a, __m128d b) _mm_cmpunord_sd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmpsd(a, b, 3); return (__m128d)__builtin_ia32_cmpsd(a, b, 3);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_sd(__m128d a, __m128d b) _mm_cmpneq_sd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmpsd(a, b, 4); return (__m128d)__builtin_ia32_cmpsd(a, b, 4);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_sd(__m128d a, __m128d b) _mm_cmpnlt_sd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmpsd(a, b, 5); return (__m128d)__builtin_ia32_cmpsd(a, b, 5);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_sd(__m128d a, __m128d b) _mm_cmpnle_sd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmpsd(a, b, 6); return (__m128d)__builtin_ia32_cmpsd(a, b, 6);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_sd(__m128d a, __m128d b) _mm_cmpngt_sd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmpsd(b, a, 5); return (__m128d)__builtin_ia32_cmpsd(b, a, 5);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_sd(__m128d a, __m128d b) _mm_cmpnge_sd(__m128d a, __m128d b)
{ {
return (__m128d)__builtin_ia32_cmpsd(b, a, 6); return (__m128d)__builtin_ia32_cmpsd(b, a, 6);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_sd(__m128d a, __m128d b) _mm_comieq_sd(__m128d a, __m128d b)
{ {
return __builtin_ia32_comisdeq(a, b); return __builtin_ia32_comisdeq(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_sd(__m128d a, __m128d b) _mm_comilt_sd(__m128d a, __m128d b)
{ {
return __builtin_ia32_comisdlt(a, b); return __builtin_ia32_comisdlt(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_sd(__m128d a, __m128d b) _mm_comile_sd(__m128d a, __m128d b)
{ {
return __builtin_ia32_comisdle(a, b); return __builtin_ia32_comisdle(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_sd(__m128d a, __m128d b) _mm_comigt_sd(__m128d a, __m128d b)
{ {
return __builtin_ia32_comisdgt(a, b); return __builtin_ia32_comisdgt(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_sd(__m128d a, __m128d b) _mm_comineq_sd(__m128d a, __m128d b)
{ {
return __builtin_ia32_comisdneq(a, b); return __builtin_ia32_comisdneq(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_sd(__m128d a, __m128d b) _mm_ucomieq_sd(__m128d a, __m128d b)
{ {
return __builtin_ia32_ucomisdeq(a, b); return __builtin_ia32_ucomisdeq(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_sd(__m128d a, __m128d b) _mm_ucomilt_sd(__m128d a, __m128d b)
{ {
return __builtin_ia32_ucomisdlt(a, b); return __builtin_ia32_ucomisdlt(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_sd(__m128d a, __m128d b) _mm_ucomile_sd(__m128d a, __m128d b)
{ {
return __builtin_ia32_ucomisdle(a, b); return __builtin_ia32_ucomisdle(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_sd(__m128d a, __m128d b) _mm_ucomigt_sd(__m128d a, __m128d b)
{ {
return __builtin_ia32_ucomisdgt(a, b); return __builtin_ia32_ucomisdgt(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_sd(__m128d a, __m128d b) _mm_ucomineq_sd(__m128d a, __m128d b)
{ {
return __builtin_ia32_ucomisdneq(a, b); return __builtin_ia32_ucomisdneq(a, b);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_ps(__m128d a) _mm_cvtpd_ps(__m128d a)
{ {
return __builtin_ia32_cvtpd2ps(a); return __builtin_ia32_cvtpd2ps(a);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pd(__m128 a) _mm_cvtps_pd(__m128 a)
{ {
return __builtin_ia32_cvtps2pd(a); return __builtin_ia32_cvtps2pd(a);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_pd(__m128i a) _mm_cvtepi32_pd(__m128i a)
{ {
return __builtin_ia32_cvtdq2pd((__v4si)a); return __builtin_ia32_cvtdq2pd((__v4si)a);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_epi32(__m128d a) _mm_cvtpd_epi32(__m128d a)
{ {
return __builtin_ia32_cvtpd2dq(a); return __builtin_ia32_cvtpd2dq(a);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si32(__m128d a) _mm_cvtsd_si32(__m128d a)
{ {
return __builtin_ia32_cvtsd2si(a); return __builtin_ia32_cvtsd2si(a);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_ss(__m128 a, __m128d b) _mm_cvtsd_ss(__m128 a, __m128d b)
{ {
a[0] = b[0]; a[0] = b[0];
return a; return a;
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_sd(__m128d a, int b) _mm_cvtsi32_sd(__m128d a, int b)
{ {
a[0] = b; a[0] = b;
return a; return a;
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_sd(__m128d a, __m128 b) _mm_cvtss_sd(__m128d a, __m128 b)
{ {
a[0] = b[0]; a[0] = b[0];
return a; return a;
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_epi32(__m128d a) _mm_cvttpd_epi32(__m128d a)
{ {
return (__m128i)__builtin_ia32_cvttpd2dq(a); return (__m128i)__builtin_ia32_cvttpd2dq(a);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si32(__m128d a) _mm_cvttsd_si32(__m128d a)
{ {
return a[0]; return a[0];
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_pi32(__m128d a) _mm_cvtpd_pi32(__m128d a)
{ {
return (__m64)__builtin_ia32_cvtpd2pi(a); return (__m64)__builtin_ia32_cvtpd2pi(a);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_pi32(__m128d a) _mm_cvttpd_pi32(__m128d a)
{ {
return (__m64)__builtin_ia32_cvttpd2pi(a); return (__m64)__builtin_ia32_cvttpd2pi(a);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_pd(__m64 a) _mm_cvtpi32_pd(__m64 a)
{ {
return __builtin_ia32_cvtpi2pd((__v2si)a); return __builtin_ia32_cvtpi2pd((__v2si)a);
} }
static inline double __attribute__((__always_inline__, __nodebug__)) static __inline__ double __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_f64(__m128d a) _mm_cvtsd_f64(__m128d a)
{ {
return a[0]; return a[0];
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_pd(double const *dp) _mm_load_pd(double const *dp)
{ {
return *(__m128d*)dp; return *(__m128d*)dp;
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load1_pd(double const *dp) _mm_load1_pd(double const *dp)
{ {
return (__m128d){ dp[0], dp[0] }; return (__m128d){ dp[0], dp[0] };
...@@ -454,542 +454,542 @@ _mm_load1_pd(double const *dp) ...@@ -454,542 +454,542 @@ _mm_load1_pd(double const *dp)
#define _mm_load_pd1(dp) _mm_load1_pd(dp) #define _mm_load_pd1(dp) _mm_load1_pd(dp)
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadr_pd(double const *dp) _mm_loadr_pd(double const *dp)
{ {
return (__m128d){ dp[1], dp[0] }; return (__m128d){ dp[1], dp[0] };
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadu_pd(double const *dp) _mm_loadu_pd(double const *dp)
{ {
return __builtin_ia32_loadupd(dp); return __builtin_ia32_loadupd(dp);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_sd(double const *dp) _mm_load_sd(double const *dp)
{ {
return (__m128d){ *dp, 0.0 }; return (__m128d){ *dp, 0.0 };
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pd(__m128d a, double const *dp) _mm_loadh_pd(__m128d a, double const *dp)
{ {
return __builtin_shufflevector(a, *(__m128d *)dp, 0, 2); return __builtin_shufflevector(a, *(__m128d *)dp, 0, 2);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pd(__m128d a, double const *dp) _mm_loadl_pd(__m128d a, double const *dp)
{ {
return __builtin_shufflevector(a, *(__m128d *)dp, 2, 1); return __builtin_shufflevector(a, *(__m128d *)dp, 2, 1);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_sd(double w) _mm_set_sd(double w)
{ {
return (__m128d){ w, 0 }; return (__m128d){ w, 0 };
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set1_pd(double w) _mm_set1_pd(double w)
{ {
return (__m128d){ w, w }; return (__m128d){ w, w };
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_pd(double w, double x) _mm_set_pd(double w, double x)
{ {
return (__m128d){ x, w }; return (__m128d){ x, w };
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setr_pd(double w, double x) _mm_setr_pd(double w, double x)
{ {
return (__m128d){ w, x }; return (__m128d){ w, x };
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setzero_pd(void) _mm_setzero_pd(void)
{ {
return (__m128d){ 0, 0 }; return (__m128d){ 0, 0 };
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_move_sd(__m128d a, __m128d b) _mm_move_sd(__m128d a, __m128d b)
{ {
return (__m128d){ b[0], a[1] }; return (__m128d){ b[0], a[1] };
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_sd(double *dp, __m128d a) _mm_store_sd(double *dp, __m128d a)
{ {
dp[0] = a[0]; dp[0] = a[0];
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_pd(double *dp, __m128d a) _mm_store1_pd(double *dp, __m128d a)
{ {
dp[0] = a[0]; dp[0] = a[0];
dp[1] = a[0]; dp[1] = a[0];
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_pd(double *dp, __m128d a) _mm_store_pd(double *dp, __m128d a)
{ {
*(__m128d *)dp = a; *(__m128d *)dp = a;
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_pd(double *dp, __m128d a) _mm_storeu_pd(double *dp, __m128d a)
{ {
__builtin_ia32_storeupd(dp, a); __builtin_ia32_storeupd(dp, a);
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_pd(double *dp, __m128d a) _mm_storer_pd(double *dp, __m128d a)
{ {
dp[0] = a[1]; dp[0] = a[1];
dp[1] = a[0]; dp[1] = a[0];
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeh_pd(double *dp, __m128d a) _mm_storeh_pd(double *dp, __m128d a)
{ {
dp[0] = a[1]; dp[0] = a[1];
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_pd(double *dp, __m128d a) _mm_storel_pd(double *dp, __m128d a)
{ {
dp[0] = a[0]; dp[0] = a[0];
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi8(__m128i a, __m128i b) _mm_add_epi8(__m128i a, __m128i b)
{ {
return (__m128i)((__v16qi)a + (__v16qi)b); return (__m128i)((__v16qi)a + (__v16qi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi16(__m128i a, __m128i b) _mm_add_epi16(__m128i a, __m128i b)
{ {
return (__m128i)((__v8hi)a + (__v8hi)b); return (__m128i)((__v8hi)a + (__v8hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi32(__m128i a, __m128i b) _mm_add_epi32(__m128i a, __m128i b)
{ {
return (__m128i)((__v4si)a + (__v4si)b); return (__m128i)((__v4si)a + (__v4si)b);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_add_si64(__m64 a, __m64 b) _mm_add_si64(__m64 a, __m64 b)
{ {
return a + b; return a + b;
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi64(__m128i a, __m128i b) _mm_add_epi64(__m128i a, __m128i b)
{ {
return a + b; return a + b;
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi8(__m128i a, __m128i b) _mm_adds_epi8(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b); return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi16(__m128i a, __m128i b) _mm_adds_epi16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b); return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu8(__m128i a, __m128i b) _mm_adds_epu8(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b); return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu16(__m128i a, __m128i b) _mm_adds_epu16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b); return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu8(__m128i a, __m128i b) _mm_avg_epu8(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b); return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu16(__m128i a, __m128i b) _mm_avg_epu16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b); return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_madd_epi16(__m128i a, __m128i b) _mm_madd_epi16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b); return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi16(__m128i a, __m128i b) _mm_max_epi16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b); return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu8(__m128i a, __m128i b) _mm_max_epu8(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b); return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi16(__m128i a, __m128i b) _mm_min_epi16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b); return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu8(__m128i a, __m128i b) _mm_min_epu8(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b); return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epi16(__m128i a, __m128i b) _mm_mulhi_epi16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b); return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epu16(__m128i a, __m128i b) _mm_mulhi_epu16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b); return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mullo_epi16(__m128i a, __m128i b) _mm_mullo_epi16(__m128i a, __m128i b)
{ {
return (__m128i)((__v8hi)a * (__v8hi)b); return (__m128i)((__v8hi)a * (__v8hi)b);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mul_su32(__m64 a, __m64 b) _mm_mul_su32(__m64 a, __m64 b)
{ {
return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b); return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mul_epu32(__m128i a, __m128i b) _mm_mul_epu32(__m128i a, __m128i b)
{ {
return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b); return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sad_epu8(__m128i a, __m128i b) _mm_sad_epu8(__m128i a, __m128i b)
{ {
return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b); return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi8(__m128i a, __m128i b) _mm_sub_epi8(__m128i a, __m128i b)
{ {
return (__m128i)((__v16qi)a - (__v16qi)b); return (__m128i)((__v16qi)a - (__v16qi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi16(__m128i a, __m128i b) _mm_sub_epi16(__m128i a, __m128i b)
{ {
return (__m128i)((__v8hi)a - (__v8hi)b); return (__m128i)((__v8hi)a - (__v8hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi32(__m128i a, __m128i b) _mm_sub_epi32(__m128i a, __m128i b)
{ {
return (__m128i)((__v4si)a - (__v4si)b); return (__m128i)((__v4si)a - (__v4si)b);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sub_si64(__m64 a, __m64 b) _mm_sub_si64(__m64 a, __m64 b)
{ {
return a - b; return a - b;
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi64(__m128i a, __m128i b) _mm_sub_epi64(__m128i a, __m128i b)
{ {
return a - b; return a - b;
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi8(__m128i a, __m128i b) _mm_subs_epi8(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b); return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi16(__m128i a, __m128i b) _mm_subs_epi16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b); return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu8(__m128i a, __m128i b) _mm_subs_epu8(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b); return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu16(__m128i a, __m128i b) _mm_subs_epu16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b); return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_and_si128(__m128i a, __m128i b) _mm_and_si128(__m128i a, __m128i b)
{ {
return a & b; return a & b;
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_andnot_si128(__m128i a, __m128i b) _mm_andnot_si128(__m128i a, __m128i b)
{ {
return ~a & b; return ~a & b;
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_or_si128(__m128i a, __m128i b) _mm_or_si128(__m128i a, __m128i b)
{ {
return a | b; return a | b;
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_xor_si128(__m128i a, __m128i b) _mm_xor_si128(__m128i a, __m128i b)
{ {
return a ^ b; return a ^ b;
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_si128(__m128i a, int imm) _mm_slli_si128(__m128i a, int imm)
{ {
return __builtin_ia32_pslldqi128(a, imm * 8); return __builtin_ia32_pslldqi128(a, imm * 8);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi16(__m128i a, int count) _mm_slli_epi16(__m128i a, int count)
{ {
return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count); return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi16(__m128i a, __m128i count) _mm_sll_epi16(__m128i a, __m128i count)
{ {
return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count); return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi32(__m128i a, int count) _mm_slli_epi32(__m128i a, int count)
{ {
return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count); return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi32(__m128i a, __m128i count) _mm_sll_epi32(__m128i a, __m128i count)
{ {
return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count); return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi64(__m128i a, int count) _mm_slli_epi64(__m128i a, int count)
{ {
return __builtin_ia32_psllqi128(a, count); return __builtin_ia32_psllqi128(a, count);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi64(__m128i a, __m128i count) _mm_sll_epi64(__m128i a, __m128i count)
{ {
return __builtin_ia32_psllq128(a, count); return __builtin_ia32_psllq128(a, count);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi16(__m128i a, int count) _mm_srai_epi16(__m128i a, int count)
{ {
return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count); return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi16(__m128i a, __m128i count) _mm_sra_epi16(__m128i a, __m128i count)
{ {
return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count); return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi32(__m128i a, int count) _mm_srai_epi32(__m128i a, int count)
{ {
return (__m128i)__builtin_ia32_psradi128((__v4si)a, count); return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi32(__m128i a, __m128i count) _mm_sra_epi32(__m128i a, __m128i count)
{ {
return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count); return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_si128(__m128i a, int imm) _mm_srli_si128(__m128i a, int imm)
{ {
return __builtin_ia32_psrldqi128(a, imm * 8); return __builtin_ia32_psrldqi128(a, imm * 8);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi16(__m128i a, int count) _mm_srli_epi16(__m128i a, int count)
{ {
return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count); return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi16(__m128i a, __m128i count) _mm_srl_epi16(__m128i a, __m128i count)
{ {
return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count); return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi32(__m128i a, int count) _mm_srli_epi32(__m128i a, int count)
{ {
return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count); return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi32(__m128i a, __m128i count) _mm_srl_epi32(__m128i a, __m128i count)
{ {
return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count); return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi64(__m128i a, int count) _mm_srli_epi64(__m128i a, int count)
{ {
return __builtin_ia32_psrlqi128(a, count); return __builtin_ia32_psrlqi128(a, count);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi64(__m128i a, __m128i count) _mm_srl_epi64(__m128i a, __m128i count)
{ {
return __builtin_ia32_psrlq128(a, count); return __builtin_ia32_psrlq128(a, count);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi8(__m128i a, __m128i b) _mm_cmpeq_epi8(__m128i a, __m128i b)
{ {
return (__m128i)((__v16qi)a == (__v16qi)b); return (__m128i)((__v16qi)a == (__v16qi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi16(__m128i a, __m128i b) _mm_cmpeq_epi16(__m128i a, __m128i b)
{ {
return (__m128i)((__v8hi)a == (__v8hi)b); return (__m128i)((__v8hi)a == (__v8hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi32(__m128i a, __m128i b) _mm_cmpeq_epi32(__m128i a, __m128i b)
{ {
return (__m128i)((__v4si)a == (__v4si)b); return (__m128i)((__v4si)a == (__v4si)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi8(__m128i a, __m128i b) _mm_cmpgt_epi8(__m128i a, __m128i b)
{ {
return (__m128i)((__v16qi)a > (__v16qi)b); return (__m128i)((__v16qi)a > (__v16qi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi16(__m128i a, __m128i b) _mm_cmpgt_epi16(__m128i a, __m128i b)
{ {
return (__m128i)((__v8hi)a > (__v8hi)b); return (__m128i)((__v8hi)a > (__v8hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi32(__m128i a, __m128i b) _mm_cmpgt_epi32(__m128i a, __m128i b)
{ {
return (__m128i)((__v4si)a > (__v4si)b); return (__m128i)((__v4si)a > (__v4si)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi8(__m128i a, __m128i b) _mm_cmplt_epi8(__m128i a, __m128i b)
{ {
return _mm_cmpgt_epi8(b,a); return _mm_cmpgt_epi8(b,a);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi16(__m128i a, __m128i b) _mm_cmplt_epi16(__m128i a, __m128i b)
{ {
return _mm_cmpgt_epi16(b,a); return _mm_cmpgt_epi16(b,a);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi32(__m128i a, __m128i b) _mm_cmplt_epi32(__m128i a, __m128i b)
{ {
return _mm_cmpgt_epi32(b,a); return _mm_cmpgt_epi32(b,a);
} }
#ifdef __x86_64__ #ifdef __x86_64__
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_sd(__m128d a, long long b) _mm_cvtsi64_sd(__m128d a, long long b)
{ {
a[0] = b; a[0] = b;
return a; return a;
} }
static inline long long __attribute__((__always_inline__, __nodebug__)) static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si64(__m128d a) _mm_cvtsd_si64(__m128d a)
{ {
return __builtin_ia32_cvtsd2si64(a); return __builtin_ia32_cvtsd2si64(a);
} }
static inline long long __attribute__((__always_inline__, __nodebug__)) static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si64(__m128d a) _mm_cvttsd_si64(__m128d a)
{ {
return a[0]; return a[0];
} }
#endif #endif
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_ps(__m128i a) _mm_cvtepi32_ps(__m128i a)
{ {
return __builtin_ia32_cvtdq2ps((__v4si)a); return __builtin_ia32_cvtdq2ps((__v4si)a);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_epi32(__m128 a) _mm_cvtps_epi32(__m128 a)
{ {
return (__m128i)__builtin_ia32_cvtps2dq(a); return (__m128i)__builtin_ia32_cvtps2dq(a);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_epi32(__m128 a) _mm_cvttps_epi32(__m128 a)
{ {
return (__m128i)__builtin_ia32_cvttps2dq(a); return (__m128i)__builtin_ia32_cvttps2dq(a);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_si128(int a) _mm_cvtsi32_si128(int a)
{ {
return (__m128i)(__v4si){ a, 0, 0, 0 }; return (__m128i)(__v4si){ a, 0, 0, 0 };
} }
#ifdef __x86_64__ #ifdef __x86_64__
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_si128(long long a) _mm_cvtsi64_si128(long long a)
{ {
return (__m128i){ a, 0 }; return (__m128i){ a, 0 };
} }
#endif #endif
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si32(__m128i a) _mm_cvtsi128_si32(__m128i a)
{ {
__v4si b = (__v4si)a; __v4si b = (__v4si)a;
...@@ -997,207 +997,207 @@ _mm_cvtsi128_si32(__m128i a) ...@@ -997,207 +997,207 @@ _mm_cvtsi128_si32(__m128i a)
} }
#ifdef __x86_64__ #ifdef __x86_64__
static inline long long __attribute__((__always_inline__, __nodebug__)) static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si64(__m128i a) _mm_cvtsi128_si64(__m128i a)
{ {
return a[0]; return a[0];
} }
#endif #endif
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_load_si128(__m128i const *p) _mm_load_si128(__m128i const *p)
{ {
return *p; return *p;
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadu_si128(__m128i const *p) _mm_loadu_si128(__m128i const *p)
{ {
return (__m128i)__builtin_ia32_loaddqu((char const *)p); return (__m128i)__builtin_ia32_loaddqu((char const *)p);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadl_epi64(__m128i const *p) _mm_loadl_epi64(__m128i const *p)
{ {
return (__m128i) { *(long long*)p, 0}; return (__m128i) { *(long long*)p, 0};
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi64x(long long q1, long long q0) _mm_set_epi64x(long long q1, long long q0)
{ {
return (__m128i){ q0, q1 }; return (__m128i){ q0, q1 };
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi64(__m64 q1, __m64 q0) _mm_set_epi64(__m64 q1, __m64 q0)
{ {
return (__m128i){ (long long)q0, (long long)q1 }; return (__m128i){ (long long)q0, (long long)q1 };
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi32(int i3, int i2, int i1, int i0) _mm_set_epi32(int i3, int i2, int i1, int i0)
{ {
return (__m128i)(__v4si){ i0, i1, i2, i3}; return (__m128i)(__v4si){ i0, i1, i2, i3};
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0) _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
{ {
return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0) _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
{ {
return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi64x(long long q) _mm_set1_epi64x(long long q)
{ {
return (__m128i){ q, q }; return (__m128i){ q, q };
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi64(__m64 q) _mm_set1_epi64(__m64 q)
{ {
return (__m128i){ (long long)q, (long long)q }; return (__m128i){ (long long)q, (long long)q };
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi32(int i) _mm_set1_epi32(int i)
{ {
return (__m128i)(__v4si){ i, i, i, i }; return (__m128i)(__v4si){ i, i, i, i };
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi16(short w) _mm_set1_epi16(short w)
{ {
return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w }; return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi8(char b) _mm_set1_epi8(char b)
{ {
return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b }; return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi64(__m64 q0, __m64 q1) _mm_setr_epi64(__m64 q0, __m64 q1)
{ {
return (__m128i){ (long long)q0, (long long)q1 }; return (__m128i){ (long long)q0, (long long)q1 };
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi32(int i0, int i1, int i2, int i3) _mm_setr_epi32(int i0, int i1, int i2, int i3)
{ {
return (__m128i)(__v4si){ i0, i1, i2, i3}; return (__m128i)(__v4si){ i0, i1, i2, i3};
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7) _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
{ {
return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15) _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
{ {
return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setzero_si128(void) _mm_setzero_si128(void)
{ {
return (__m128i){ 0LL, 0LL }; return (__m128i){ 0LL, 0LL };
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_si128(__m128i *p, __m128i b) _mm_store_si128(__m128i *p, __m128i b)
{ {
*p = b; *p = b;
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_si128(__m128i *p, __m128i b) _mm_storeu_si128(__m128i *p, __m128i b)
{ {
__builtin_ia32_storedqu((char *)p, (__v16qi)b); __builtin_ia32_storedqu((char *)p, (__v16qi)b);
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmoveu_si128(__m128i d, __m128i n, char *p) _mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
{ {
__builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p); __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storel_epi64(__m128i *p, __m128i a) _mm_storel_epi64(__m128i *p, __m128i a)
{ {
__builtin_ia32_storelv4si((__v2si *)p, a); __builtin_ia32_storelv4si((__v2si *)p, a);
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pd(double *p, __m128d a) _mm_stream_pd(double *p, __m128d a)
{ {
__builtin_ia32_movntpd(p, a); __builtin_ia32_movntpd(p, a);
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si128(__m128i *p, __m128i a) _mm_stream_si128(__m128i *p, __m128i a)
{ {
__builtin_ia32_movntdq(p, a); __builtin_ia32_movntdq(p, a);
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *p, int a) _mm_stream_si32(int *p, int a)
{ {
__builtin_ia32_movnti(p, a); __builtin_ia32_movnti(p, a);
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *p) _mm_clflush(void const *p)
{ {
__builtin_ia32_clflush(p); __builtin_ia32_clflush(p);
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void) _mm_lfence(void)
{ {
__builtin_ia32_lfence(); __builtin_ia32_lfence();
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void) _mm_mfence(void)
{ {
__builtin_ia32_mfence(); __builtin_ia32_mfence();
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi16(__m128i a, __m128i b) _mm_packs_epi16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b); return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi32(__m128i a, __m128i b) _mm_packs_epi32(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b); return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packus_epi16(__m128i a, __m128i b) _mm_packus_epi16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b); return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_extract_epi16(__m128i a, int imm) _mm_extract_epi16(__m128i a, int imm)
{ {
__v8hi b = (__v8hi)a; __v8hi b = (__v8hi)a;
return b[imm]; return b[imm];
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_insert_epi16(__m128i a, int b, int imm) _mm_insert_epi16(__m128i a, int b, int imm)
{ {
__v8hi c = (__v8hi)a; __v8hi c = (__v8hi)a;
...@@ -1205,7 +1205,7 @@ _mm_insert_epi16(__m128i a, int b, int imm) ...@@ -1205,7 +1205,7 @@ _mm_insert_epi16(__m128i a, int b, int imm)
return (__m128i)c; return (__m128i)c;
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_epi8(__m128i a) _mm_movemask_epi8(__m128i a)
{ {
return __builtin_ia32_pmovmskb128((__v16qi)a); return __builtin_ia32_pmovmskb128((__v16qi)a);
...@@ -1226,85 +1226,85 @@ _mm_movemask_epi8(__m128i a) ...@@ -1226,85 +1226,85 @@ _mm_movemask_epi8(__m128i a)
4 + ((imm) & 0x30) >> 4, \ 4 + ((imm) & 0x30) >> 4, \
4 + ((imm) & 0xc0) >> 6)) 4 + ((imm) & 0xc0) >> 6))
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi8(__m128i a, __m128i b) _mm_unpackhi_epi8(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi16(__m128i a, __m128i b) _mm_unpackhi_epi16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi32(__m128i a, __m128i b) _mm_unpackhi_epi32(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3); return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi64(__m128i a, __m128i b) _mm_unpackhi_epi64(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_shufflevector(a, b, 1, 2+1); return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi8(__m128i a, __m128i b) _mm_unpacklo_epi8(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi16(__m128i a, __m128i b) _mm_unpacklo_epi16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi32(__m128i a, __m128i b) _mm_unpacklo_epi32(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1); return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi64(__m128i a, __m128i b) _mm_unpacklo_epi64(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_shufflevector(a, b, 0, 2+0); return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_movepi64_pi64(__m128i a) _mm_movepi64_pi64(__m128i a)
{ {
return (__m64)a[0]; return (__m64)a[0];
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_movpi64_pi64(__m64 a) _mm_movpi64_pi64(__m64 a)
{ {
return (__m128i){ (long long)a, 0 }; return (__m128i){ (long long)a, 0 };
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_move_epi64(__m128i a) _mm_move_epi64(__m128i a)
{ {
return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2); return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_pd(__m128d a, __m128d b) _mm_unpackhi_pd(__m128d a, __m128d b)
{ {
return __builtin_shufflevector(a, b, 1, 2+1); return __builtin_shufflevector(a, b, 1, 2+1);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_pd(__m128d a, __m128d b) _mm_unpacklo_pd(__m128d a, __m128d b)
{ {
return __builtin_shufflevector(a, b, 0, 2+0); return __builtin_shufflevector(a, b, 0, 2+0);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pd(__m128d a) _mm_movemask_pd(__m128d a)
{ {
return __builtin_ia32_movmskpd(a); return __builtin_ia32_movmskpd(a);
...@@ -1313,43 +1313,43 @@ _mm_movemask_pd(__m128d a) ...@@ -1313,43 +1313,43 @@ _mm_movemask_pd(__m128d a)
/* Two-element shuffle: bit 0 of i picks which element of a becomes result
 * element 0, and bit 1 of i picks which element of b becomes result element 1
 * (the "+ 2" offsets the index into the second operand). */
#define _mm_shuffle_pd(a, b, i) \
  (__builtin_shufflevector((a), (b), (i) & 1, (((i) & 2) >> 1) + 2))
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castpd_ps(__m128d in) _mm_castpd_ps(__m128d in)
{ {
return (__m128)in; return (__m128)in;
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castpd_si128(__m128d in) _mm_castpd_si128(__m128d in)
{ {
return (__m128i)in; return (__m128i)in;
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castps_pd(__m128 in) _mm_castps_pd(__m128 in)
{ {
return (__m128d)in; return (__m128d)in;
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castps_si128(__m128 in) _mm_castps_si128(__m128 in)
{ {
return (__m128i)in; return (__m128i)in;
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_ps(__m128i in) _mm_castsi128_ps(__m128i in)
{ {
return (__m128)in; return (__m128)in;
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_pd(__m128i in) _mm_castsi128_pd(__m128i in)
{ {
return (__m128d)in; return (__m128d)in;
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void) _mm_pause(void)
{ {
__asm__ volatile ("pause"); __asm__ volatile ("pause");
......
...@@ -27,7 +27,7 @@ ...@@ -27,7 +27,7 @@
#include <errno.h> #include <errno.h>
#include <stdlib.h> #include <stdlib.h>
static inline void *__attribute__((__always_inline__, __nodebug__)) _mm_malloc(size_t size, size_t align) static __inline__ void *__attribute__((__always_inline__, __nodebug__)) _mm_malloc(size_t size, size_t align)
{ {
if (align & (align - 1)) { if (align & (align - 1)) {
errno = EINVAL; errno = EINVAL;
...@@ -50,7 +50,7 @@ static inline void *__attribute__((__always_inline__, __nodebug__)) _mm_malloc(s ...@@ -50,7 +50,7 @@ static inline void *__attribute__((__always_inline__, __nodebug__)) _mm_malloc(s
return alignedMemory; return alignedMemory;
} }
static inline void __attribute__((__always_inline__, __nodebug__)) _mm_free(void *p) static __inline__ void __attribute__((__always_inline__, __nodebug__)) _mm_free(void *p)
{ {
if (p) if (p)
free(((void **)p)[-1]); free(((void **)p)[-1]);
......
...@@ -34,409 +34,409 @@ typedef int __v2si __attribute__((__vector_size__(8))); ...@@ -34,409 +34,409 @@ typedef int __v2si __attribute__((__vector_size__(8)));
/* 8-byte vector views used by the MMX intrinsics below. */
typedef short __v4hi __attribute__((__vector_size__(8))); /* vector of short */
typedef char __v8qi __attribute__((__vector_size__(8)));  /* vector of char */
/* Invokes __builtin_ia32_emms(); takes no arguments and returns nothing. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_empty(void)
{
  __builtin_ia32_emms();
}
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_si64(int __i) _mm_cvtsi32_si64(int __i)
{ {
return (__m64)(__v2si){__i, 0}; return (__m64)(__v2si){__i, 0};
} }
/* Returns element 0 of __m when it is viewed as a vector of two ints. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_si32(__m64 __m)
{
  __v2si __lanes = (__v2si)__m;
  return __lanes[0];
}
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_m64(long long __i) _mm_cvtsi64_m64(long long __i)
{ {
return (__m64)__i; return (__m64)__i;
} }
/* Returns the __m64 argument cast to long long. */
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtm64_si64(__m64 __m)
{
  long long __out = (long long)__m;
  return __out;
}
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_packs_pi16(__m64 __m1, __m64 __m2) _mm_packs_pi16(__m64 __m1, __m64 __m2)
{ {
return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2); return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_packs_pi32(__m64 __m1, __m64 __m2) _mm_packs_pi32(__m64 __m1, __m64 __m2)
{ {
return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2); return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_packs_pu16(__m64 __m1, __m64 __m2) _mm_packs_pu16(__m64 __m1, __m64 __m2)
{ {
return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2); return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_pi8(__m64 __m1, __m64 __m2) _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
{ {
return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, 4, 8+4, 5, return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, 4, 8+4, 5,
8+5, 6, 8+6, 7, 8+7); 8+5, 6, 8+6, 7, 8+7);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_pi16(__m64 __m1, __m64 __m2) _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
{ {
return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, 2, 4+2, 3, return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, 2, 4+2, 3,
4+3); 4+3);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_pi32(__m64 __m1, __m64 __m2) _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
{ {
return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 2+1); return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 2+1);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_pi8(__m64 __m1, __m64 __m2) _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
{ {
return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, 0, 8+0, 1, return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, 0, 8+0, 1,
8+1, 2, 8+2, 3, 8+3); 8+1, 2, 8+2, 3, 8+3);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_pi16(__m64 __m1, __m64 __m2) _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
{ {
return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, 0, 4+0, 1, return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, 0, 4+0, 1,
4+1); 4+1);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_pi32(__m64 __m1, __m64 __m2) _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
{ {
return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2+0); return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2+0);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_add_pi8(__m64 __m1, __m64 __m2) _mm_add_pi8(__m64 __m1, __m64 __m2)
{ {
return (__m64)((__v8qi)__m1 + (__v8qi)__m2); return (__m64)((__v8qi)__m1 + (__v8qi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_add_pi16(__m64 __m1, __m64 __m2) _mm_add_pi16(__m64 __m1, __m64 __m2)
{ {
return (__m64)((__v4hi)__m1 + (__v4hi)__m2); return (__m64)((__v4hi)__m1 + (__v4hi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_add_pi32(__m64 __m1, __m64 __m2) _mm_add_pi32(__m64 __m1, __m64 __m2)
{ {
return (__m64)((__v2si)__m1 + (__v2si)__m2); return (__m64)((__v2si)__m1 + (__v2si)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_adds_pi8(__m64 __m1, __m64 __m2) _mm_adds_pi8(__m64 __m1, __m64 __m2)
{ {
return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2); return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_adds_pi16(__m64 __m1, __m64 __m2) _mm_adds_pi16(__m64 __m1, __m64 __m2)
{ {
return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2); return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_adds_pu8(__m64 __m1, __m64 __m2) _mm_adds_pu8(__m64 __m1, __m64 __m2)
{ {
return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2); return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_adds_pu16(__m64 __m1, __m64 __m2) _mm_adds_pu16(__m64 __m1, __m64 __m2)
{ {
return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2); return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sub_pi8(__m64 __m1, __m64 __m2) _mm_sub_pi8(__m64 __m1, __m64 __m2)
{ {
return (__m64)((__v8qi)__m1 - (__v8qi)__m2); return (__m64)((__v8qi)__m1 - (__v8qi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sub_pi16(__m64 __m1, __m64 __m2) _mm_sub_pi16(__m64 __m1, __m64 __m2)
{ {
return (__m64)((__v4hi)__m1 - (__v4hi)__m2); return (__m64)((__v4hi)__m1 - (__v4hi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sub_pi32(__m64 __m1, __m64 __m2) _mm_sub_pi32(__m64 __m1, __m64 __m2)
{ {
return (__m64)((__v2si)__m1 - (__v2si)__m2); return (__m64)((__v2si)__m1 - (__v2si)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_subs_pi8(__m64 __m1, __m64 __m2) _mm_subs_pi8(__m64 __m1, __m64 __m2)
{ {
return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2); return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_subs_pi16(__m64 __m1, __m64 __m2) _mm_subs_pi16(__m64 __m1, __m64 __m2)
{ {
return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2); return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_subs_pu8(__m64 __m1, __m64 __m2) _mm_subs_pu8(__m64 __m1, __m64 __m2)
{ {
return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2); return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_subs_pu16(__m64 __m1, __m64 __m2) _mm_subs_pu16(__m64 __m1, __m64 __m2)
{ {
return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2); return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_madd_pi16(__m64 __m1, __m64 __m2) _mm_madd_pi16(__m64 __m1, __m64 __m2)
{ {
return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2); return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_pi16(__m64 __m1, __m64 __m2) _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
{ {
return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2); return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mullo_pi16(__m64 __m1, __m64 __m2) _mm_mullo_pi16(__m64 __m1, __m64 __m2)
{ {
return (__m64)((__v4hi)__m1 * (__v4hi)__m2); return (__m64)((__v4hi)__m1 * (__v4hi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sll_pi16(__m64 __m, __m64 __count) _mm_sll_pi16(__m64 __m, __m64 __count)
{ {
return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count); return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_slli_pi16(__m64 __m, int __count) _mm_slli_pi16(__m64 __m, int __count)
{ {
return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count); return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sll_pi32(__m64 __m, __m64 __count) _mm_sll_pi32(__m64 __m, __m64 __count)
{ {
return (__m64)__builtin_ia32_pslld((__v2si)__m, __count); return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_slli_pi32(__m64 __m, int __count) _mm_slli_pi32(__m64 __m, int __count)
{ {
return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count); return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sll_si64(__m64 __m, __m64 __count) _mm_sll_si64(__m64 __m, __m64 __count)
{ {
return __builtin_ia32_psllq(__m, __count); return __builtin_ia32_psllq(__m, __count);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_slli_si64(__m64 __m, int __count) _mm_slli_si64(__m64 __m, int __count)
{ {
return __builtin_ia32_psllqi(__m, __count); return __builtin_ia32_psllqi(__m, __count);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sra_pi16(__m64 __m, __m64 __count) _mm_sra_pi16(__m64 __m, __m64 __count)
{ {
return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count); return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_srai_pi16(__m64 __m, int __count) _mm_srai_pi16(__m64 __m, int __count)
{ {
return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count); return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sra_pi32(__m64 __m, __m64 __count) _mm_sra_pi32(__m64 __m, __m64 __count)
{ {
return (__m64)__builtin_ia32_psrad((__v2si)__m, __count); return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_srai_pi32(__m64 __m, int __count) _mm_srai_pi32(__m64 __m, int __count)
{ {
return (__m64)__builtin_ia32_psradi((__v2si)__m, __count); return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_srl_pi16(__m64 __m, __m64 __count) _mm_srl_pi16(__m64 __m, __m64 __count)
{ {
return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count); return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_srli_pi16(__m64 __m, int __count) _mm_srli_pi16(__m64 __m, int __count)
{ {
return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count); return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_srl_pi32(__m64 __m, __m64 __count) _mm_srl_pi32(__m64 __m, __m64 __count)
{ {
return (__m64)__builtin_ia32_psrld((__v2si)__m, __count); return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_srli_pi32(__m64 __m, int __count) _mm_srli_pi32(__m64 __m, int __count)
{ {
return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count); return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_srl_si64(__m64 __m, __m64 __count) _mm_srl_si64(__m64 __m, __m64 __count)
{ {
return (__m64)__builtin_ia32_psrlq(__m, __count); return (__m64)__builtin_ia32_psrlq(__m, __count);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_srli_si64(__m64 __m, int __count) _mm_srli_si64(__m64 __m, int __count)
{ {
return __builtin_ia32_psrlqi(__m, __count); return __builtin_ia32_psrlqi(__m, __count);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_and_si64(__m64 __m1, __m64 __m2) _mm_and_si64(__m64 __m1, __m64 __m2)
{ {
return __m1 & __m2; return __m1 & __m2;
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_si64(__m64 __m1, __m64 __m2) _mm_andnot_si64(__m64 __m1, __m64 __m2)
{ {
return ~__m1 & __m2; return ~__m1 & __m2;
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_or_si64(__m64 __m1, __m64 __m2) _mm_or_si64(__m64 __m1, __m64 __m2)
{ {
return __m1 | __m2; return __m1 | __m2;
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_xor_si64(__m64 __m1, __m64 __m2) _mm_xor_si64(__m64 __m1, __m64 __m2)
{ {
return __m1 ^ __m2; return __m1 ^ __m2;
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_pi8(__m64 __m1, __m64 __m2) _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
{ {
return (__m64)((__v8qi)__m1 == (__v8qi)__m2); return (__m64)((__v8qi)__m1 == (__v8qi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_pi16(__m64 __m1, __m64 __m2) _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
{ {
return (__m64)((__v4hi)__m1 == (__v4hi)__m2); return (__m64)((__v4hi)__m1 == (__v4hi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_pi32(__m64 __m1, __m64 __m2) _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
{ {
return (__m64)((__v2si)__m1 == (__v2si)__m2); return (__m64)((__v2si)__m1 == (__v2si)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_pi8(__m64 __m1, __m64 __m2) _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
{ {
return (__m64)((__v8qi)__m1 > (__v8qi)__m2); return (__m64)((__v8qi)__m1 > (__v8qi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_pi16(__m64 __m1, __m64 __m2) _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
{ {
return (__m64)((__v4hi)__m1 > (__v4hi)__m2); return (__m64)((__v4hi)__m1 > (__v4hi)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_pi32(__m64 __m1, __m64 __m2) _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
{ {
return (__m64)((__v2si)__m1 > (__v2si)__m2); return (__m64)((__v2si)__m1 > (__v2si)__m2);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_setzero_si64(void) _mm_setzero_si64(void)
{ {
return (__m64){ 0LL }; return (__m64){ 0LL };
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_set_pi32(int __i1, int __i0) _mm_set_pi32(int __i1, int __i0)
{ {
return (__m64)(__v2si){ __i0, __i1 }; return (__m64)(__v2si){ __i0, __i1 };
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_set_pi16(short __s3, short __s2, short __s1, short __s0) _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
{ {
return (__m64)(__v4hi){ __s0, __s1, __s2, __s3 }; return (__m64)(__v4hi){ __s0, __s1, __s2, __s3 };
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
char __b1, char __b0) char __b1, char __b0)
{ {
return (__m64)(__v8qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7 }; return (__m64)(__v8qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7 };
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_set1_pi32(int __i) _mm_set1_pi32(int __i)
{ {
return (__m64)(__v2si){ __i, __i }; return (__m64)(__v2si){ __i, __i };
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_set1_pi16(short __s) _mm_set1_pi16(short __s)
{ {
return (__m64)(__v4hi){ __s, __s, __s, __s }; return (__m64)(__v4hi){ __s, __s, __s, __s };
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_set1_pi8(char __b) _mm_set1_pi8(char __b)
{ {
return (__m64)(__v8qi){ __b, __b, __b, __b, __b, __b, __b, __b }; return (__m64)(__v8qi){ __b, __b, __b, __b, __b, __b, __b, __b };
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_setr_pi32(int __i1, int __i0) _mm_setr_pi32(int __i1, int __i0)
{ {
return (__m64)(__v2si){ __i1, __i0 }; return (__m64)(__v2si){ __i1, __i0 };
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_setr_pi16(short __s3, short __s2, short __s1, short __s0) _mm_setr_pi16(short __s3, short __s2, short __s1, short __s0)
{ {
return (__m64)(__v4hi){ __s3, __s2, __s1, __s0 }; return (__m64)(__v4hi){ __s3, __s2, __s1, __s0 };
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_setr_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, _mm_setr_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
char __b1, char __b0) char __b1, char __b0)
{ {
......
...@@ -30,67 +30,67 @@ ...@@ -30,67 +30,67 @@
#include <emmintrin.h> #include <emmintrin.h>
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_lddqu_si128(__m128i const *p) _mm_lddqu_si128(__m128i const *p)
{ {
return (__m128i)__builtin_ia32_lddqu((char const *)p); return (__m128i)__builtin_ia32_lddqu((char const *)p);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_addsub_ps(__m128 a, __m128 b) _mm_addsub_ps(__m128 a, __m128 b)
{ {
return __builtin_ia32_addsubps(a, b); return __builtin_ia32_addsubps(a, b);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_hadd_ps(__m128 a, __m128 b) _mm_hadd_ps(__m128 a, __m128 b)
{ {
return __builtin_ia32_haddps(a, b); return __builtin_ia32_haddps(a, b);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_hsub_ps(__m128 a, __m128 b) _mm_hsub_ps(__m128 a, __m128 b)
{ {
return __builtin_ia32_hsubps(a, b); return __builtin_ia32_hsubps(a, b);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehdup_ps(__m128 a) _mm_movehdup_ps(__m128 a)
{ {
return __builtin_shufflevector(a, a, 1, 1, 3, 3); return __builtin_shufflevector(a, a, 1, 1, 3, 3);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_moveldup_ps(__m128 a) _mm_moveldup_ps(__m128 a)
{ {
return __builtin_shufflevector(a, a, 0, 0, 2, 2); return __builtin_shufflevector(a, a, 0, 0, 2, 2);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_addsub_pd(__m128d a, __m128d b) _mm_addsub_pd(__m128d a, __m128d b)
{ {
return __builtin_ia32_addsubpd(a, b); return __builtin_ia32_addsubpd(a, b);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_hadd_pd(__m128d a, __m128d b) _mm_hadd_pd(__m128d a, __m128d b)
{ {
return __builtin_ia32_haddpd(a, b); return __builtin_ia32_haddpd(a, b);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_hsub_pd(__m128d a, __m128d b) _mm_hsub_pd(__m128d a, __m128d b)
{ {
return __builtin_ia32_hsubpd(a, b); return __builtin_ia32_hsubpd(a, b);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loaddup_pd(double const *dp) _mm_loaddup_pd(double const *dp)
{ {
return (__m128d){ *dp, *dp }; return (__m128d){ *dp, *dp };
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_movedup_pd(__m128d a) _mm_movedup_pd(__m128d a)
{ {
return __builtin_shufflevector(a, a, 0, 0); return __builtin_shufflevector(a, a, 0, 0);
...@@ -104,13 +104,13 @@ _mm_movedup_pd(__m128d a) ...@@ -104,13 +104,13 @@ _mm_movedup_pd(__m128d a)
/* Reads the value returned by _mm_getcsr() and keeps only the bits selected by
 * _MM_DENORMALS_ZERO_MASK. */
#define _MM_GET_DENORMALS_ZERO_MODE() \
  (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
/* Clears the _MM_DENORMALS_ZERO_MASK bits of the current _mm_getcsr() value,
 * ORs in x, and writes the result back with _mm_setcsr(). */
#define _MM_SET_DENORMALS_ZERO_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
/* Forwards p (with its const qualifier cast away), extensions and hints to
 * __builtin_ia32_monitor. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_monitor(void const *p, unsigned extensions, unsigned hints)
{
  void *__addr = (void *)p;
  __builtin_ia32_monitor(__addr, extensions, hints);
}
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_mwait(unsigned extensions, unsigned hints) _mm_mwait(unsigned extensions, unsigned hints)
{ {
__builtin_ia32_mwait(extensions, hints); __builtin_ia32_mwait(extensions, hints);
......
...@@ -67,53 +67,53 @@ typedef long long __v2di __attribute__ ((__vector_size__ (16))); ...@@ -67,53 +67,53 @@ typedef long long __v2di __attribute__ ((__vector_size__ (16)));
/* Expands to a call of __builtin_ia32_roundsd with X, Y and M, each argument
 * parenthesized. */
#define _mm_round_sd(X, Y, M) \
  __builtin_ia32_roundsd((X), (Y), (M))
/* SSE4 Packed Blending Intrinsics. */ /* SSE4 Packed Blending Intrinsics. */
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_blend_pd (__m128d __V1, __m128d __V2, const int __M) _mm_blend_pd (__m128d __V1, __m128d __V2, const int __M)
{ {
return (__m128d) __builtin_ia32_blendpd ((__v2df)__V1, (__v2df)__V2, __M); return (__m128d) __builtin_ia32_blendpd ((__v2df)__V1, (__v2df)__V2, __M);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_blend_ps (__m128 __V1, __m128 __V2, const int __M) _mm_blend_ps (__m128 __V1, __m128 __V2, const int __M)
{ {
return (__m128) __builtin_ia32_blendps ((__v4sf)__V1, (__v4sf)__V2, __M); return (__m128) __builtin_ia32_blendps ((__v4sf)__V1, (__v4sf)__V2, __M);
} }
static inline __m128d __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M) _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
{ {
return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2, return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
(__v2df)__M); (__v2df)__M);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M) _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
{ {
return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2, return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
(__v4sf)__M); (__v4sf)__M);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
{ {
return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2, return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
(__v16qi)__M); (__v16qi)__M);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_blend_epi16 (__m128i __V1, __m128i __V2, const int __M) _mm_blend_epi16 (__m128i __V1, __m128i __V2, const int __M)
{ {
return (__m128i) __builtin_ia32_pblendw128 ((__v8hi)__V1, (__v8hi)__V2, __M); return (__m128i) __builtin_ia32_pblendw128 ((__v8hi)__V1, (__v8hi)__V2, __M);
} }
/* SSE4 Dword Multiply Instructions. */ /* SSE4 Dword Multiply Instructions. */
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mullo_epi32 (__m128i __V1, __m128i __V2) _mm_mullo_epi32 (__m128i __V1, __m128i __V2)
{ {
return (__m128i) __builtin_ia32_pmulld128((__v4si)__V1, (__v4si)__V2); return (__m128i) __builtin_ia32_pmulld128((__v4si)__V1, (__v4si)__V2);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mul_epi32 (__m128i __V1, __m128i __V2) _mm_mul_epi32 (__m128i __V1, __m128i __V2)
{ {
return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2); return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
...@@ -124,56 +124,56 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2) ...@@ -124,56 +124,56 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
#define _mm_dp_pd(X, Y, M) __builtin_ia32_dppd ((X), (Y), (M)) #define _mm_dp_pd(X, Y, M) __builtin_ia32_dppd ((X), (Y), (M))
/* SSE4 Streaming Load Hint Instruction. */ /* SSE4 Streaming Load Hint Instruction. */
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_stream_load_si128 (__m128i *__V) _mm_stream_load_si128 (__m128i *__V)
{ {
return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __V); return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __V);
} }
/* SSE4 Packed Integer Min/Max Instructions. */ /* SSE4 Packed Integer Min/Max Instructions. */
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi8 (__m128i __V1, __m128i __V2) _mm_min_epi8 (__m128i __V1, __m128i __V2)
{ {
return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2); return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi8 (__m128i __V1, __m128i __V2) _mm_max_epi8 (__m128i __V1, __m128i __V2)
{ {
return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2); return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu16 (__m128i __V1, __m128i __V2) _mm_min_epu16 (__m128i __V1, __m128i __V2)
{ {
return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2); return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu16 (__m128i __V1, __m128i __V2) _mm_max_epu16 (__m128i __V1, __m128i __V2)
{ {
return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2); return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi32 (__m128i __V1, __m128i __V2) _mm_min_epi32 (__m128i __V1, __m128i __V2)
{ {
return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2); return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi32 (__m128i __V1, __m128i __V2) _mm_max_epi32 (__m128i __V1, __m128i __V2)
{ {
return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2); return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu32 (__m128i __V1, __m128i __V2) _mm_min_epu32 (__m128i __V1, __m128i __V2)
{ {
return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2); return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu32 (__m128i __V1, __m128i __V2) _mm_max_epu32 (__m128i __V1, __m128i __V2)
{ {
return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2); return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
...@@ -224,19 +224,19 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) ...@@ -224,19 +224,19 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
#endif /* __x86_64 */ #endif /* __x86_64 */
/* SSE4 128-bit Packed Integer Comparisons. */ /* SSE4 128-bit Packed Integer Comparisons. */
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_testz_si128(__m128i __M, __m128i __V) _mm_testz_si128(__m128i __M, __m128i __V)
{ {
return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_testc_si128(__m128i __M, __m128i __V) _mm_testc_si128(__m128i __M, __m128i __V)
{ {
return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_si128(__m128i __M, __m128i __V) _mm_testnzc_si128(__m128i __M, __m128i __V)
{ {
return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
...@@ -247,88 +247,88 @@ _mm_testnzc_si128(__m128i __M, __m128i __V) ...@@ -247,88 +247,88 @@ _mm_testnzc_si128(__m128i __M, __m128i __V)
#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((V), (V)) #define _mm_test_all_zeros(M, V) _mm_testz_si128 ((V), (V))
/* SSE4 64-bit Packed Integer Comparisons. */ /* SSE4 64-bit Packed Integer Comparisons. */
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi64(__m128i __V1, __m128i __V2) _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
{ {
return (__m128i) __builtin_ia32_pcmpeqq((__v2di)__V1, (__v2di)__V2); return (__m128i) __builtin_ia32_pcmpeqq((__v2di)__V1, (__v2di)__V2);
} }
/* SSE4 Packed Integer Sign-Extension. */ /* SSE4 Packed Integer Sign-Extension. */
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi8_epi16(__m128i __V) _mm_cvtepi8_epi16(__m128i __V)
{ {
return (__m128i) __builtin_ia32_pmovsxbw128((__v16qi) __V); return (__m128i) __builtin_ia32_pmovsxbw128((__v16qi) __V);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi8_epi32(__m128i __V) _mm_cvtepi8_epi32(__m128i __V)
{ {
return (__m128i) __builtin_ia32_pmovsxbd128((__v16qi) __V); return (__m128i) __builtin_ia32_pmovsxbd128((__v16qi) __V);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi8_epi64(__m128i __V) _mm_cvtepi8_epi64(__m128i __V)
{ {
return (__m128i) __builtin_ia32_pmovsxbq128((__v16qi) __V); return (__m128i) __builtin_ia32_pmovsxbq128((__v16qi) __V);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi16_epi32(__m128i __V) _mm_cvtepi16_epi32(__m128i __V)
{ {
return (__m128i) __builtin_ia32_pmovsxwd128((__v8hi) __V); return (__m128i) __builtin_ia32_pmovsxwd128((__v8hi) __V);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi16_epi64(__m128i __V) _mm_cvtepi16_epi64(__m128i __V)
{ {
return (__m128i) __builtin_ia32_pmovsxwq128((__v8hi)__V); return (__m128i) __builtin_ia32_pmovsxwq128((__v8hi)__V);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_epi64(__m128i __V) _mm_cvtepi32_epi64(__m128i __V)
{ {
return (__m128i) __builtin_ia32_pmovsxdq128((__v4si)__V); return (__m128i) __builtin_ia32_pmovsxdq128((__v4si)__V);
} }
/* SSE4 Packed Integer Zero-Extension. */ /* SSE4 Packed Integer Zero-Extension. */
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu8_epi16(__m128i __V) _mm_cvtepu8_epi16(__m128i __V)
{ {
return (__m128i) __builtin_ia32_pmovzxbw128((__v16qi) __V); return (__m128i) __builtin_ia32_pmovzxbw128((__v16qi) __V);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu8_epi32(__m128i __V) _mm_cvtepu8_epi32(__m128i __V)
{ {
return (__m128i) __builtin_ia32_pmovzxbd128((__v16qi)__V); return (__m128i) __builtin_ia32_pmovzxbd128((__v16qi)__V);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu8_epi64(__m128i __V) _mm_cvtepu8_epi64(__m128i __V)
{ {
return (__m128i) __builtin_ia32_pmovzxbq128((__v16qi)__V); return (__m128i) __builtin_ia32_pmovzxbq128((__v16qi)__V);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu16_epi32(__m128i __V) _mm_cvtepu16_epi32(__m128i __V)
{ {
return (__m128i) __builtin_ia32_pmovzxwd128((__v8hi)__V); return (__m128i) __builtin_ia32_pmovzxwd128((__v8hi)__V);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu16_epi64(__m128i __V) _mm_cvtepu16_epi64(__m128i __V)
{ {
return (__m128i) __builtin_ia32_pmovzxwq128((__v8hi)__V); return (__m128i) __builtin_ia32_pmovzxwq128((__v8hi)__V);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtepu32_epi64(__m128i __V) _mm_cvtepu32_epi64(__m128i __V)
{ {
return (__m128i) __builtin_ia32_pmovzxdq128((__v4si)__V); return (__m128i) __builtin_ia32_pmovzxdq128((__v4si)__V);
} }
/* SSE4 Pack with Unsigned Saturation. */ /* SSE4 Pack with Unsigned Saturation. */
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packus_epi32(__m128i __V1, __m128i __V2) _mm_packus_epi32(__m128i __V1, __m128i __V2)
{ {
return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2); return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
...@@ -400,33 +400,33 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2) ...@@ -400,33 +400,33 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
__builtin_ia32_pcmpestriz128((A), (LA), (B), (LB), (M)) __builtin_ia32_pcmpestriz128((A), (LA), (B), (LB), (M))
/* SSE4.2 Compare Packed Data -- Greater Than. */ /* SSE4.2 Compare Packed Data -- Greater Than. */
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi64(__m128i __V1, __m128i __V2) _mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
{ {
return __builtin_ia32_pcmpgtq((__v2di)__V1, (__v2di)__V2); return __builtin_ia32_pcmpgtq((__v2di)__V1, (__v2di)__V2);
} }
/* SSE4.2 Accumulate CRC32. */ /* SSE4.2 Accumulate CRC32. */
static inline unsigned int __attribute__((__always_inline__, __nodebug__)) static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_crc32_u8(unsigned int __C, unsigned char __D) _mm_crc32_u8(unsigned int __C, unsigned char __D)
{ {
return __builtin_ia32_crc32qi(__C, __D); return __builtin_ia32_crc32qi(__C, __D);
} }
static inline unsigned int __attribute__((__always_inline__, __nodebug__)) static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_crc32_u16(unsigned int __C, unsigned short __D) _mm_crc32_u16(unsigned int __C, unsigned short __D)
{ {
return __builtin_ia32_crc32hi(__C, __D); return __builtin_ia32_crc32hi(__C, __D);
} }
static inline unsigned int __attribute__((__always_inline__, __nodebug__)) static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_crc32_u32(unsigned int __C, unsigned int __D) _mm_crc32_u32(unsigned int __C, unsigned int __D)
{ {
return __builtin_ia32_crc32si(__C, __D); return __builtin_ia32_crc32si(__C, __D);
} }
#ifdef __x86_64__ #ifdef __x86_64__
static inline unsigned long long __attribute__((__always_inline__, __nodebug__)) static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
_mm_crc32_u64(unsigned long long __C, unsigned long long __D) _mm_crc32_u64(unsigned long long __C, unsigned long long __D)
{ {
return __builtin_ia32_crc32di(__C, __D); return __builtin_ia32_crc32di(__C, __D);
...@@ -434,14 +434,14 @@ _mm_crc32_u64(unsigned long long __C, unsigned long long __D) ...@@ -434,14 +434,14 @@ _mm_crc32_u64(unsigned long long __C, unsigned long long __D)
#endif /* __x86_64__ */ #endif /* __x86_64__ */
/* SSE4.2 Population Count. */ /* SSE4.2 Population Count. */
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_popcnt_u32(unsigned int __A) _mm_popcnt_u32(unsigned int __A)
{ {
return __builtin_popcount(__A); return __builtin_popcount(__A);
} }
#ifdef __x86_64__ #ifdef __x86_64__
static inline long long __attribute__((__always_inline__, __nodebug__)) static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_popcnt_u64(unsigned long long __A) _mm_popcnt_u64(unsigned long long __A)
{ {
return __builtin_popcountll(__A); return __builtin_popcountll(__A);
......
...@@ -30,37 +30,37 @@ ...@@ -30,37 +30,37 @@
#include <pmmintrin.h> #include <pmmintrin.h>
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_abs_pi8(__m64 a) _mm_abs_pi8(__m64 a)
{ {
return (__m64)__builtin_ia32_pabsb((__v8qi)a); return (__m64)__builtin_ia32_pabsb((__v8qi)a);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_abs_epi8(__m128i a) _mm_abs_epi8(__m128i a)
{ {
return (__m128i)__builtin_ia32_pabsb128((__v16qi)a); return (__m128i)__builtin_ia32_pabsb128((__v16qi)a);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_abs_pi16(__m64 a) _mm_abs_pi16(__m64 a)
{ {
return (__m64)__builtin_ia32_pabsw((__v4hi)a); return (__m64)__builtin_ia32_pabsw((__v4hi)a);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_abs_epi16(__m128i a) _mm_abs_epi16(__m128i a)
{ {
return (__m128i)__builtin_ia32_pabsw128((__v8hi)a); return (__m128i)__builtin_ia32_pabsw128((__v8hi)a);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_abs_pi32(__m64 a) _mm_abs_pi32(__m64 a)
{ {
return (__m64)__builtin_ia32_pabsd((__v2si)a); return (__m64)__builtin_ia32_pabsd((__v2si)a);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_abs_epi32(__m128i a) _mm_abs_epi32(__m128i a)
{ {
return (__m128i)__builtin_ia32_pabsd128((__v4si)a); return (__m128i)__builtin_ia32_pabsd128((__v4si)a);
...@@ -69,145 +69,145 @@ _mm_abs_epi32(__m128i a) ...@@ -69,145 +69,145 @@ _mm_abs_epi32(__m128i a)
#define _mm_alignr_epi8(a, b, n) (__builtin_ia32_palignr128((a), (b), (n))) #define _mm_alignr_epi8(a, b, n) (__builtin_ia32_palignr128((a), (b), (n)))
#define _mm_alignr_pi8(a, b, n) (__builtin_ia32_palignr((a), (b), (n*8))) #define _mm_alignr_pi8(a, b, n) (__builtin_ia32_palignr((a), (b), (n*8)))
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_hadd_epi16(__m128i a, __m128i b) _mm_hadd_epi16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_phaddw128((__v8hi)a, (__v8hi)b); return (__m128i)__builtin_ia32_phaddw128((__v8hi)a, (__v8hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_hadd_epi32(__m128i a, __m128i b) _mm_hadd_epi32(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_phaddd128((__v4si)a, (__v4si)b); return (__m128i)__builtin_ia32_phaddd128((__v4si)a, (__v4si)b);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_hadd_pi16(__m64 a, __m64 b) _mm_hadd_pi16(__m64 a, __m64 b)
{ {
return (__m64)__builtin_ia32_phaddw((__v4hi)a, (__v4hi)b); return (__m64)__builtin_ia32_phaddw((__v4hi)a, (__v4hi)b);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_hadd_pi32(__m64 a, __m64 b) _mm_hadd_pi32(__m64 a, __m64 b)
{ {
return (__m64)__builtin_ia32_phaddd((__v2si)a, (__v2si)b); return (__m64)__builtin_ia32_phaddd((__v2si)a, (__v2si)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_hadds_epi16(__m128i a, __m128i b) _mm_hadds_epi16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_phaddsw128((__v8hi)a, (__v8hi)b); return (__m128i)__builtin_ia32_phaddsw128((__v8hi)a, (__v8hi)b);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_hadds_pi16(__m64 a, __m64 b) _mm_hadds_pi16(__m64 a, __m64 b)
{ {
return (__m64)__builtin_ia32_phaddsw((__v4hi)a, (__v4hi)b); return (__m64)__builtin_ia32_phaddsw((__v4hi)a, (__v4hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_hsub_epi16(__m128i a, __m128i b) _mm_hsub_epi16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_phsubw128((__v8hi)a, (__v8hi)b); return (__m128i)__builtin_ia32_phsubw128((__v8hi)a, (__v8hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_hsub_epi32(__m128i a, __m128i b) _mm_hsub_epi32(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_phsubd128((__v4si)a, (__v4si)b); return (__m128i)__builtin_ia32_phsubd128((__v4si)a, (__v4si)b);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_hsub_pi16(__m64 a, __m64 b) _mm_hsub_pi16(__m64 a, __m64 b)
{ {
return (__m64)__builtin_ia32_phsubw((__v4hi)a, (__v4hi)b); return (__m64)__builtin_ia32_phsubw((__v4hi)a, (__v4hi)b);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_hsub_pi32(__m64 a, __m64 b) _mm_hsub_pi32(__m64 a, __m64 b)
{ {
return (__m64)__builtin_ia32_phsubd((__v2si)a, (__v2si)b); return (__m64)__builtin_ia32_phsubd((__v2si)a, (__v2si)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_hsubs_epi16(__m128i a, __m128i b) _mm_hsubs_epi16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_phsubsw128((__v8hi)a, (__v8hi)b); return (__m128i)__builtin_ia32_phsubsw128((__v8hi)a, (__v8hi)b);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_hsubs_pi16(__m64 a, __m64 b) _mm_hsubs_pi16(__m64 a, __m64 b)
{ {
return (__m64)__builtin_ia32_phsubsw((__v4hi)a, (__v4hi)b); return (__m64)__builtin_ia32_phsubsw((__v4hi)a, (__v4hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_maddubs_epi16(__m128i a, __m128i b) _mm_maddubs_epi16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)a, (__v16qi)b); return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)a, (__v16qi)b);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_maddubs_pi16(__m64 a, __m64 b) _mm_maddubs_pi16(__m64 a, __m64 b)
{ {
return (__m64)__builtin_ia32_pmaddubsw((__v8qi)a, (__v8qi)b); return (__m64)__builtin_ia32_pmaddubsw((__v8qi)a, (__v8qi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhrs_epi16(__m128i a, __m128i b) _mm_mulhrs_epi16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)a, (__v8hi)b); return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)a, (__v8hi)b);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mulhrs_pi16(__m64 a, __m64 b) _mm_mulhrs_pi16(__m64 a, __m64 b)
{ {
return (__m64)__builtin_ia32_pmulhrsw((__v4hi)a, (__v4hi)b); return (__m64)__builtin_ia32_pmulhrsw((__v4hi)a, (__v4hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_shuffle_epi8(__m128i a, __m128i b) _mm_shuffle_epi8(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_pshufb128((__v16qi)a, (__v16qi)b); return (__m128i)__builtin_ia32_pshufb128((__v16qi)a, (__v16qi)b);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_shuffle_pi8(__m64 a, __m64 b) _mm_shuffle_pi8(__m64 a, __m64 b)
{ {
return (__m64)__builtin_ia32_pshufb((__v8qi)a, (__v8qi)b); return (__m64)__builtin_ia32_pshufb((__v8qi)a, (__v8qi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sign_epi8(__m128i a, __m128i b) _mm_sign_epi8(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_psignb128((__v16qi)a, (__v16qi)b); return (__m128i)__builtin_ia32_psignb128((__v16qi)a, (__v16qi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sign_epi16(__m128i a, __m128i b) _mm_sign_epi16(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_psignw128((__v8hi)a, (__v8hi)b); return (__m128i)__builtin_ia32_psignw128((__v8hi)a, (__v8hi)b);
} }
static inline __m128i __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sign_epi32(__m128i a, __m128i b) _mm_sign_epi32(__m128i a, __m128i b)
{ {
return (__m128i)__builtin_ia32_psignd128((__v4si)a, (__v4si)b); return (__m128i)__builtin_ia32_psignd128((__v4si)a, (__v4si)b);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sign_pi8(__m64 a, __m64 b) _mm_sign_pi8(__m64 a, __m64 b)
{ {
return (__m64)__builtin_ia32_psignb((__v8qi)a, (__v8qi)b); return (__m64)__builtin_ia32_psignb((__v8qi)a, (__v8qi)b);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sign_pi16(__m64 a, __m64 b) _mm_sign_pi16(__m64 a, __m64 b)
{ {
return (__m64)__builtin_ia32_psignw((__v4hi)a, (__v4hi)b); return (__m64)__builtin_ia32_psignw((__v4hi)a, (__v4hi)b);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sign_pi32(__m64 a, __m64 b) _mm_sign_pi32(__m64 a, __m64 b)
{ {
return (__m64)__builtin_ia32_psignd((__v2si)a, (__v2si)b); return (__m64)__builtin_ia32_psignd((__v2si)a, (__v2si)b);
......
...@@ -36,365 +36,365 @@ typedef float __m128 __attribute__((__vector_size__(16))); ...@@ -36,365 +36,365 @@ typedef float __m128 __attribute__((__vector_size__(16)));
#include <mm_malloc.h> #include <mm_malloc.h>
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 a, __m128 b) _mm_add_ss(__m128 a, __m128 b)
{ {
a[0] += b[0]; a[0] += b[0];
return a; return a;
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 a, __m128 b) _mm_add_ps(__m128 a, __m128 b)
{ {
return a + b; return a + b;
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 a, __m128 b) _mm_sub_ss(__m128 a, __m128 b)
{ {
a[0] -= b[0]; a[0] -= b[0];
return a; return a;
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 a, __m128 b) _mm_sub_ps(__m128 a, __m128 b)
{ {
return a - b; return a - b;
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 a, __m128 b) _mm_mul_ss(__m128 a, __m128 b)
{ {
a[0] *= b[0]; a[0] *= b[0];
return a; return a;
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 a, __m128 b) _mm_mul_ps(__m128 a, __m128 b)
{ {
return a * b; return a * b;
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 a, __m128 b) _mm_div_ss(__m128 a, __m128 b)
{ {
a[0] /= b[0]; a[0] /= b[0];
return a; return a;
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 a, __m128 b) _mm_div_ps(__m128 a, __m128 b)
{ {
return a / b; return a / b;
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 a) _mm_sqrt_ss(__m128 a)
{ {
return __builtin_ia32_sqrtss(a); return __builtin_ia32_sqrtss(a);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 a) _mm_sqrt_ps(__m128 a)
{ {
return __builtin_ia32_sqrtps(a); return __builtin_ia32_sqrtps(a);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ss(__m128 a) _mm_rcp_ss(__m128 a)
{ {
return __builtin_ia32_rcpss(a); return __builtin_ia32_rcpss(a);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ps(__m128 a) _mm_rcp_ps(__m128 a)
{ {
return __builtin_ia32_rcpps(a); return __builtin_ia32_rcpps(a);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 a) _mm_rsqrt_ss(__m128 a)
{ {
return __builtin_ia32_rsqrtss(a); return __builtin_ia32_rsqrtss(a);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ps(__m128 a) _mm_rsqrt_ps(__m128 a)
{ {
return __builtin_ia32_rsqrtps(a); return __builtin_ia32_rsqrtps(a);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 a, __m128 b) _mm_min_ss(__m128 a, __m128 b)
{ {
return __builtin_ia32_minss(a, b); return __builtin_ia32_minss(a, b);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 a, __m128 b) _mm_min_ps(__m128 a, __m128 b)
{ {
return __builtin_ia32_minps(a, b); return __builtin_ia32_minps(a, b);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 a, __m128 b) _mm_max_ss(__m128 a, __m128 b)
{ {
return __builtin_ia32_maxss(a, b); return __builtin_ia32_maxss(a, b);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 a, __m128 b) _mm_max_ps(__m128 a, __m128 b)
{ {
return __builtin_ia32_maxps(a, b); return __builtin_ia32_maxps(a, b);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 a, __m128 b) _mm_and_ps(__m128 a, __m128 b)
{ {
return (__m128)((__v4si)a & (__v4si)b); return (__m128)((__v4si)a & (__v4si)b);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 a, __m128 b) _mm_andnot_ps(__m128 a, __m128 b)
{ {
return (__m128)(~(__v4si)a & (__v4si)b); return (__m128)(~(__v4si)a & (__v4si)b);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 a, __m128 b) _mm_or_ps(__m128 a, __m128 b)
{ {
return (__m128)((__v4si)a | (__v4si)b); return (__m128)((__v4si)a | (__v4si)b);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 a, __m128 b) _mm_xor_ps(__m128 a, __m128 b)
{ {
return (__m128)((__v4si)a ^ (__v4si)b); return (__m128)((__v4si)a ^ (__v4si)b);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 a, __m128 b) _mm_cmpeq_ss(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpss(a, b, 0); return (__m128)__builtin_ia32_cmpss(a, b, 0);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 a, __m128 b) _mm_cmpeq_ps(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpps(a, b, 0); return (__m128)__builtin_ia32_cmpps(a, b, 0);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 a, __m128 b) _mm_cmplt_ss(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpss(a, b, 1); return (__m128)__builtin_ia32_cmpss(a, b, 1);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 a, __m128 b) _mm_cmplt_ps(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpps(a, b, 1); return (__m128)__builtin_ia32_cmpps(a, b, 1);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 a, __m128 b) _mm_cmple_ss(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpss(a, b, 2); return (__m128)__builtin_ia32_cmpss(a, b, 2);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 a, __m128 b) _mm_cmple_ps(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpps(a, b, 2); return (__m128)__builtin_ia32_cmpps(a, b, 2);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ss(__m128 a, __m128 b) _mm_cmpgt_ss(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpss(b, a, 1); return (__m128)__builtin_ia32_cmpss(b, a, 1);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 a, __m128 b) _mm_cmpgt_ps(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpps(b, a, 1); return (__m128)__builtin_ia32_cmpps(b, a, 1);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ss(__m128 a, __m128 b) _mm_cmpge_ss(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpss(b, a, 2); return (__m128)__builtin_ia32_cmpss(b, a, 2);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 a, __m128 b) _mm_cmpge_ps(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpps(b, a, 2); return (__m128)__builtin_ia32_cmpps(b, a, 2);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 a, __m128 b) _mm_cmpneq_ss(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpss(a, b, 4); return (__m128)__builtin_ia32_cmpss(a, b, 4);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 a, __m128 b) _mm_cmpneq_ps(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpps(a, b, 4); return (__m128)__builtin_ia32_cmpps(a, b, 4);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 a, __m128 b) _mm_cmpnlt_ss(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpss(a, b, 5); return (__m128)__builtin_ia32_cmpss(a, b, 5);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 a, __m128 b) _mm_cmpnlt_ps(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpps(a, b, 5); return (__m128)__builtin_ia32_cmpps(a, b, 5);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 a, __m128 b) _mm_cmpnle_ss(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpss(a, b, 6); return (__m128)__builtin_ia32_cmpss(a, b, 6);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 a, __m128 b) _mm_cmpnle_ps(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpps(a, b, 6); return (__m128)__builtin_ia32_cmpps(a, b, 6);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ss(__m128 a, __m128 b) _mm_cmpngt_ss(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpss(b, a, 5); return (__m128)__builtin_ia32_cmpss(b, a, 5);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 a, __m128 b) _mm_cmpngt_ps(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpps(b, a, 5); return (__m128)__builtin_ia32_cmpps(b, a, 5);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ss(__m128 a, __m128 b) _mm_cmpnge_ss(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpss(b, a, 6); return (__m128)__builtin_ia32_cmpss(b, a, 6);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 a, __m128 b) _mm_cmpnge_ps(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpps(b, a, 6); return (__m128)__builtin_ia32_cmpps(b, a, 6);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ss(__m128 a, __m128 b) _mm_cmpord_ss(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpss(a, b, 7); return (__m128)__builtin_ia32_cmpss(a, b, 7);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ps(__m128 a, __m128 b) _mm_cmpord_ps(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpps(a, b, 7); return (__m128)__builtin_ia32_cmpps(a, b, 7);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ss(__m128 a, __m128 b) _mm_cmpunord_ss(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpss(a, b, 3); return (__m128)__builtin_ia32_cmpss(a, b, 3);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ps(__m128 a, __m128 b) _mm_cmpunord_ps(__m128 a, __m128 b)
{ {
return (__m128)__builtin_ia32_cmpps(a, b, 3); return (__m128)__builtin_ia32_cmpps(a, b, 3);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_ss(__m128 a, __m128 b) _mm_comieq_ss(__m128 a, __m128 b)
{ {
return __builtin_ia32_comieq(a, b); return __builtin_ia32_comieq(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_ss(__m128 a, __m128 b) _mm_comilt_ss(__m128 a, __m128 b)
{ {
return __builtin_ia32_comilt(a, b); return __builtin_ia32_comilt(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_ss(__m128 a, __m128 b) _mm_comile_ss(__m128 a, __m128 b)
{ {
return __builtin_ia32_comile(a, b); return __builtin_ia32_comile(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_ss(__m128 a, __m128 b) _mm_comigt_ss(__m128 a, __m128 b)
{ {
return __builtin_ia32_comigt(a, b); return __builtin_ia32_comigt(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_ss(__m128 a, __m128 b) _mm_comige_ss(__m128 a, __m128 b)
{ {
return __builtin_ia32_comige(a, b); return __builtin_ia32_comige(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_ss(__m128 a, __m128 b) _mm_comineq_ss(__m128 a, __m128 b)
{ {
return __builtin_ia32_comineq(a, b); return __builtin_ia32_comineq(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_ss(__m128 a, __m128 b) _mm_ucomieq_ss(__m128 a, __m128 b)
{ {
return __builtin_ia32_ucomieq(a, b); return __builtin_ia32_ucomieq(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_ss(__m128 a, __m128 b) _mm_ucomilt_ss(__m128 a, __m128 b)
{ {
return __builtin_ia32_ucomilt(a, b); return __builtin_ia32_ucomilt(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_ss(__m128 a, __m128 b) _mm_ucomile_ss(__m128 a, __m128 b)
{ {
return __builtin_ia32_ucomile(a, b); return __builtin_ia32_ucomile(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_ss(__m128 a, __m128 b) _mm_ucomigt_ss(__m128 a, __m128 b)
{ {
return __builtin_ia32_ucomigt(a, b); return __builtin_ia32_ucomigt(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_ss(__m128 a, __m128 b) _mm_ucomige_ss(__m128 a, __m128 b)
{ {
return __builtin_ia32_ucomige(a, b); return __builtin_ia32_ucomige(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_ss(__m128 a, __m128 b) _mm_ucomineq_ss(__m128 a, __m128 b)
{ {
return __builtin_ia32_ucomineq(a, b); return __builtin_ia32_ucomineq(a, b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si32(__m128 a) _mm_cvtss_si32(__m128 a)
{ {
return __builtin_ia32_cvtss2si(a); return __builtin_ia32_cvtss2si(a);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ss2si(__m128 a) _mm_cvt_ss2si(__m128 a)
{ {
return _mm_cvtss_si32(a); return _mm_cvtss_si32(a);
...@@ -402,7 +402,7 @@ _mm_cvt_ss2si(__m128 a) ...@@ -402,7 +402,7 @@ _mm_cvt_ss2si(__m128 a)
#ifdef __x86_64__ #ifdef __x86_64__
static inline long long __attribute__((__always_inline__, __nodebug__)) static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 a) _mm_cvtss_si64(__m128 a)
{ {
return __builtin_ia32_cvtss2si64(a); return __builtin_ia32_cvtss2si64(a);
...@@ -410,37 +410,37 @@ _mm_cvtss_si64(__m128 a) ...@@ -410,37 +410,37 @@ _mm_cvtss_si64(__m128 a)
#endif #endif
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 a) _mm_cvtps_pi32(__m128 a)
{ {
return (__m64)__builtin_ia32_cvtps2pi(a); return (__m64)__builtin_ia32_cvtps2pi(a);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 a) _mm_cvttss_si32(__m128 a)
{ {
return a[0]; return a[0];
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ss2si(__m128 a) _mm_cvtt_ss2si(__m128 a)
{ {
return _mm_cvttss_si32(a); return _mm_cvttss_si32(a);
} }
static inline long long __attribute__((__always_inline__, __nodebug__)) static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 a) _mm_cvttss_si64(__m128 a)
{ {
return a[0]; return a[0];
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_pi32(__m128 a) _mm_cvttps_pi32(__m128 a)
{ {
return (__m64)__builtin_ia32_cvttps2pi(a); return (__m64)__builtin_ia32_cvttps2pi(a);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 a, int b) _mm_cvtsi32_ss(__m128 a, int b)
{ {
a[0] = b; a[0] = b;
...@@ -449,7 +449,7 @@ _mm_cvtsi32_ss(__m128 a, int b) ...@@ -449,7 +449,7 @@ _mm_cvtsi32_ss(__m128 a, int b)
#ifdef __x86_64__ #ifdef __x86_64__
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 a, long long b) _mm_cvtsi64_ss(__m128 a, long long b)
{ {
a[0] = b; a[0] = b;
...@@ -458,19 +458,19 @@ _mm_cvtsi64_ss(__m128 a, long long b) ...@@ -458,19 +458,19 @@ _mm_cvtsi64_ss(__m128 a, long long b)
#endif #endif
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_ps(__m128 a, __m64 b) _mm_cvtpi32_ps(__m128 a, __m64 b)
{ {
return __builtin_ia32_cvtpi2ps(a, (__v2si)b); return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
} }
static inline float __attribute__((__always_inline__, __nodebug__)) static __inline__ float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 a) _mm_cvtss_f32(__m128 a)
{ {
return a[0]; return a[0];
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 a, const __m64 *p) _mm_loadh_pi(__m128 a, const __m64 *p)
{ {
__m128 b; __m128 b;
...@@ -479,7 +479,7 @@ _mm_loadh_pi(__m128 a, const __m64 *p) ...@@ -479,7 +479,7 @@ _mm_loadh_pi(__m128 a, const __m64 *p)
return __builtin_shufflevector(a, b, 0, 1, 4, 5); return __builtin_shufflevector(a, b, 0, 1, 4, 5);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 a, const __m64 *p) _mm_loadl_pi(__m128 a, const __m64 *p)
{ {
__m128 b; __m128 b;
...@@ -488,13 +488,13 @@ _mm_loadl_pi(__m128 a, const __m64 *p) ...@@ -488,13 +488,13 @@ _mm_loadl_pi(__m128 a, const __m64 *p)
return __builtin_shufflevector(a, b, 4, 5, 2, 3); return __builtin_shufflevector(a, b, 4, 5, 2, 3);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ss(const float *p) _mm_load_ss(const float *p)
{ {
return (__m128){ *p, 0, 0, 0 }; return (__m128){ *p, 0, 0, 0 };
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load1_ps(const float *p) _mm_load1_ps(const float *p)
{ {
return (__m128){ *p, *p, *p, *p }; return (__m128){ *p, *p, *p, *p };
...@@ -502,100 +502,100 @@ _mm_load1_ps(const float *p) ...@@ -502,100 +502,100 @@ _mm_load1_ps(const float *p)
#define _mm_load_ps1(p) _mm_load1_ps(p) #define _mm_load_ps1(p) _mm_load1_ps(p)
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps(const float *p) _mm_load_ps(const float *p)
{ {
return *(__m128*)p; return *(__m128*)p;
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(const float *p) _mm_loadu_ps(const float *p)
{ {
return __builtin_ia32_loadups(p); return __builtin_ia32_loadups(p);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadr_ps(const float *p) _mm_loadr_ps(const float *p)
{ {
__m128 a = _mm_load_ps(p); __m128 a = _mm_load_ps(p);
return __builtin_shufflevector(a, a, 3, 2, 1, 0); return __builtin_shufflevector(a, a, 3, 2, 1, 0);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ss(float w) _mm_set_ss(float w)
{ {
return (__m128){ w, 0, 0, 0 }; return (__m128){ w, 0, 0, 0 };
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set1_ps(float w) _mm_set1_ps(float w)
{ {
return (__m128){ w, w, w, w }; return (__m128){ w, w, w, w };
} }
// Microsoft specific. // Microsoft specific.
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps1(float w) _mm_set_ps1(float w)
{ {
return _mm_set1_ps(w); return _mm_set1_ps(w);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps(float z, float y, float x, float w) _mm_set_ps(float z, float y, float x, float w)
{ {
return (__m128){ w, x, y, z }; return (__m128){ w, x, y, z };
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setr_ps(float z, float y, float x, float w) _mm_setr_ps(float z, float y, float x, float w)
{ {
return (__m128){ z, y, x, w }; return (__m128){ z, y, x, w };
} }
static inline __m128 __attribute__((__always_inline__)) static __inline__ __m128 __attribute__((__always_inline__))
_mm_setzero_ps(void) _mm_setzero_ps(void)
{ {
return (__m128){ 0, 0, 0, 0 }; return (__m128){ 0, 0, 0, 0 };
} }
static inline void __attribute__((__always_inline__)) static __inline__ void __attribute__((__always_inline__))
_mm_storeh_pi(__m64 *p, __m128 a) _mm_storeh_pi(__m64 *p, __m128 a)
{ {
__builtin_ia32_storehps((__v2si *)p, a); __builtin_ia32_storehps((__v2si *)p, a);
} }
static inline void __attribute__((__always_inline__)) static __inline__ void __attribute__((__always_inline__))
_mm_storel_pi(__m64 *p, __m128 a) _mm_storel_pi(__m64 *p, __m128 a)
{ {
__builtin_ia32_storelps((__v2si *)p, a); __builtin_ia32_storelps((__v2si *)p, a);
} }
static inline void __attribute__((__always_inline__)) static __inline__ void __attribute__((__always_inline__))
_mm_store_ss(float *p, __m128 a) _mm_store_ss(float *p, __m128 a)
{ {
*p = a[0]; *p = a[0];
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_ps(float *p, __m128 a) _mm_storeu_ps(float *p, __m128 a)
{ {
__builtin_ia32_storeups(p, a); __builtin_ia32_storeups(p, a);
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store1_ps(float *p, __m128 a) _mm_store1_ps(float *p, __m128 a)
{ {
a = __builtin_shufflevector(a, a, 0, 0, 0, 0); a = __builtin_shufflevector(a, a, 0, 0, 0, 0);
_mm_storeu_ps(p, a); _mm_storeu_ps(p, a);
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps(float *p, __m128 a) _mm_store_ps(float *p, __m128 a)
{ {
*(__m128 *)p = a; *(__m128 *)p = a;
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_storer_ps(float *p, __m128 a) _mm_storer_ps(float *p, __m128 a)
{ {
a = __builtin_shufflevector(a, a, 3, 2, 1, 0); a = __builtin_shufflevector(a, a, 3, 2, 1, 0);
...@@ -612,32 +612,32 @@ _mm_storer_ps(float *p, __m128 a) ...@@ -612,32 +612,32 @@ _mm_storer_ps(float *p, __m128 a)
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)a, 0, sel)) #define _mm_prefetch(a, sel) (__builtin_prefetch((void *)a, 0, sel))
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pi(__m64 *p, __m64 a) _mm_stream_pi(__m64 *p, __m64 a)
{ {
__builtin_ia32_movntq(p, a); __builtin_ia32_movntq(p, a);
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_ps(float *p, __m128 a) _mm_stream_ps(float *p, __m128 a)
{ {
__builtin_ia32_movntps(p, a); __builtin_ia32_movntps(p, a);
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void) _mm_sfence(void)
{ {
__builtin_ia32_sfence(); __builtin_ia32_sfence();
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_extract_pi16(__m64 a, int n) _mm_extract_pi16(__m64 a, int n)
{ {
__v4hi b = (__v4hi)a; __v4hi b = (__v4hi)a;
return (unsigned short)b[n & 3]; return (unsigned short)b[n & 3];
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_insert_pi16(__m64 a, int d, int n) _mm_insert_pi16(__m64 a, int d, int n)
{ {
__v4hi b = (__v4hi)a; __v4hi b = (__v4hi)a;
...@@ -645,37 +645,37 @@ _mm_insert_pi16(__m64 a, int d, int n) ...@@ -645,37 +645,37 @@ _mm_insert_pi16(__m64 a, int d, int n)
return (__m64)b; return (__m64)b;
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pi16(__m64 a, __m64 b) _mm_max_pi16(__m64 a, __m64 b)
{ {
return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b); return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pu8(__m64 a, __m64 b) _mm_max_pu8(__m64 a, __m64 b)
{ {
return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b); return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pi16(__m64 a, __m64 b) _mm_min_pi16(__m64 a, __m64 b)
{ {
return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b); return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pu8(__m64 a, __m64 b) _mm_min_pu8(__m64 a, __m64 b)
{ {
return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b); return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pi8(__m64 a) _mm_movemask_pi8(__m64 a)
{ {
return __builtin_ia32_pmovmskb((__v8qi)a); return __builtin_ia32_pmovmskb((__v8qi)a);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_pu16(__m64 a, __m64 b) _mm_mulhi_pu16(__m64 a, __m64 b)
{ {
return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b); return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b);
...@@ -686,37 +686,37 @@ _mm_mulhi_pu16(__m64 a, __m64 b) ...@@ -686,37 +686,37 @@ _mm_mulhi_pu16(__m64 a, __m64 b)
(n) & 0x3, ((n) & 0xc) >> 2, \ (n) & 0x3, ((n) & 0xc) >> 2, \
((n) & 0x30) >> 4, ((n) & 0xc0) >> 6)) ((n) & 0x30) >> 4, ((n) & 0xc0) >> 6))
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_maskmove_si64(__m64 d, __m64 n, char *p) _mm_maskmove_si64(__m64 d, __m64 n, char *p)
{ {
__builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p); __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu8(__m64 a, __m64 b) _mm_avg_pu8(__m64 a, __m64 b)
{ {
return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b); return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu16(__m64 a, __m64 b) _mm_avg_pu16(__m64 a, __m64 b)
{ {
return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b); return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sad_pu8(__m64 a, __m64 b) _mm_sad_pu8(__m64 a, __m64 b)
{ {
return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b); return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b);
} }
static inline unsigned int __attribute__((__always_inline__, __nodebug__)) static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void) _mm_getcsr(void)
{ {
return __builtin_ia32_stmxcsr(); return __builtin_ia32_stmxcsr();
} }
static inline void __attribute__((__always_inline__, __nodebug__)) static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int i) _mm_setcsr(unsigned int i)
{ {
__builtin_ia32_ldmxcsr(i); __builtin_ia32_ldmxcsr(i);
...@@ -727,37 +727,37 @@ _mm_setcsr(unsigned int i) ...@@ -727,37 +727,37 @@ _mm_setcsr(unsigned int i)
(((mask) & 0x30) >> 4) + 4, \ (((mask) & 0x30) >> 4) + 4, \
(((mask) & 0xc0) >> 6) + 4)) (((mask) & 0xc0) >> 6) + 4))
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_ps(__m128 a, __m128 b) _mm_unpackhi_ps(__m128 a, __m128 b)
{ {
return __builtin_shufflevector(a, b, 2, 6, 3, 7); return __builtin_shufflevector(a, b, 2, 6, 3, 7);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_ps(__m128 a, __m128 b) _mm_unpacklo_ps(__m128 a, __m128 b)
{ {
return __builtin_shufflevector(a, b, 0, 4, 1, 5); return __builtin_shufflevector(a, b, 0, 4, 1, 5);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_move_ss(__m128 a, __m128 b) _mm_move_ss(__m128 a, __m128 b)
{ {
return __builtin_shufflevector(a, b, 4, 1, 2, 3); return __builtin_shufflevector(a, b, 4, 1, 2, 3);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(__m128 a, __m128 b) _mm_movehl_ps(__m128 a, __m128 b)
{ {
return __builtin_shufflevector(a, b, 6, 7, 2, 3); return __builtin_shufflevector(a, b, 6, 7, 2, 3);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(__m128 a, __m128 b) _mm_movelh_ps(__m128 a, __m128 b)
{ {
return __builtin_shufflevector(a, b, 0, 1, 4, 5); return __builtin_shufflevector(a, b, 0, 1, 4, 5);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi16_ps(__m64 a) _mm_cvtpi16_ps(__m64 a)
{ {
__m64 b, c; __m64 b, c;
...@@ -775,7 +775,7 @@ _mm_cvtpi16_ps(__m64 a) ...@@ -775,7 +775,7 @@ _mm_cvtpi16_ps(__m64 a)
return r; return r;
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu16_ps(__m64 a) _mm_cvtpu16_ps(__m64 a)
{ {
__m64 b, c; __m64 b, c;
...@@ -792,7 +792,7 @@ _mm_cvtpu16_ps(__m64 a) ...@@ -792,7 +792,7 @@ _mm_cvtpu16_ps(__m64 a)
return r; return r;
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi8_ps(__m64 a) _mm_cvtpi8_ps(__m64 a)
{ {
__m64 b; __m64 b;
...@@ -804,7 +804,7 @@ _mm_cvtpi8_ps(__m64 a) ...@@ -804,7 +804,7 @@ _mm_cvtpi8_ps(__m64 a)
return _mm_cvtpi16_ps(b); return _mm_cvtpi16_ps(b);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu8_ps(__m64 a) _mm_cvtpu8_ps(__m64 a)
{ {
__m64 b; __m64 b;
...@@ -815,7 +815,7 @@ _mm_cvtpu8_ps(__m64 a) ...@@ -815,7 +815,7 @@ _mm_cvtpu8_ps(__m64 a)
return _mm_cvtpi16_ps(b); return _mm_cvtpi16_ps(b);
} }
static inline __m128 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32x2_ps(__m64 a, __m64 b) _mm_cvtpi32x2_ps(__m64 a, __m64 b)
{ {
__m128 c; __m128 c;
...@@ -827,7 +827,7 @@ _mm_cvtpi32x2_ps(__m64 a, __m64 b) ...@@ -827,7 +827,7 @@ _mm_cvtpi32x2_ps(__m64 a, __m64 b)
return _mm_cvtpi32_ps(c, a); return _mm_cvtpi32_ps(c, a);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi16(__m128 a) _mm_cvtps_pi16(__m128 a)
{ {
__m64 b, c; __m64 b, c;
...@@ -839,7 +839,7 @@ _mm_cvtps_pi16(__m128 a) ...@@ -839,7 +839,7 @@ _mm_cvtps_pi16(__m128 a)
return _mm_packs_pi16(b, c); return _mm_packs_pi16(b, c);
} }
static inline __m64 __attribute__((__always_inline__, __nodebug__)) static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi8(__m128 a) _mm_cvtps_pi8(__m128 a)
{ {
__m64 b, c; __m64 b, c;
...@@ -850,7 +850,7 @@ _mm_cvtps_pi8(__m128 a) ...@@ -850,7 +850,7 @@ _mm_cvtps_pi8(__m128 a)
return _mm_packs_pi16(b, c); return _mm_packs_pi16(b, c);
} }
static inline int __attribute__((__always_inline__, __nodebug__)) static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_ps(__m128 a) _mm_movemask_ps(__m128 a)
{ {
return __builtin_ia32_movmskps(a); return __builtin_ia32_movmskps(a);
......
// RUN: %clang -triple=i386-apple-darwin10 -fsyntax-only -verify -std=c89 %s
// PR6658
#include <xmmintrin.h>
0% Loading Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment