From ead3450ec77c46a2f32365ffaf3742fbc653e414 Mon Sep 17 00:00:00 2001
From: Attila Kocsis <kocsisa78@gmail.com>
Date: Sun, 19 Jan 2020 12:57:19 +0100
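Subject: [PATCH] Enable ARM NEON SIMD path when building with Clang

Use __builtin_shufflevector for the swizzle/shuffle helpers on Clang (other
compilers keep __builtin_shuffle), compile out the __builtin_constant_p fast
paths of simd_sll/simd_srl/simd_sra on Clang, and change the simd_shuf_AxBy
lane order from { 1, 5, 0, 4 } to 4, 0, 5, 1 so it matches the function name.
simd_t.h now enables BX_SIMD_NEON on Clang when the
attribute_ext_vector_type extension is available.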

---
 include/bx/inline/simd128_neon.inl | 46 ++++++++++++++++++++++--------
 include/bx/simd_t.h                |  2 +-
 2 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/include/bx/inline/simd128_neon.inl b/include/bx/inline/simd128_neon.inl
index 7d0c0d8..5dfce0d 100644
--- a/include/bx/inline/simd128_neon.inl
+++ b/include/bx/inline/simd128_neon.inl
@@ -9,6 +9,23 @@
 
 namespace bx
 {
+#if BX_COMPILER_CLANG // Clang path: lane indices are passed as separate arguments to __builtin_shufflevector
+
+#define SHUFFLE_A(_a,  _i0, _i1, _i2, _i3)	\
+__builtin_shufflevector(_a, _a, _i0, _i1, _i2, _i3 )
+#define SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3)	\
+__builtin_shufflevector(_a, _b, _i0, _i1, _i2, _i3 )
+	
+#else // !BX_COMPILER_CLANG: lane indices are packed into a uint32x4_t mask for __builtin_shuffle
+
+#define SHUFFLE_A(_a,  _i0, _i1, _i2, _i3)	\
+__builtin_shuffle(_a, (uint32x4_t){ _i0, _i1, _i2, _i3 })
+#define SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3)	\
+__builtin_shuffle(_a, _b, (uint32x4_t){ _i0, _i1, _i2, _i3 })
+
+#endif // BX_COMPILER_CLANG
+
+	
 #define ELEMx 0
 #define ELEMy 1
 #define ELEMz 2
@@ -17,7 +34,7 @@ namespace bx
 			template<>                                                                                \
 			BX_SIMD_FORCE_INLINE simd128_neon_t simd_swiz_##_x##_y##_z##_w(simd128_neon_t _a)         \
 			{                                                                                         \
-				return __builtin_shuffle(_a, (uint32x4_t){ ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w }); \
+				return SHUFFLE_A(_a, ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w ); \
 			}
 
 #include "simd128_swizzle.inl"
@@ -74,50 +91,52 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_xyAB(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 1, 4, 5 });
+		return SHUFFLE_AB(_a, _b, 0, 1, 4, 5 ); // 0,1 -> x,y (_a); 4,5 -> A,B (_b)
 	}
 
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_ABxy(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 4, 5, 0, 1 });
+		return SHUFFLE_AB(_a, _b, 4, 5, 0, 1 ); // 4,5 -> A,B (_b); 0,1 -> x,y (_a)
 	}
 
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_CDzw(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 7, 2, 3 });
+		return SHUFFLE_AB(_a, _b, 6, 7, 2, 3 ); // 6,7 -> C,D (_b); 2,3 -> z,w (_a)
 	}
 
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_zwCD(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 3, 6, 7 });
+		return SHUFFLE_AB(_a, _b, 2, 3, 6, 7 ); // 2,3 -> z,w (_a); 6,7 -> C,D (_b)
 	}
 
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_xAyB(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 4, 1, 5 });
+		return SHUFFLE_AB(_a, _b, 0, 4, 1, 5 ); // x, A, y, B: _a lanes 0,1 interleaved with _b lanes 4,5
 	}
 
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_AxBy(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 1, 5, 0, 4 });
+		return SHUFFLE_AB(_a, _b, 4, 0, 5, 1 ); // A, x, B, y; replaces the old { 1, 5, 0, 4 } mask, which did not match this order
 	}
 
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_zCwD(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 6, 3, 7 });
+		return SHUFFLE_AB(_a, _b, 2, 6, 3, 7 ); // z, C, w, D: _a lanes 2,3 interleaved with _b lanes 6,7
 	}
 
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_CzDw(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 2, 7, 3 });
+		return SHUFFLE_AB(_a, _b, 6, 2, 7, 3 ); // C, z, D, w: _b lanes 6,7 interleaved with _a lanes 2,3
 	}
+#undef SHUFFLE_A
+#undef SHUFFLE_AB
 
 	template<>
 	BX_SIMD_FORCE_INLINE float simd_x(simd128_neon_t _a)
@@ -367,6 +386,7 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_sll(simd128_neon_t _a, int _count)
 	{
+#if !BX_COMPILER_CLANG
 		if (__builtin_constant_p(_count) )
 		{
 			const uint32x4_t tmp0   = vreinterpretq_u32_f32(_a);
@@ -375,7 +395,7 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 
 			return result;
 		}
-
+#endif
 		const uint32x4_t tmp0   = vreinterpretq_u32_f32(_a);
 		const int32x4_t  shift  = vdupq_n_s32(_count);
 		const uint32x4_t tmp1   = vshlq_u32(tmp0, shift);
@@ -387,6 +407,7 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_srl(simd128_neon_t _a, int _count)
 	{
+#if !BX_COMPILER_CLANG
 		if (__builtin_constant_p(_count) )
 		{
 			const uint32x4_t tmp0   = vreinterpretq_u32_f32(_a);
@@ -395,7 +416,7 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 
 			return result;
 		}
-
+#endif
 		const uint32x4_t tmp0   = vreinterpretq_u32_f32(_a);
 		const int32x4_t  shift  = vdupq_n_s32(-_count);
 		const uint32x4_t tmp1   = vshlq_u32(tmp0, shift);
@@ -407,6 +428,7 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_sra(simd128_neon_t _a, int _count)
 	{
+#if !BX_COMPILER_CLANG
 		if (__builtin_constant_p(_count) )
 		{
 			const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
@@ -415,7 +437,7 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 
 			return result;
 		}
-
+#endif
 		const int32x4_t tmp0   = vreinterpretq_s32_f32(_a);
 		const int32x4_t shift  = vdupq_n_s32(-_count);
 		const int32x4_t tmp1   = vshlq_s32(tmp0, shift);
diff --git a/include/bx/simd_t.h b/include/bx/simd_t.h
index c5e048c..0b07567 100644
--- a/include/bx/simd_t.h
+++ b/include/bx/simd_t.h
@@ -32,7 +32,7 @@
 #	include <xmmintrin.h> // __m128
 #	undef  BX_SIMD_SSE
 #	define BX_SIMD_SSE 1
-#elif defined(__ARM_NEON__) && !BX_COMPILER_CLANG
+#elif defined(__ARM_NEON__) && (!BX_COMPILER_CLANG || BX_CLANG_HAS_EXTENSION(attribute_ext_vector_type) )
 #	include <arm_neon.h>
 #	undef  BX_SIMD_NEON
 #	define BX_SIMD_NEON 1