From ead3450ec77c46a2f32365ffaf3742fbc653e414 Mon Sep 17 00:00:00 2001 From: Attila Kocsis Date: Sun, 19 Jan 2020 12:57:19 +0100 Subject: [PATCH] arm_neon is enabled on clang compiler --- include/bx/inline/simd128_neon.inl | 46 ++++++++++++++++++++++-------- include/bx/simd_t.h | 2 +- 2 files changed, 35 insertions(+), 13 deletions(-) diff --git a/include/bx/inline/simd128_neon.inl b/include/bx/inline/simd128_neon.inl index 7d0c0d8..5dfce0d 100644 --- a/include/bx/inline/simd128_neon.inl +++ b/include/bx/inline/simd128_neon.inl @@ -9,6 +9,23 @@ namespace bx { +#if BX_COMPILER_CLANG + +#define SHUFFLE_A(_a, _i0, _i1, _i2, _i3) \ +__builtin_shufflevector(_a, _a, _i0, _i1, _i2, _i3 ) +#define SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3) \ +__builtin_shufflevector(_a, _b, _i0, _i1, _i2, _i3 ) + +#else + +#define SHUFFLE_A(_a, _i0, _i1, _i2, _i3) \ +__builtin_shuffle(_a, (uint32x4_t){ _i0, _i1, _i2, _i3 }) +#define SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3) \ +__builtin_shuffle(_a, _b, (uint32x4_t){ _i0, _i1, _i2, _i3 }) + +#endif + + #define ELEMx 0 #define ELEMy 1 #define ELEMz 2 @@ -17,7 +34,7 @@ namespace bx template<> \ BX_SIMD_FORCE_INLINE simd128_neon_t simd_swiz_##_x##_y##_z##_w(simd128_neon_t _a) \ { \ - return __builtin_shuffle(_a, (uint32x4_t){ ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w }); \ + return SHUFFLE_A(_a, ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w ); \ } #include "simd128_swizzle.inl" @@ -74,50 +91,52 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww); template<> BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_xyAB(simd128_neon_t _a, simd128_neon_t _b) { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 1, 4, 5 }); + return SHUFFLE_AB(_a, _b, 0, 1, 4, 5 ); } template<> BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_ABxy(simd128_neon_t _a, simd128_neon_t _b) { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 4, 5, 0, 1 }); + return SHUFFLE_AB(_a, _b, 4, 5, 0, 1 ); } template<> BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_CDzw(simd128_neon_t _a, simd128_neon_t _b) { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 7, 2, 3 }); + return SHUFFLE_AB(_a, _b, 6, 7, 2, 3 ); } template<> BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_zwCD(simd128_neon_t _a, simd128_neon_t _b) { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 3, 6, 7 }); + return SHUFFLE_AB(_a, _b, 2, 3, 6, 7 ); } template<> BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_xAyB(simd128_neon_t _a, simd128_neon_t _b) { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 4, 1, 5 }); + return SHUFFLE_AB(_a, _b, 0, 4, 1, 5 ); } template<> BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_AxBy(simd128_neon_t _a, simd128_neon_t _b) { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 1, 5, 0, 4 }); + return SHUFFLE_AB(_a, _b, 4, 0, 5, 1 ); } template<> BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_zCwD(simd128_neon_t _a, simd128_neon_t _b) { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 6, 3, 7 }); + return SHUFFLE_AB(_a, _b, 2, 6, 3, 7 ); } template<> BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_CzDw(simd128_neon_t _a, simd128_neon_t _b) { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 2, 7, 3 }); + return SHUFFLE_AB(_a, _b, 6, 2, 7, 3 ); } +#undef SHUFFLE_A +#undef SHUFFLE_AB template<> BX_SIMD_FORCE_INLINE float simd_x(simd128_neon_t _a) @@ -367,6 +386,7 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww); template<> BX_SIMD_FORCE_INLINE simd128_neon_t simd_sll(simd128_neon_t _a, int _count) { +#if !BX_COMPILER_CLANG if (__builtin_constant_p(_count) ) { const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); @@ -375,7 +395,7 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww); return result; } - +#endif const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); const int32x4_t shift = vdupq_n_s32(_count); const uint32x4_t tmp1 = vshlq_u32(tmp0, shift); @@ -387,6 +407,7 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww); template<> BX_SIMD_FORCE_INLINE simd128_neon_t simd_srl(simd128_neon_t _a, int _count) { +#if !BX_COMPILER_CLANG if (__builtin_constant_p(_count) ) { const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); @@ -395,7 +416,7 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww); return result; } - +#endif const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); const int32x4_t shift = vdupq_n_s32(-_count); const uint32x4_t tmp1 = vshlq_u32(tmp0, shift); @@ -407,6 +428,7 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww); template<> BX_SIMD_FORCE_INLINE simd128_neon_t simd_sra(simd128_neon_t _a, int _count) { +#if !BX_COMPILER_CLANG if (__builtin_constant_p(_count) ) { const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); @@ -415,7 +437,7 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww); return result; } - +#endif const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); const int32x4_t shift = vdupq_n_s32(-_count); const int32x4_t tmp1 = vshlq_s32(tmp0, shift); diff --git a/include/bx/simd_t.h b/include/bx/simd_t.h index c5e048c..0b07567 100644 --- a/include/bx/simd_t.h +++ b/include/bx/simd_t.h @@ -32,7 +32,7 @@ # include // __m128 # undef BX_SIMD_SSE # define BX_SIMD_SSE 1 -#elif defined(__ARM_NEON__) && !BX_COMPILER_CLANG +#elif defined(__ARM_NEON__) && (!BX_COMPILER_CLANG || BX_CLANG_HAS_EXTENSION(attribute_ext_vector_type) ) # include # undef BX_SIMD_NEON # define BX_SIMD_NEON 1