diff --git a/include/bx/inline/simd128_langext.inl b/include/bx/inline/simd128_langext.inl index d48e553..a0a66af 100644 --- a/include/bx/inline/simd128_langext.inl +++ b/include/bx/inline/simd128_langext.inl @@ -346,6 +346,14 @@ BX_SIMD128_IMPLEMENT_TEST(xyzw , 0xf); return result; } + template<> + BX_SIMD_FORCE_INLINE simd128_langext_t simd_cmpneq(simd128_langext_t _a, simd128_langext_t _b) + { + simd128_langext_t result; + result.vi = _a.vf != _b.vf; + return result; + } + template<> BX_SIMD_FORCE_INLINE simd128_langext_t simd_cmplt(simd128_langext_t _a, simd128_langext_t _b) { diff --git a/include/bx/inline/simd128_neon.inl b/include/bx/inline/simd128_neon.inl index 5dfce0d..dd1ebb0 100644 --- a/include/bx/inline/simd128_neon.inl +++ b/include/bx/inline/simd128_neon.inl @@ -9,23 +9,15 @@ namespace bx { + #if BX_COMPILER_CLANG - -#define SHUFFLE_A(_a, _i0, _i1, _i2, _i3) \ -__builtin_shufflevector(_a, _a, _i0, _i1, _i2, _i3 ) -#define SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3) \ -__builtin_shufflevector(_a, _b, _i0, _i1, _i2, _i3 ) - +# define SHUFFLE_A(_a, _i0, _i1, _i2, _i3) __builtin_shufflevector(_a, _a, _i0, _i1, _i2, _i3 ) +# define SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3) __builtin_shufflevector(_a, _b, _i0, _i1, _i2, _i3 ) #else - -#define SHUFFLE_A(_a, _i0, _i1, _i2, _i3) \ -__builtin_shuffle(_a, (uint32x4_t){ _i0, _i1, _i2, _i3 }) -#define SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3) \ -__builtin_shuffle(_a, _b, (uint32x4_t){ _i0, _i1, _i2, _i3 }) - +# define SHUFFLE_A(_a, _i0, _i1, _i2, _i3) __builtin_shuffle(_a, (uint32x4_t){ _i0, _i1, _i2, _i3 }) +# define SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3) __builtin_shuffle(_a, _b, (uint32x4_t){ _i0, _i1, _i2, _i3 }) #endif - #define ELEMx 0 #define ELEMy 1 #define ELEMz 2 @@ -291,11 +283,17 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww); return result; } + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmpneq(simd128_neon_t _a, simd128_neon_t _b) + { + return simd_cmpneq_ni(_a, _b); + } + template<> BX_SIMD_FORCE_INLINE 
simd128_neon_t simd_cmplt(simd128_neon_t _a, simd128_neon_t _b) { - const uint32x4_t tmp = vcltq_f32(_a, _b); - const simd128_neon_t result = vreinterpretq_f32_u32(tmp); + const uint32x4_t tmp = vcltq_f32(_a, _b); + const simd128_neon_t result = vreinterpretq_f32_u32(tmp); return result; } @@ -303,8 +301,8 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww); template<> BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmple(simd128_neon_t _a, simd128_neon_t _b) { - const uint32x4_t tmp = vcleq_f32(_a, _b); - const simd128_neon_t result = vreinterpretq_f32_u32(tmp); + const uint32x4_t tmp = vcleq_f32(_a, _b); + const simd128_neon_t result = vreinterpretq_f32_u32(tmp); return result; } @@ -312,8 +310,8 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww); template<> BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmpgt(simd128_neon_t _a, simd128_neon_t _b) { - const uint32x4_t tmp = vcgtq_f32(_a, _b); - const simd128_neon_t result = vreinterpretq_f32_u32(tmp); + const uint32x4_t tmp = vcgtq_f32(_a, _b); + const simd128_neon_t result = vreinterpretq_f32_u32(tmp); return result; } @@ -321,8 +319,8 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww); template<> BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmpge(simd128_neon_t _a, simd128_neon_t _b) { - const uint32x4_t tmp = vcgeq_f32(_a, _b); - const simd128_neon_t result = vreinterpretq_f32_u32(tmp); + const uint32x4_t tmp = vcgeq_f32(_a, _b); + const simd128_neon_t result = vreinterpretq_f32_u32(tmp); return result; } diff --git a/include/bx/inline/simd128_ref.inl b/include/bx/inline/simd128_ref.inl index b99976e..7b91af5 100644 --- a/include/bx/inline/simd128_ref.inl +++ b/include/bx/inline/simd128_ref.inl @@ -396,6 +396,17 @@ BX_SIMD128_IMPLEMENT_TEST(xyzw , 0xf); return result; } + template<> + BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmpneq(simd128_ref_t _a, simd128_ref_t _b) + { + simd128_ref_t result; + result.ixyzw[0] = _a.fxyzw[0] != _b.fxyzw[0] ? 0xffffffff : 0x0; + result.ixyzw[1] = _a.fxyzw[1] != _b.fxyzw[1] ? 
0xffffffff : 0x0; + result.ixyzw[2] = _a.fxyzw[2] != _b.fxyzw[2] ? 0xffffffff : 0x0; + result.ixyzw[3] = _a.fxyzw[3] != _b.fxyzw[3] ? 0xffffffff : 0x0; + return result; + } + template<> BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmplt(simd128_ref_t _a, simd128_ref_t _b) { diff --git a/include/bx/inline/simd128_sse.inl b/include/bx/inline/simd128_sse.inl index 2fe09e3..79f185b 100644 --- a/include/bx/inline/simd128_sse.inl +++ b/include/bx/inline/simd128_sse.inl @@ -308,6 +308,12 @@ BX_SIMD128_IMPLEMENT_TEST(xyzw , 0xf); return _mm_cmpeq_ps(_a, _b); } + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmpneq(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_cmpneq_ps(_a, _b); + } + template<> BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmplt(simd128_sse_t _a, simd128_sse_t _b) { diff --git a/include/bx/inline/simd_ni.inl b/include/bx/inline/simd_ni.inl index 69d5110..499e54f 100644 --- a/include/bx/inline/simd_ni.inl +++ b/include/bx/inline/simd_ni.inl @@ -124,6 +124,15 @@ namespace bx return result; } + template<typename Ty> + BX_SIMD_INLINE Ty simd_cmpneq_ni(Ty _a, Ty _b) + { + const Ty tmp0 = simd_cmpeq(_a, _b); + const Ty result = simd_not(tmp0); + + return result; + } + template<typename Ty> BX_SIMD_INLINE Ty simd_min_ni(Ty _a, Ty _b) { diff --git a/include/bx/simd_t.h b/include/bx/simd_t.h index 0b07567..5e5fefa 100644 --- a/include/bx/simd_t.h +++ b/include/bx/simd_t.h @@ -196,6 +196,9 @@ BX_SIMD128_IMPLEMENT_TEST(xyzw); template<typename Ty> Ty simd_cmpeq(Ty _a, Ty _b); + template<typename Ty> + Ty simd_cmpneq(Ty _a, Ty _b); + template<typename Ty> Ty simd_cmplt(Ty _a, Ty _b);