diff --git a/include/bx/inline/simd128_langext.inl b/include/bx/inline/simd128_langext.inl
index d48e553..a0a66af 100644
--- a/include/bx/inline/simd128_langext.inl
+++ b/include/bx/inline/simd128_langext.inl
@@ -346,6 +346,14 @@ BX_SIMD128_IMPLEMENT_TEST(xyzw , 0xf);
 		return result;
 	}
 
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_langext_t simd_cmpneq(simd128_langext_t _a, simd128_langext_t _b)
+	{ // Lane-wise float "not equal" of _a and _b for the compiler vector-extension back end.
+		simd128_langext_t result;
+		result.vi = _a.vf != _b.vf; // compares the float view; the per-lane outcome is stored through the integer view (vector extensions give -1 for true, 0 for false)
+		return result;
+	}
+
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_langext_t simd_cmplt(simd128_langext_t _a, simd128_langext_t _b)
 	{
diff --git a/include/bx/inline/simd128_neon.inl b/include/bx/inline/simd128_neon.inl
index 5dfce0d..dd1ebb0 100644
--- a/include/bx/inline/simd128_neon.inl
+++ b/include/bx/inline/simd128_neon.inl
@@ -9,23 +9,15 @@
 
 namespace bx
 {
+
 #if BX_COMPILER_CLANG
-
-#define SHUFFLE_A(_a,  _i0, _i1, _i2, _i3)	\
-__builtin_shufflevector(_a, _a, _i0, _i1, _i2, _i3 )
-#define SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3)	\
-__builtin_shufflevector(_a, _b, _i0, _i1, _i2, _i3 )
-	
+#	define SHUFFLE_A(_a,  _i0, _i1, _i2, _i3)     __builtin_shufflevector(_a, _a, _i0, _i1, _i2, _i3 ) // Clang path: permute the lanes of a single vector (_a is passed as both sources)
+#	define SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3) __builtin_shufflevector(_a, _b, _i0, _i1, _i2, _i3 ) // Clang path: build a vector from lanes of _a and _b
 #else
-
-#define SHUFFLE_A(_a,  _i0, _i1, _i2, _i3)	\
-__builtin_shuffle(_a, (uint32x4_t){ _i0, _i1, _i2, _i3 })
-#define SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3)	\
-__builtin_shuffle(_a, _b, (uint32x4_t){ _i0, _i1, _i2, _i3 })
-
+#	define SHUFFLE_A(_a,  _i0, _i1, _i2, _i3)     __builtin_shuffle(_a, (uint32x4_t){ _i0, _i1, _i2, _i3 }) // non-Clang path: permute the lanes of _a using a uint32x4_t index vector
+#	define SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3) __builtin_shuffle(_a, _b, (uint32x4_t){ _i0, _i1, _i2, _i3 }) // non-Clang path: two-source form with the same index-vector convention
 #endif
 
-	
 #define ELEMx 0
 #define ELEMy 1
 #define ELEMz 2
@@ -291,11 +283,17 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 		return result;
 	}
 
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmpneq(simd128_neon_t _a, simd128_neon_t _b)
+	{ // NEON lane-wise "not equal"; implemented by delegation rather than a dedicated intrinsic.
+		return simd_cmpneq_ni(_a, _b); // generic helper: simd_not(simd_cmpeq(_a, _b))
+	}
+
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmplt(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		const uint32x4_t tmp    = vcltq_f32(_a, _b);
-		const simd128_neon_t   result = vreinterpretq_f32_u32(tmp);
+		const uint32x4_t tmp        = vcltq_f32(_a, _b); // lane-wise _a < _b, produced as a vector of unsigned 32-bit masks
+		const simd128_neon_t result = vreinterpretq_f32_u32(tmp); // reinterpret the mask bits as the float vector type; no value conversion
 
 		return result;
 	}
@@ -303,8 +301,8 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmple(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		const uint32x4_t tmp    = vcleq_f32(_a, _b);
-		const simd128_neon_t   result = vreinterpretq_f32_u32(tmp);
+		const uint32x4_t tmp        = vcleq_f32(_a, _b); // lane-wise _a <= _b, produced as a vector of unsigned 32-bit masks
+		const simd128_neon_t result = vreinterpretq_f32_u32(tmp); // reinterpret the mask bits as the float vector type; no value conversion
 
 		return result;
 	}
@@ -312,8 +310,8 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmpgt(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		const uint32x4_t tmp    = vcgtq_f32(_a, _b);
-		const simd128_neon_t   result = vreinterpretq_f32_u32(tmp);
+		const uint32x4_t tmp        = vcgtq_f32(_a, _b); // lane-wise _a > _b, produced as a vector of unsigned 32-bit masks
+		const simd128_neon_t result = vreinterpretq_f32_u32(tmp); // reinterpret the mask bits as the float vector type; no value conversion
 
 		return result;
 	}
@@ -321,8 +319,8 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmpge(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		const uint32x4_t tmp    = vcgeq_f32(_a, _b);
-		const simd128_neon_t   result = vreinterpretq_f32_u32(tmp);
+		const uint32x4_t tmp        = vcgeq_f32(_a, _b); // lane-wise _a >= _b, produced as a vector of unsigned 32-bit masks
+		const simd128_neon_t result = vreinterpretq_f32_u32(tmp); // reinterpret the mask bits as the float vector type; no value conversion
 
 		return result;
 	}
diff --git a/include/bx/inline/simd128_ref.inl b/include/bx/inline/simd128_ref.inl
index b99976e..7b91af5 100644
--- a/include/bx/inline/simd128_ref.inl
+++ b/include/bx/inline/simd128_ref.inl
@@ -396,6 +396,17 @@ BX_SIMD128_IMPLEMENT_TEST(xyzw , 0xf);
 		return result;
 	}
 
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmpneq(simd128_ref_t _a, simd128_ref_t _b)
+	{ // Scalar reference version: compares the four float lanes one at a time.
+		simd128_ref_t result;
+		result.ixyzw[0] = _a.fxyzw[0] != _b.fxyzw[0] ? 0xffffffff : 0x0; // each lane: all bits set when the floats differ, zero when they are equal
+		result.ixyzw[1] = _a.fxyzw[1] != _b.fxyzw[1] ? 0xffffffff : 0x0;
+		result.ixyzw[2] = _a.fxyzw[2] != _b.fxyzw[2] ? 0xffffffff : 0x0;
+		result.ixyzw[3] = _a.fxyzw[3] != _b.fxyzw[3] ? 0xffffffff : 0x0;
+		return result;
+	}
+
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmplt(simd128_ref_t _a, simd128_ref_t _b)
 	{
diff --git a/include/bx/inline/simd128_sse.inl b/include/bx/inline/simd128_sse.inl
index 2fe09e3..79f185b 100644
--- a/include/bx/inline/simd128_sse.inl
+++ b/include/bx/inline/simd128_sse.inl
@@ -308,6 +308,12 @@ BX_SIMD128_IMPLEMENT_TEST(xyzw , 0xf);
 		return _mm_cmpeq_ps(_a, _b);
 	}
 
+	template<>
+	BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmpneq(simd128_sse_t _a, simd128_sse_t _b)
+	{ // SSE lane-wise "not equal" using the native packed-single compare.
+		return _mm_cmpneq_ps(_a, _b); // true lanes are 0xffffffff, false lanes 0; lanes with a NaN operand compare as not-equal
+	}
+
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmplt(simd128_sse_t _a, simd128_sse_t _b)
 	{
diff --git a/include/bx/inline/simd_ni.inl b/include/bx/inline/simd_ni.inl
index 69d5110..499e54f 100644
--- a/include/bx/inline/simd_ni.inl
+++ b/include/bx/inline/simd_ni.inl
@@ -124,6 +124,15 @@ namespace bx
 		return result;
 	}
 
+	template<typename Ty>
+	BX_SIMD_INLINE Ty simd_cmpneq_ni(Ty _a, Ty _b)
+	{ // Generic "not equal" composed from simd_cmpeq and simd_not, for any vector type Ty that provides both.
+		const Ty tmp0   = simd_cmpeq(_a, _b); // result of the equality compare (presumably a per-lane mask - confirm against simd_cmpeq)
+		const Ty result = simd_not(tmp0); // inverted via simd_not so that equal lanes become the "false" lanes
+
+		return result;
+	}
+
 	template<typename Ty>
 	BX_SIMD_INLINE Ty simd_min_ni(Ty _a, Ty _b)
 	{
diff --git a/include/bx/simd_t.h b/include/bx/simd_t.h
index 0b07567..5e5fefa 100644
--- a/include/bx/simd_t.h
+++ b/include/bx/simd_t.h
@@ -196,6 +196,9 @@ BX_SIMD128_IMPLEMENT_TEST(xyzw);
 	template<typename Ty>
 	Ty simd_cmpeq(Ty _a, Ty _b);
 
+	template<typename Ty>
+	Ty simd_cmpneq(Ty _a, Ty _b); // Lane-wise _a != _b; declared here and specialized per SIMD back end in the inline/simd128_*.inl files.
+
 	template<typename Ty>
 	Ty simd_cmplt(Ty _a, Ty _b);