diff --git a/include/bx/float4_ni.h b/include/bx/float4_ni.h
index 126000b..9bc721b 100644
--- a/include/bx/float4_ni.h
+++ b/include/bx/float4_ni.h
@@ -194,16 +194,11 @@ namespace bx
 
 	namespace float4_logexp_detail
 	{
-		BX_FLOAT4_INLINE float4_t float4_poly0(float4_t _a, float _b)
-		{
-			return float4_splat(_b);
-		}
-
 		BX_FLOAT4_INLINE float4_t float4_poly1(float4_t _a, float _b, float _c)
 		{
 			const float4_t bbbb   = float4_splat(_b);
-			const float4_t poly0  = float4_poly0(_a, _c);
-			const float4_t result = float4_madd(poly0, _a, bbbb);
+			const float4_t cccc   = float4_splat(_c); // splat _c inline; the float4_poly0 helper that only did this splat is removed by this patch
+			const float4_t result = float4_madd(cccc, _a, bbbb); // NOTE(review): presumably cccc * _a + bbbb per lane -- confirm float4_madd operand order
 
 			return result;
 		}
diff --git a/include/bx/float4_ref.h b/include/bx/float4_ref.h
index 47c4184..42f9f0b 100644
--- a/include/bx/float4_ref.h
+++ b/include/bx/float4_ref.h
@@ -182,22 +182,37 @@ IMPLEMENT_TEST(xyzw , 0xf);
 
 	BX_FLOAT4_INLINE float4_t float4_ld(const void* _ptr)
 	{
-		return *reinterpret_cast<const float4_t*>(_ptr);
+		const uint32_t* input = reinterpret_cast<const uint32_t*>(_ptr); // Load: view the source as four consecutive 32-bit words
+		float4_t result;
+		result.uxyzw[0] = input[0]; // lane-by-lane copy replaces the old whole-float4_t dereference
+		result.uxyzw[1] = input[1];
+		result.uxyzw[2] = input[2];
+		result.uxyzw[3] = input[3];
+		return result;
 	}
 
 	BX_FLOAT4_INLINE void float4_st(void* _ptr, float4_t _a)
 	{
-		*reinterpret_cast<float4_t*>(_ptr) = _a;
+		uint32_t* result = reinterpret_cast<uint32_t*>(_ptr); // Store: destination viewed as four consecutive 32-bit words
+		result[0] = _a.uxyzw[0]; // each lane is written individually instead of assigning a whole float4_t
+		result[1] = _a.uxyzw[1];
+		result[2] = _a.uxyzw[2];
+		result[3] = _a.uxyzw[3];
 	}
 
 	BX_FLOAT4_INLINE void float4_stx(void* _ptr, float4_t _a)
 	{
-		*reinterpret_cast<uint32_t*>(_ptr) = _a.uxyzw[0];
+		uint32_t* result = reinterpret_cast<uint32_t*>(_ptr); // only the first 32-bit word at _ptr is written
+		result[0] = _a.uxyzw[0]; // lane 0 only; same value the previous one-line store wrote
 	}
 
 	BX_FLOAT4_INLINE void float4_stream(void* _ptr, float4_t _a)
 	{
-		*reinterpret_cast<float4_t*>(_ptr) = _a;
+		uint32_t* result = reinterpret_cast<uint32_t*>(_ptr); // NOTE(review): body is identical to float4_st; no streaming-specific behavior is visible here
+		result[0] = _a.uxyzw[0]; // each lane is written individually instead of assigning a whole float4_t
+		result[1] = _a.uxyzw[1];
+		result[2] = _a.uxyzw[2];
+		result[3] = _a.uxyzw[3];
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w)
@@ -222,8 +237,13 @@ IMPLEMENT_TEST(xyzw , 0xf);
 
 	BX_FLOAT4_INLINE float4_t float4_splat(const void* _ptr)
 	{
-		float val = *reinterpret_cast<const float*>(_ptr);
-		return float4_ld(val, val, val, val);
+		const uint32_t val = *reinterpret_cast<const uint32_t*>(_ptr); // read one 32-bit word as raw bits (the old code read it as a float)
+		float4_t result;
+		result.uxyzw[0] = val; // the same bit pattern is replicated into all four lanes
+		result.uxyzw[1] = val;
+		result.uxyzw[2] = val;
+		result.uxyzw[3] = val;
+		return result;
 	}
 
 	BX_FLOAT4_INLINE float4_t float4_splat(float _a)
diff --git a/include/bx/float4_t.h b/include/bx/float4_t.h
index 9e9bc30..745a780 100644
--- a/include/bx/float4_t.h
+++ b/include/bx/float4_t.h
@@ -15,6 +15,7 @@
 #elif 0 // __ARM_NEON__
 #	include "float4_neon.h"
 #else
+#	pragma message("************************************\nUsing SIMD reference implementation!\n************************************")
 #	include "float4_ref.h"
 #endif //
 
diff --git a/premake/toolchain.lua b/premake/toolchain.lua
index f9fc714..1c44499 100755
--- a/premake/toolchain.lua
+++ b/premake/toolchain.lua
@@ -183,6 +183,9 @@ function toolchain(_buildDir, _libDir)
 		targetsuffix "Release"
 
 	configuration { "vs*" }
+		flags {
+			"EnableSSE2",
+		}
 		includedirs { bxDir .. "include/compat/msvc" }
 		defines {
 			"WIN32",