diff --git a/include/bx/float4_ni.h b/include/bx/float4_ni.h index 126000b..9bc721b 100644 --- a/include/bx/float4_ni.h +++ b/include/bx/float4_ni.h @@ -194,16 +194,11 @@ namespace bx namespace float4_logexp_detail { - BX_FLOAT4_INLINE float4_t float4_poly0(float4_t _a, float _b) - { - return float4_splat(_b); - } - BX_FLOAT4_INLINE float4_t float4_poly1(float4_t _a, float _b, float _c) { const float4_t bbbb = float4_splat(_b); - const float4_t poly0 = float4_poly0(_a, _c); - const float4_t result = float4_madd(poly0, _a, bbbb); + const float4_t cccc = float4_splat(_c); + const float4_t result = float4_madd(cccc, _a, bbbb); return result; } diff --git a/include/bx/float4_ref.h b/include/bx/float4_ref.h index 47c4184..42f9f0b 100644 --- a/include/bx/float4_ref.h +++ b/include/bx/float4_ref.h @@ -182,22 +182,37 @@ IMPLEMENT_TEST(xyzw , 0xf); BX_FLOAT4_INLINE float4_t float4_ld(const void* _ptr) { - return *reinterpret_cast(_ptr); + const uint32_t* input = reinterpret_cast(_ptr); + float4_t result; + result.uxyzw[0] = input[0]; + result.uxyzw[1] = input[1]; + result.uxyzw[2] = input[2]; + result.uxyzw[3] = input[3]; + return result; } BX_FLOAT4_INLINE void float4_st(void* _ptr, float4_t _a) { - *reinterpret_cast(_ptr) = _a; + uint32_t* result = reinterpret_cast(_ptr); + result[0] = _a.uxyzw[0]; + result[1] = _a.uxyzw[1]; + result[2] = _a.uxyzw[2]; + result[3] = _a.uxyzw[3]; } BX_FLOAT4_INLINE void float4_stx(void* _ptr, float4_t _a) { - *reinterpret_cast(_ptr) = _a.uxyzw[0]; + uint32_t* result = reinterpret_cast(_ptr); + result[0] = _a.uxyzw[0]; } BX_FLOAT4_INLINE void float4_stream(void* _ptr, float4_t _a) { - *reinterpret_cast(_ptr) = _a; + uint32_t* result = reinterpret_cast(_ptr); + result[0] = _a.uxyzw[0]; + result[1] = _a.uxyzw[1]; + result[2] = _a.uxyzw[2]; + result[3] = _a.uxyzw[3]; } BX_FLOAT4_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w) @@ -222,8 +237,13 @@ IMPLEMENT_TEST(xyzw , 0xf); BX_FLOAT4_INLINE float4_t float4_splat(const void* _ptr) { - float val = *reinterpret_cast(_ptr); - return float4_ld(val, val, val, val); + const uint32_t val = *reinterpret_cast(_ptr); + float4_t result; + result.uxyzw[0] = val; + result.uxyzw[1] = val; + result.uxyzw[2] = val; + result.uxyzw[3] = val; + return result; } BX_FLOAT4_INLINE float4_t float4_splat(float _a) diff --git a/include/bx/float4_t.h b/include/bx/float4_t.h index 9e9bc30..745a780 100644 --- a/include/bx/float4_t.h +++ b/include/bx/float4_t.h @@ -15,6 +15,7 @@ #elif 0 // __ARM_NEON__ # include "float4_neon.h" #else +# pragma message("************************************\nUsing SIMD reference implementation!\n************************************") # include "float4_ref.h" #endif // diff --git a/premake/toolchain.lua b/premake/toolchain.lua index f9fc714..1c44499 100755 --- a/premake/toolchain.lua +++ b/premake/toolchain.lua @@ -183,6 +183,9 @@ function toolchain(_buildDir, _libDir) targetsuffix "Release" configuration { "vs*" } + flags { + "EnableSSE2", + } includedirs { bxDir .. "include/compat/msvc" } defines { "WIN32",