From 50516d4312063d2199e058352f5d339a2a775a6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Branimir=20Karad=C5=BEi=C4=87?= Date: Sun, 17 Jul 2016 10:48:22 -0700 Subject: [PATCH] Refactoring SIMD code. --- include/bx/float4_langext.h | 910 ++++++++++++++------------- include/bx/float4_neon.h | 964 ++++++++++++++-------------- include/bx/float4_ni.h | 503 ++++++++------- include/bx/float4_ref.h | 1172 ++++++++++++++++++----------------- include/bx/float4_sse.h | 446 +++++++++---- include/bx/float4_t.h | 387 +++++++++++- include/bx/macros.h | 4 +- 7 files changed, 2559 insertions(+), 1827 deletions(-) diff --git a/include/bx/float4_langext.h b/include/bx/float4_langext.h index c5c3ddd..2214df6 100644 --- a/include/bx/float4_langext.h +++ b/include/bx/float4_langext.h @@ -6,446 +6,6 @@ #ifndef BX_FLOAT4_LANGEXT_H_HEADER_GUARD #define BX_FLOAT4_LANGEXT_H_HEADER_GUARD -#include - -namespace bx -{ - typedef union float4_t - { - float __attribute__((vector_size(16))) vf; - int32_t __attribute__((vector_size(16))) vi; - uint32_t __attribute__((vector_size(16))) vu; - float fxyzw[4]; - int32_t ixyzw[4]; - uint32_t uxyzw[4]; - - } float4_t; - -#define ELEMx 0 -#define ELEMy 1 -#define ELEMz 2 -#define ELEMw 3 -#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ - BX_FLOAT4_FORCE_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \ - { \ - float4_t result; \ - result.vf = __builtin_shufflevector(_a.vf, _a.vf, ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w); \ - return result; \ - } - -#include "float4_swizzle.inl" - -#undef IMPLEMENT_SWIZZLE -#undef ELEMw -#undef ELEMz -#undef ELEMy -#undef ELEMx - -#define IMPLEMENT_TEST(_xyzw, _mask) \ - BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_t _test) \ - { \ - uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \ - | ( (_test.uxyzw[2]>>31)<<2) \ - | ( (_test.uxyzw[1]>>31)<<1) \ - | ( _test.uxyzw[0]>>31) \ - ; \ - return 0 != (tmp&(_mask) ); \ - } \ - \ - BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_t _test) \ - { \ - uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \ - | ( (_test.uxyzw[2]>>31)<<2) \ - | ( (_test.uxyzw[1]>>31)<<1) \ - | ( _test.uxyzw[0]>>31) \ - ; \ - return (_mask) == (tmp&(_mask) ); \ - } - -IMPLEMENT_TEST(x , 0x1); -IMPLEMENT_TEST(y , 0x2); -IMPLEMENT_TEST(xy , 0x3); -IMPLEMENT_TEST(z , 0x4); -IMPLEMENT_TEST(xz , 0x5); -IMPLEMENT_TEST(yz , 0x6); -IMPLEMENT_TEST(xyz , 0x7); -IMPLEMENT_TEST(w , 0x8); -IMPLEMENT_TEST(xw , 0x9); -IMPLEMENT_TEST(yw , 0xa); -IMPLEMENT_TEST(xyw , 0xb); -IMPLEMENT_TEST(zw , 0xc); -IMPLEMENT_TEST(xzw , 0xd); -IMPLEMENT_TEST(yzw , 0xe); -IMPLEMENT_TEST(xyzw , 0xf); - -#undef IMPLEMENT_TEST - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b) - { - float4_t result; - result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 1, 4, 5); - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b) - { - float4_t result; - result.vf = __builtin_shufflevector(_a.vf, _b.vf, 4, 5, 0, 1); - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b) - { - float4_t result; - result.vf = __builtin_shufflevector(_a.vf, _b.vf, 5, 7, 2, 3); - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b) - { - float4_t result; - result.vf = __builtin_shufflevector(_a.vf, _b.vf, 2, 3, 5, 7); - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b) - { - float4_t result; - result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 4, 1, 5); - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b) - { - float4_t result; - result.vf = __builtin_shufflevector(_a.vf, _b.vf, 1, 5, 0, 4); - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b) - { - float4_t result; - result.vf = __builtin_shufflevector(_a.vf, _b.vf, 2, 6, 3, 7); - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b) - { - float4_t result; - result.vf = __builtin_shufflevector(_a.vf, _b.vf, 6, 2, 7, 3); - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xAzC(float4_t _a, float4_t _b) - { - float4_t result; - result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 4, 2, 6); - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_yBwD(float4_t _a, float4_t _b) - { - float4_t result; - result.vf = __builtin_shufflevector(_a.vf, _b.vf, 1, 5, 3, 7); - return result; - } - - BX_FLOAT4_FORCE_INLINE float float4_x(float4_t _a) - { - return _a.fxyzw[0]; - } - - BX_FLOAT4_FORCE_INLINE float float4_y(float4_t _a) - { - return _a.fxyzw[1]; - } - - BX_FLOAT4_FORCE_INLINE float float4_z(float4_t _a) - { - return _a.fxyzw[2]; - } - - BX_FLOAT4_FORCE_INLINE float float4_w(float4_t _a) - { - return _a.fxyzw[3]; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_ld(const void* _ptr) - { - const uint32_t* input = reinterpret_cast(_ptr); - float4_t result; - result.uxyzw[0] = input[0]; - result.uxyzw[1] = input[1]; - result.uxyzw[2] = input[2]; - result.uxyzw[3] = input[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_t _a) - { - uint32_t* result = reinterpret_cast(_ptr); - result[0] = _a.uxyzw[0]; - result[1] = _a.uxyzw[1]; - result[2] = _a.uxyzw[2]; - result[3] = _a.uxyzw[3]; - } - - BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_t _a) - { - uint32_t* result = reinterpret_cast(_ptr); - result[0] = _a.uxyzw[0]; - } - - BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_t _a) - { - uint32_t* result = reinterpret_cast(_ptr); - result[0] = _a.uxyzw[0]; - result[1] = _a.uxyzw[1]; - result[2] = _a.uxyzw[2]; - result[3] = _a.uxyzw[3]; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w) - { - float4_t result; - result.vf = { _x, _y, _z, _w }; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) - { - float4_t result; - result.vu = { _x, _y, _z, _w }; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_splat(const void* _ptr) - { - const uint32_t val = *reinterpret_cast(_ptr); - float4_t result; - result.vu = { val, val, val, val }; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_splat(float _a) - { - return float4_ld(_a, _a, _a, _a); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_isplat(uint32_t _a) - { - return float4_ild(_a, _a, _a, _a); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_zero() - { - return float4_ild(0, 0, 0, 0); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_itof(float4_t _a) - { - float4_t result; - result.vf = __builtin_convertvector(_a.vi, float __attribute__((vector_size(16))) ); - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_ftoi(float4_t _a) - { - float4_t result; - result.vi = __builtin_convertvector(_a.vf, int32_t __attribute__((vector_size(16))) ); - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_round(float4_t _a) - { - const float4_t tmp = float4_ftoi(_a); - const float4_t result = float4_itof(tmp); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_add(float4_t _a, float4_t _b) - { - float4_t result; - result.vf = _a.vf + _b.vf; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_sub(float4_t _a, float4_t _b) - { - float4_t result; - result.vf = _a.vf - _b.vf; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_mul(float4_t _a, float4_t _b) - { - float4_t result; - result.vf = _a.vf * _b.vf; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_div(float4_t _a, float4_t _b) - { - float4_t result; - result.vf = _a.vf / _b.vf; - return result; - } - -#if 0 - BX_FLOAT4_FORCE_INLINE float4_t float4_rcp_est(float4_t _a) - { - float4_t result; - const float4_t one = float4_splat(1.0f); - result.vf = one / _a.vf; - return result; - } -#endif // 0 - - BX_FLOAT4_FORCE_INLINE float4_t float4_sqrt(float4_t _a) - { - float4_t result; - result.vf[0] = sqrtf(_a.vf[0]); - result.vf[1] = sqrtf(_a.vf[1]); - result.vf[2] = sqrtf(_a.vf[2]); - result.vf[3] = sqrtf(_a.vf[3]); - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_rsqrt_est(float4_t _a) - { - float4_t result; - result.vf[0] = 1.0f / sqrtf(_a.vf[0]); - result.vf[1] = 1.0f / sqrtf(_a.vf[1]); - result.vf[2] = 1.0f / sqrtf(_a.vf[2]); - result.vf[3] = 1.0f / sqrtf(_a.vf[3]); - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_cmpeq(float4_t _a, float4_t _b) - { - float4_t result; - result.vi = _a.vf == _b.vf; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_cmplt(float4_t _a, float4_t _b) - { - float4_t result; - result.vi = _a.vf < _b.vf; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_cmple(float4_t _a, float4_t _b) - { - float4_t result; - result.vi = _a.vf <= _b.vf; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_cmpgt(float4_t _a, float4_t _b) - { - float4_t result; - result.vi = _a.vf > _b.vf; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_cmpge(float4_t _a, float4_t _b) - { - float4_t result; - result.vi = _a.vf >= _b.vf; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_and(float4_t _a, float4_t _b) - { - float4_t result; - result.vu = _a.vu & _b.vu; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_andc(float4_t _a, float4_t _b) - { - float4_t result; - result.vu = _a.vu & ~_b.vu; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_or(float4_t _a, float4_t _b) - { - float4_t result; - result.vu = _a.vu | _b.vu; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_xor(float4_t _a, float4_t _b) - { - float4_t result; - result.vu = _a.vu ^ _b.vu; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_sll(float4_t _a, int _count) - { - float4_t result; - const float4_t count = float4_isplat(_count); - result.vu = _a.vu << count.vi; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_srl(float4_t _a, int _count) - { - float4_t result; - const float4_t count = float4_isplat(_count); - result.vu = _a.vu >> count.vi; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_sra(float4_t _a, int _count) - { - float4_t result; - const float4_t count = float4_isplat(_count); - result.vi = _a.vi >> count.vi; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_icmpeq(float4_t _a, float4_t _b) - { - float4_t result; - result.vi = _a.vi == _b.vi; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_icmplt(float4_t _a, float4_t _b) - { - float4_t result; - result.vi = _a.vi < _b.vi; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_icmpgt(float4_t _a, float4_t _b) - { - float4_t result; - result.vi = _a.vi > _b.vi; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_iadd(float4_t _a, float4_t _b) - { - float4_t result; - result.vi = _a.vi + _b.vi; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_isub(float4_t _a, float4_t _b) - { - float4_t result; - result.vi = _a.vi - _b.vi; - return result; - } - -} // namespace bx - #define float4_rcp float4_rcp_ni #define float4_orx float4_orx_ni #define float4_orc float4_orc_ni @@ -479,4 +39,474 @@ IMPLEMENT_TEST(xyzw , 0xf); #define float4_imax float4_imax_ni #include "float4_ni.h" +namespace bx +{ +#define ELEMx 0 +#define ELEMy 1 +#define ELEMz 2 +#define ELEMw 3 +#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ + template<> \ + BX_FLOAT4_FORCE_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \ + { \ + float4_t result; \ + result.vf = __builtin_shufflevector(_a.vf, _a.vf, ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w); \ + return result; \ + } + +#include "float4_swizzle.inl" + +#undef IMPLEMENT_SWIZZLE +#undef ELEMw +#undef ELEMz +#undef ELEMy +#undef ELEMx + +#define IMPLEMENT_TEST(_xyzw, _mask) \ + template<> \ + BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_t _test) \ + { \ + uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \ + | ( (_test.uxyzw[2]>>31)<<2) \ + | ( (_test.uxyzw[1]>>31)<<1) \ + | ( _test.uxyzw[0]>>31) \ + ; \ + return 0 != (tmp&(_mask) ); \ + } \ + \ + template<> \ + BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_t _test) \ + { \ + uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \ + | ( (_test.uxyzw[2]>>31)<<2) \ + | ( (_test.uxyzw[1]>>31)<<1) \ + | ( _test.uxyzw[0]>>31) \ + ; \ + return (_mask) == (tmp&(_mask) ); \ + } + +IMPLEMENT_TEST(x , 0x1); +IMPLEMENT_TEST(y , 0x2); +IMPLEMENT_TEST(xy , 0x3); +IMPLEMENT_TEST(z , 0x4); +IMPLEMENT_TEST(xz , 0x5); +IMPLEMENT_TEST(yz , 0x6); +IMPLEMENT_TEST(xyz , 0x7); +IMPLEMENT_TEST(w , 0x8); +IMPLEMENT_TEST(xw , 0x9); +IMPLEMENT_TEST(yw , 0xa); +IMPLEMENT_TEST(xyw , 0xb); +IMPLEMENT_TEST(zw , 0xc); +IMPLEMENT_TEST(xzw , 0xd); +IMPLEMENT_TEST(yzw , 0xe); +IMPLEMENT_TEST(xyzw , 0xf); + +#undef IMPLEMENT_TEST + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b) + { + float4_t result; + result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 1, 4, 5); + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b) + { + float4_t result; + result.vf = __builtin_shufflevector(_a.vf, _b.vf, 4, 5, 0, 1); + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b) + { + float4_t result; + result.vf = __builtin_shufflevector(_a.vf, _b.vf, 5, 7, 2, 3); + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b) + { + float4_t result; + result.vf = __builtin_shufflevector(_a.vf, _b.vf, 2, 3, 5, 7); + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b) + { + float4_t result; + result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 4, 1, 5); + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b) + { + float4_t result; + result.vf = __builtin_shufflevector(_a.vf, _b.vf, 1, 5, 0, 4); + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b) + { + float4_t result; + result.vf = __builtin_shufflevector(_a.vf, _b.vf, 2, 6, 3, 7); + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b) + { + float4_t result; + result.vf = __builtin_shufflevector(_a.vf, _b.vf, 6, 2, 7, 3); + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xAzC(float4_t _a, float4_t _b) + { + float4_t result; + result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 4, 2, 6); + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_yBwD(float4_t _a, float4_t _b) + { + float4_t result; + result.vf = __builtin_shufflevector(_a.vf, _b.vf, 1, 5, 3, 7); + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float float4_x(float4_t _a) + { + return _a.fxyzw[0]; + } + + template<> + BX_FLOAT4_FORCE_INLINE float float4_y(float4_t _a) + { + return _a.fxyzw[1]; + } + + template<> + BX_FLOAT4_FORCE_INLINE float float4_z(float4_t _a) + { + return _a.fxyzw[2]; + } + + template<> + BX_FLOAT4_FORCE_INLINE float float4_w(float4_t _a) + { + return _a.fxyzw[3]; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_ld(const void* _ptr) + { + const uint32_t* input = reinterpret_cast(_ptr); + float4_t result; + result.uxyzw[0] = input[0]; + result.uxyzw[1] = input[1]; + result.uxyzw[2] = input[2]; + result.uxyzw[3] = input[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_t _a) + { + uint32_t* result = reinterpret_cast(_ptr); + result[0] = _a.uxyzw[0]; + result[1] = _a.uxyzw[1]; + result[2] = _a.uxyzw[2]; + result[3] = _a.uxyzw[3]; + } + + template<> + BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_t _a) + { + uint32_t* result = reinterpret_cast(_ptr); + result[0] = _a.uxyzw[0]; + } + + template<> + BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_t _a) + { + uint32_t* result = reinterpret_cast(_ptr); + result[0] = _a.uxyzw[0]; + result[1] = _a.uxyzw[1]; + result[2] = _a.uxyzw[2]; + result[3] = _a.uxyzw[3]; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w) + { + float4_t result; + result.vf = { _x, _y, _z, _w }; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) + { + float4_t result; + result.vu = { _x, _y, _z, _w }; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_splat(const void* _ptr) + { + const uint32_t val = *reinterpret_cast(_ptr); + float4_t result; + result.vu = { val, val, val, val }; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_splat(float _a) + { + return float4_ld(_a, _a, _a, _a); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_isplat(uint32_t _a) + { + return float4_ild(_a, _a, _a, _a); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_zero() + { + return float4_ild(0, 0, 0, 0); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_itof(float4_t _a) + { + float4_t result; + result.vf = __builtin_convertvector(_a.vi, float __attribute__((vector_size(16))) ); + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_ftoi(float4_t _a) + { + float4_t result; + result.vi = __builtin_convertvector(_a.vf, int32_t __attribute__((vector_size(16))) ); + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_round(float4_t _a) + { + const float4_t tmp = float4_ftoi(_a); + const float4_t result = float4_itof(tmp); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_add(float4_t _a, float4_t _b) + { + float4_t result; + result.vf = _a.vf + _b.vf; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_sub(float4_t _a, float4_t _b) + { + float4_t result; + result.vf = _a.vf - _b.vf; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_mul(float4_t _a, float4_t _b) + { + float4_t result; + result.vf = _a.vf * _b.vf; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_div(float4_t _a, float4_t _b) + { + float4_t result; + result.vf = _a.vf / _b.vf; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_sqrt(float4_t _a) + { + float4_t result; + result.vf[0] = sqrtf(_a.vf[0]); + result.vf[1] = sqrtf(_a.vf[1]); + result.vf[2] = sqrtf(_a.vf[2]); + result.vf[3] = sqrtf(_a.vf[3]); + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_rsqrt_est(float4_t _a) + { + float4_t result; + result.vf[0] = 1.0f / sqrtf(_a.vf[0]); + result.vf[1] = 1.0f / sqrtf(_a.vf[1]); + result.vf[2] = 1.0f / sqrtf(_a.vf[2]); + result.vf[3] = 1.0f / sqrtf(_a.vf[3]); + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_cmpeq(float4_t _a, float4_t _b) + { + float4_t result; + result.vi = _a.vf == _b.vf; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_cmplt(float4_t _a, float4_t _b) + { + float4_t result; + result.vi = _a.vf < _b.vf; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_cmple(float4_t _a, float4_t _b) + { + float4_t result; + result.vi = _a.vf <= _b.vf; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_cmpgt(float4_t _a, float4_t _b) + { + float4_t result; + result.vi = _a.vf > _b.vf; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_cmpge(float4_t _a, float4_t _b) + { + float4_t result; + result.vi = _a.vf >= _b.vf; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_and(float4_t _a, float4_t _b) + { + float4_t result; + result.vu = _a.vu & _b.vu; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_andc(float4_t _a, float4_t _b) + { + float4_t result; + result.vu = _a.vu & ~_b.vu; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_or(float4_t _a, float4_t _b) + { + float4_t result; + result.vu = _a.vu | _b.vu; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_xor(float4_t _a, float4_t _b) + { + float4_t result; + result.vu = _a.vu ^ _b.vu; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_sll(float4_t _a, int _count) + { + float4_t result; + const float4_t count = float4_isplat(_count); + result.vu = _a.vu << count.vi; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_srl(float4_t _a, int _count) + { + float4_t result; + const float4_t count = float4_isplat(_count); + result.vu = _a.vu >> count.vi; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_sra(float4_t _a, int _count) + { + float4_t result; + const float4_t count = float4_isplat(_count); + result.vi = _a.vi >> count.vi; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_icmpeq(float4_t _a, float4_t _b) + { + float4_t result; + result.vi = _a.vi == _b.vi; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_icmplt(float4_t _a, float4_t _b) + { + float4_t result; + result.vi = _a.vi < _b.vi; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_icmpgt(float4_t _a, float4_t _b) + { + float4_t result; + result.vi = _a.vi > _b.vi; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_iadd(float4_t _a, float4_t _b) + { + float4_t result; + result.vi = _a.vi + _b.vi; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_t float4_isub(float4_t _a, float4_t _b) + { + float4_t result; + result.vi = _a.vi - _b.vi; + return result; + } + +} // namespace bx + #endif // BX_FLOAT4_LANGEXT_H_HEADER_GUARD diff --git a/include/bx/float4_neon.h b/include/bx/float4_neon.h index a3c9b85..32bda46 100644 --- a/include/bx/float4_neon.h +++ b/include/bx/float4_neon.h @@ -6,447 +6,6 @@ #ifndef BX_FLOAT4_NEON_H_HEADER_GUARD #define BX_FLOAT4_NEON_H_HEADER_GUARD -#include - -namespace bx -{ - typedef float32x4_t float4_t; - - -#define ELEMx 0 -#define ELEMy 1 -#define ELEMz 2 -#define ELEMw 3 -#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ - BX_FLOAT4_FORCE_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \ - { \ - return __builtin_shuffle(_a, (uint32x4_t){ ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w }); \ - } - -#include "float4_swizzle.inl" - -#undef IMPLEMENT_SWIZZLE -#undef ELEMw -#undef ELEMz -#undef ELEMy -#undef ELEMx - -#define IMPLEMENT_TEST(_xyzw, _swizzle) \ - BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_t _test); \ - BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_t _test); - -IMPLEMENT_TEST(x , xxxx); -IMPLEMENT_TEST(y , yyyy); -IMPLEMENT_TEST(xy , xyyy); -IMPLEMENT_TEST(z , zzzz); -IMPLEMENT_TEST(xz , xzzz); -IMPLEMENT_TEST(yz , yzzz); -IMPLEMENT_TEST(xyz , xyzz); -IMPLEMENT_TEST(w , wwww); -IMPLEMENT_TEST(xw , xwww); -IMPLEMENT_TEST(yw , ywww); -IMPLEMENT_TEST(xyw , xyww); -IMPLEMENT_TEST(zw , zwww); -IMPLEMENT_TEST(xzw , xzww); -IMPLEMENT_TEST(yzw , yzww); -IMPLEMENT_TEST(xyzw , xyzw); - -#undef IMPLEMENT_TEST - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b) - { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 1, 4, 5 }); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b) - { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 4, 5, 0, 1 }); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b) - { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 7, 2, 3 }); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b) - { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 3, 6, 7 }); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b) - { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 4, 1, 5 }); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b) - { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 1, 5, 0, 4 }); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b) - { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 6, 3, 7 }); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b) - { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 2, 7, 3 }); - } - - BX_FLOAT4_FORCE_INLINE float float4_x(float4_t _a) - { - return vgetq_lane_f32(_a, 0); - } - - BX_FLOAT4_FORCE_INLINE float float4_y(float4_t _a) - { - return vgetq_lane_f32(_a, 1); - } - - BX_FLOAT4_FORCE_INLINE float float4_z(float4_t _a) - { - return vgetq_lane_f32(_a, 2); - } - - BX_FLOAT4_FORCE_INLINE float float4_w(float4_t _a) - { - return vgetq_lane_f32(_a, 3); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_ld(const void* _ptr) - { - return vld1q_f32( (const float32_t*)_ptr); - } - - BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_t _a) - { - vst1q_f32( (float32_t*)_ptr, _a); - } - - BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_t _a) - { - vst1q_lane_f32( (float32_t*)_ptr, _a, 0); - } - - BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_t _a) - { - vst1q_f32( (float32_t*)_ptr, _a); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w) - { - const float32_t val[4] = {_x, _y, _z, _w}; - return float4_ld(val); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) - { - const uint32_t val[4] = {_x, _y, _z, _w}; - const uint32x4_t tmp = vld1q_u32(val); - const float4_t result = vreinterpretq_f32_u32(tmp); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_splat(const void* _ptr) - { - const float4_t tmp0 = vld1q_f32( (const float32_t*)_ptr); - const float32x2_t tmp1 = vget_low_f32(tmp0); - const float4_t result = vdupq_lane_f32(tmp1, 0); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_splat(float _a) - { - return vdupq_n_f32(_a); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_isplat(uint32_t _a) - { - const int32x4_t tmp = vdupq_n_s32(_a); - const float4_t result = vreinterpretq_f32_s32(tmp); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_zero() - { - return float4_isplat(0); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_itof(float4_t _a) - { - const int32x4_t itof = vreinterpretq_s32_f32(_a); - const float4_t result = vcvtq_f32_s32(itof); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_ftoi(float4_t _a) - { - const int32x4_t ftoi = vcvtq_s32_f32(_a); - const float4_t result = vreinterpretq_f32_s32(ftoi); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_add(float4_t _a, float4_t _b) - { - return vaddq_f32(_a, _b); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_sub(float4_t _a, float4_t _b) - { - return vsubq_f32(_a, _b); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_mul(float4_t _a, float4_t _b) - { - return vmulq_f32(_a, _b); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_rcp_est(float4_t _a) - { - return vrecpeq_f32(_a); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_rsqrt_est(float4_t _a) - { - return vrsqrteq_f32(_a); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_cmpeq(float4_t _a, float4_t _b) - { - const uint32x4_t tmp = vceqq_f32(_a, _b); - const float4_t result = vreinterpretq_f32_u32(tmp); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_cmplt(float4_t _a, float4_t _b) - { - const uint32x4_t tmp = vcltq_f32(_a, _b); - const float4_t result = vreinterpretq_f32_u32(tmp); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_cmple(float4_t _a, float4_t _b) - { - const uint32x4_t tmp = vcleq_f32(_a, _b); - const float4_t result = vreinterpretq_f32_u32(tmp); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_cmpgt(float4_t _a, float4_t _b) - { - const uint32x4_t tmp = vcgtq_f32(_a, _b); - const float4_t result = vreinterpretq_f32_u32(tmp); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_cmpge(float4_t _a, float4_t _b) - { - const uint32x4_t tmp = vcgeq_f32(_a, _b); - const float4_t result = vreinterpretq_f32_u32(tmp); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_min(float4_t _a, float4_t _b) - { - return vminq_f32(_a, _b); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_max(float4_t _a, float4_t _b) - { - return vmaxq_f32(_a, _b); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_and(float4_t _a, float4_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = vandq_s32(tmp0, tmp1); - const float4_t result = vreinterpretq_f32_s32(tmp2); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_andc(float4_t _a, float4_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = vbicq_s32(tmp0, tmp1); - const float4_t result = vreinterpretq_f32_s32(tmp2); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_or(float4_t _a, float4_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = vorrq_s32(tmp0, tmp1); - const float4_t result = vreinterpretq_f32_s32(tmp2); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_xor(float4_t _a, float4_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = veorq_s32(tmp0, tmp1); - const float4_t result = vreinterpretq_f32_s32(tmp2); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_sll(float4_t _a, int _count) - { - if (__builtin_constant_p(_count) ) - { - const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); - const uint32x4_t tmp1 = vshlq_n_u32(tmp0, _count); - const float4_t result = vreinterpretq_f32_u32(tmp1); - - return result; - } - - const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); - const int32x4_t shift = vdupq_n_s32(_count); - const uint32x4_t tmp1 = vshlq_u32(tmp0, shift); - const float4_t result = vreinterpretq_f32_u32(tmp1); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_srl(float4_t _a, int _count) - { - if (__builtin_constant_p(_count) ) - { - const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); - const uint32x4_t tmp1 = vshrq_n_u32(tmp0, _count); - const float4_t result = vreinterpretq_f32_u32(tmp1); - - return result; - } - - const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); - const int32x4_t shift = vdupq_n_s32(-_count); - const uint32x4_t tmp1 = vshlq_u32(tmp0, shift); - const float4_t result = vreinterpretq_f32_u32(tmp1); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_sra(float4_t _a, int _count) - { - if (__builtin_constant_p(_count) ) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vshrq_n_s32(tmp0, _count); - const float4_t result = vreinterpretq_f32_s32(tmp1); - - return result; - } - - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t shift = vdupq_n_s32(-_count); - const int32x4_t tmp1 = vshlq_s32(tmp0, shift); - const float4_t result = vreinterpretq_f32_s32(tmp1); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_madd(float4_t _a, float4_t _b, float4_t _c) - { - return vmlaq_f32(_c, _a, _b); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_nmsub(float4_t _a, float4_t _b, float4_t _c) - { - return vmlsq_f32(_c, _a, _b); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_icmpeq(float4_t _a, float4_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const uint32x4_t tmp2 = vceqq_s32(tmp0, tmp1); - const float4_t result = vreinterpretq_f32_u32(tmp2); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_icmplt(float4_t _a, float4_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const uint32x4_t tmp2 = vcltq_s32(tmp0, tmp1); - const float4_t result = vreinterpretq_f32_u32(tmp2); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_icmpgt(float4_t _a, float4_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const uint32x4_t tmp2 = vcgtq_s32(tmp0, tmp1); - const float4_t result = vreinterpretq_f32_u32(tmp2); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_imin(float4_t _a, float4_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = vminq_s32(tmp0, tmp1); - const float4_t result = vreinterpretq_f32_s32(tmp2); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_imax(float4_t _a, float4_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = vmaxq_s32(tmp0, tmp1); - const float4_t result = vreinterpretq_f32_s32(tmp2); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_iadd(float4_t _a, float4_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = vaddq_s32(tmp0, tmp1); - const float4_t result = vreinterpretq_f32_s32(tmp2); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_isub(float4_t _a, float4_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = vsubq_s32(tmp0, tmp1); - const float4_t result = vreinterpretq_f32_s32(tmp2); - - return result; - } - -} // namespace bx - -#define float4_shuf_xAzC float4_shuf_xAzC_ni -#define float4_shuf_yBwD float4_shuf_yBwD_ni #define float4_rcp float4_rcp_ni #define float4_orx float4_orx_ni #define float4_orc float4_orc_ni @@ -480,45 +39,524 @@ IMPLEMENT_TEST(xyzw , xyzw); namespace bx { -#define IMPLEMENT_TEST(_xyzw, _swizzle) \ - BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_t _test) \ +#define ELEMx 0 +#define ELEMy 1 +#define ELEMz 2 +#define ELEMw 3 +#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ + template<> \ + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_swiz_##_x##_y##_z##_w(float4_neon_t _a) \ { \ - const float4_t tmp0 = float4_swiz_##_swizzle(_test); \ + return __builtin_shuffle(_a, (uint32x4_t){ ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w }); \ + } + +#include "float4_swizzle.inl" + +#undef IMPLEMENT_SWIZZLE +#undef ELEMw +#undef ELEMz +#undef ELEMy +#undef ELEMx + +#define IMPLEMENT_TEST(_xyzw, _swizzle) \ + template<> \ + BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_neon_t _test) \ + { \ + const float4_neon_t tmp0 = float4_swiz_##_swizzle(_test); \ return float4_test_any_ni(tmp0); \ } \ \ - BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_t _test) \ + template<> \ + BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_neon_t _test) \ { \ - const float4_t tmp0 = float4_swiz_##_swizzle(_test); \ + const float4_neon_t tmp0 = float4_swiz_##_swizzle(_test); \ return float4_test_all_ni(tmp0); \ } -IMPLEMENT_TEST(x , xxxx); -IMPLEMENT_TEST(y , yyyy); -IMPLEMENT_TEST(xy , xyyy); -IMPLEMENT_TEST(z , zzzz); -IMPLEMENT_TEST(xz , xzzz); -IMPLEMENT_TEST(yz , yzzz); -IMPLEMENT_TEST(xyz , xyzz); -IMPLEMENT_TEST(w , wwww); -IMPLEMENT_TEST(xw , xwww); -IMPLEMENT_TEST(yw , ywww); -IMPLEMENT_TEST(xyw , xyww); -IMPLEMENT_TEST(zw , zwww); -IMPLEMENT_TEST(xzw , xzww); -IMPLEMENT_TEST(yzw , yzww); +IMPLEMENT_TEST(x, xxxx); +IMPLEMENT_TEST(y, yyyy); +IMPLEMENT_TEST(xy, xyyy); +IMPLEMENT_TEST(z, zzzz); +IMPLEMENT_TEST(xz, xzzz); +IMPLEMENT_TEST(yz, yzzz); +IMPLEMENT_TEST(xyz, xyzz); +IMPLEMENT_TEST(w, wwww); +IMPLEMENT_TEST(xw, xwww); +IMPLEMENT_TEST(yw, ywww); +IMPLEMENT_TEST(xyw, xyww); +IMPLEMENT_TEST(zw, zwww); +IMPLEMENT_TEST(xzw, xzww); +IMPLEMENT_TEST(yzw, yzww); +#undef IMPLEMENT_TEST - BX_FLOAT4_FORCE_INLINE bool float4_test_any_xyzw(float4_t _test) + template<> + BX_FLOAT4_FORCE_INLINE bool float4_test_any_xyzw(float4_neon_t _test) { return float4_test_any_ni(_test); } - BX_FLOAT4_FORCE_INLINE bool float4_test_all_xyzw(float4_t _test) + template<> + BX_FLOAT4_FORCE_INLINE bool float4_test_all_xyzw(float4_neon_t _test) { return float4_test_all_ni(_test); } -#undef IMPLEMENT_TEST + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_xyAB(float4_neon_t _a, float4_neon_t _b) + { + return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 1, 4, 5 }); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_ABxy(float4_neon_t _a, float4_neon_t _b) + { + return __builtin_shuffle(_a, _b, (uint32x4_t){ 4, 5, 0, 1 }); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_CDzw(float4_neon_t _a, float4_neon_t _b) + { + return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 7, 2, 3 }); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_zwCD(float4_neon_t _a, float4_neon_t _b) + { + return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 3, 6, 7 }); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_xAyB(float4_neon_t _a, float4_neon_t _b) + { + return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 4, 1, 5 }); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_yBxA(float4_neon_t _a, float4_neon_t _b) + { + return __builtin_shuffle(_a, _b, (uint32x4_t){ 1, 5, 0, 4 }); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_zCwD(float4_neon_t _a, float4_neon_t _b) + { + return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 6, 3, 7 }); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_CzDw(float4_neon_t _a, float4_neon_t _b) + { + return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 2, 7, 3 }); + } + + template<> + BX_FLOAT4_FORCE_INLINE float float4_x(float4_neon_t _a) + { + return vgetq_lane_f32(_a, 0); + } + + template<> + BX_FLOAT4_FORCE_INLINE float float4_y(float4_neon_t _a) + { + return vgetq_lane_f32(_a, 1); + } + + template<> + BX_FLOAT4_FORCE_INLINE float float4_z(float4_neon_t _a) + { + return vgetq_lane_f32(_a, 2); + } + + template<> + BX_FLOAT4_FORCE_INLINE float float4_w(float4_neon_t _a) + { + return vgetq_lane_f32(_a, 3); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_ld(const void* _ptr) + { + return vld1q_f32( (const float32_t*)_ptr); + } + + template<> + BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_neon_t _a) + { + vst1q_f32( (float32_t*)_ptr, _a); + } + + template<> + BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_neon_t _a) + { + vst1q_lane_f32( (float32_t*)_ptr, _a, 0); + } + + template<> + BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_neon_t _a) + { + vst1q_f32( (float32_t*)_ptr, _a); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_ld(float _x, float _y, float _z, float _w) + { + const float32_t val[4] = {_x, _y, _z, _w}; + return float4_ld(val); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) + { + const uint32_t val[4] = {_x, _y, _z, _w}; + const uint32x4_t tmp = vld1q_u32(val); + const float4_neon_t result = vreinterpretq_f32_u32(tmp); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_splat(const void* _ptr) + { + const float4_neon_t tmp0 = vld1q_f32( (const float32_t*)_ptr); + const float32x2_t tmp1 = vget_low_f32(tmp0); + const float4_neon_t result = vdupq_lane_f32(tmp1, 0); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_splat(float _a) + { + return vdupq_n_f32(_a); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_isplat(uint32_t _a) + { + const int32x4_t tmp = vdupq_n_s32(_a); + const float4_neon_t result = vreinterpretq_f32_s32(tmp); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_zero() + { + return float4_isplat(0); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_itof(float4_neon_t _a) + { + const int32x4_t itof = vreinterpretq_s32_f32(_a); + const float4_neon_t result = vcvtq_f32_s32(itof); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_ftoi(float4_neon_t _a) + { + const int32x4_t ftoi = vcvtq_s32_f32(_a); + const float4_neon_t result = vreinterpretq_f32_s32(ftoi); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_add(float4_neon_t _a, float4_neon_t _b) + { + return vaddq_f32(_a, _b); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_sub(float4_neon_t _a, float4_neon_t _b) + { + return vsubq_f32(_a, _b); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_mul(float4_neon_t _a, float4_neon_t _b) + { + return vmulq_f32(_a, _b); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_rcp_est(float4_neon_t _a) + { + return vrecpeq_f32(_a); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_rsqrt_est(float4_neon_t _a) + { + return vrsqrteq_f32(_a); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_cmpeq(float4_neon_t _a, float4_neon_t _b) + { + const uint32x4_t tmp = vceqq_f32(_a, _b); + const float4_neon_t result = vreinterpretq_f32_u32(tmp); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_cmplt(float4_neon_t _a, float4_neon_t _b) + { + const uint32x4_t tmp = vcltq_f32(_a, _b); + const float4_neon_t result = vreinterpretq_f32_u32(tmp); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_cmple(float4_neon_t _a, float4_neon_t _b) + { + const uint32x4_t tmp = vcleq_f32(_a, _b); + const float4_neon_t result = vreinterpretq_f32_u32(tmp); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_cmpgt(float4_neon_t _a, float4_neon_t _b) + { + const uint32x4_t tmp = vcgtq_f32(_a, _b); + const float4_neon_t result = vreinterpretq_f32_u32(tmp); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_cmpge(float4_neon_t _a, float4_neon_t _b) + { + const uint32x4_t tmp = vcgeq_f32(_a, _b); + const float4_neon_t result = vreinterpretq_f32_u32(tmp); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_min(float4_neon_t _a, float4_neon_t _b) + { + return vminq_f32(_a, _b); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_max(float4_neon_t _a, float4_neon_t _b) + { + return vmaxq_f32(_a, _b); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_and(float4_neon_t _a, float4_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const int32x4_t tmp2 = vandq_s32(tmp0, tmp1); + const float4_neon_t result = vreinterpretq_f32_s32(tmp2); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_andc(float4_neon_t _a, float4_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const int32x4_t tmp2 = vbicq_s32(tmp0, tmp1); + const float4_neon_t result = vreinterpretq_f32_s32(tmp2); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_or(float4_neon_t _a, float4_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const int32x4_t tmp2 = vorrq_s32(tmp0, tmp1); + const float4_neon_t result = vreinterpretq_f32_s32(tmp2); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_xor(float4_neon_t _a, float4_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const int32x4_t tmp2 = veorq_s32(tmp0, tmp1); + const float4_neon_t result = vreinterpretq_f32_s32(tmp2); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_sll(float4_neon_t _a, int _count) + { + if (__builtin_constant_p(_count) ) + { + const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); + const uint32x4_t tmp1 = vshlq_n_u32(tmp0, _count); + const float4_neon_t result = vreinterpretq_f32_u32(tmp1); + + return result; + } + + const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); + const int32x4_t shift = vdupq_n_s32(_count); + const uint32x4_t tmp1 = vshlq_u32(tmp0, shift); + const float4_neon_t result = vreinterpretq_f32_u32(tmp1); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_srl(float4_neon_t _a, int _count) + { + if (__builtin_constant_p(_count) ) + { + const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); + const uint32x4_t tmp1 = vshrq_n_u32(tmp0, _count); + const float4_neon_t result = vreinterpretq_f32_u32(tmp1); + + return result; + } + + const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); + const int32x4_t shift = vdupq_n_s32(-_count); + const uint32x4_t tmp1 = vshlq_u32(tmp0, shift); + const float4_neon_t result = vreinterpretq_f32_u32(tmp1); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_sra(float4_neon_t _a, int _count) + { + if (__builtin_constant_p(_count) ) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vshrq_n_s32(tmp0, _count); + const float4_neon_t result = vreinterpretq_f32_s32(tmp1); + + return result; + } + + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t shift = vdupq_n_s32(-_count); + const int32x4_t tmp1 = vshlq_s32(tmp0, shift); + const float4_neon_t result = vreinterpretq_f32_s32(tmp1); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_madd(float4_neon_t _a, float4_neon_t _b, float4_neon_t _c) + { + return vmlaq_f32(_c, _a, _b); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_nmsub(float4_neon_t _a, float4_neon_t _b, float4_neon_t _c) + { + return vmlsq_f32(_c, _a, _b); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_icmpeq(float4_neon_t _a, float4_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const uint32x4_t tmp2 = vceqq_s32(tmp0, tmp1); + const float4_neon_t result = vreinterpretq_f32_u32(tmp2); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_icmplt(float4_neon_t _a, float4_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const uint32x4_t tmp2 = vcltq_s32(tmp0, tmp1); + const float4_neon_t result = vreinterpretq_f32_u32(tmp2); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_icmpgt(float4_neon_t _a, float4_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const uint32x4_t tmp2 = vcgtq_s32(tmp0, tmp1); + const float4_neon_t result = vreinterpretq_f32_u32(tmp2); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_imin(float4_neon_t _a, float4_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const int32x4_t tmp2 = vminq_s32(tmp0, tmp1); + const float4_neon_t result = vreinterpretq_f32_s32(tmp2); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_imax(float4_neon_t _a, float4_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const int32x4_t tmp2 = vmaxq_s32(tmp0, tmp1); + const float4_neon_t result = vreinterpretq_f32_s32(tmp2); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_iadd(float4_neon_t _a, float4_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const int32x4_t tmp2 = vaddq_s32(tmp0, tmp1); + const float4_neon_t result = vreinterpretq_f32_s32(tmp2); + + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_neon_t float4_isub(float4_neon_t _a, float4_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const int32x4_t tmp2 = vsubq_s32(tmp0, tmp1); + const float4_neon_t result = vreinterpretq_f32_s32(tmp2); + + return result; + } + + template<> + BX_FLOAT4_INLINE float4_neon_t float4_shuf_xAzC(float4_neon_t _a, float4_neon_t _b) + { + return float4_shuf_xAzC_ni(_a, _b); + } + + template<> + BX_FLOAT4_INLINE float4_neon_t float4_shuf_yBwD(float4_neon_t _a, float4_neon_t _b) + { + return float4_shuf_yBwD_ni(_a, _b); + } + + typedef float4_neon_t float4_t; + } // namespace bx #endif // BX_FLOAT4_NEON_H_HEADER_GUARD diff --git a/include/bx/float4_ni.h b/include/bx/float4_ni.h index 644fa6e..a0dc65f 100644 --- a/include/bx/float4_ni.h +++ b/include/bx/float4_ni.h @@ -8,306 +8,334 @@ namespace bx { - BX_FLOAT4_INLINE float4_t float4_rcp_ni(float4_t _a); - - BX_FLOAT4_INLINE float4_t float4_shuf_xAzC_ni(float4_t _a, float4_t _b) + template + BX_FLOAT4_INLINE Ty float4_shuf_xAzC_ni(Ty _a, Ty _b) { - const float4_t xAyB = float4_shuf_xAyB(_a, _b); - const float4_t zCwD = float4_shuf_zCwD(_a, _b); - const float4_t result = float4_shuf_xyAB(xAyB, zCwD); + const Ty xAyB = float4_shuf_xAyB(_a, _b); + const Ty zCwD = float4_shuf_zCwD(_a, _b); + const Ty result = float4_shuf_xyAB(xAyB, zCwD); return result; } - BX_FLOAT4_INLINE float4_t float4_shuf_yBwD_ni(float4_t _a, float4_t _b) + template + BX_FLOAT4_INLINE Ty float4_shuf_yBwD_ni(Ty _a, Ty _b) { - const float4_t xAyB = float4_shuf_xAyB(_a, _b); - const float4_t zCwD = float4_shuf_zCwD(_a, _b); - const float4_t result = float4_shuf_zwCD(xAyB, zCwD); + const Ty xAyB = float4_shuf_xAyB(_a, _b); + const Ty zCwD = float4_shuf_zCwD(_a, _b); + const Ty result = float4_shuf_zwCD(xAyB, zCwD); return result; } - BX_FLOAT4_INLINE float4_t float4_madd_ni(float4_t _a, float4_t _b, float4_t _c) + template + BX_FLOAT4_INLINE Ty float4_madd_ni(Ty _a, Ty _b, Ty _c) { - const float4_t mul = float4_mul(_a, _b); - const float4_t result = float4_add(mul, _c); + const Ty mul = float4_mul(_a, _b); + const Ty result = float4_add(mul, _c); return result; } - BX_FLOAT4_INLINE float4_t float4_nmsub_ni(float4_t _a, float4_t _b, float4_t _c) + template + BX_FLOAT4_INLINE Ty float4_nmsub_ni(Ty _a, Ty _b, Ty _c) { - const float4_t mul = float4_mul(_a, _b); - const float4_t result = float4_sub(_c, mul); + const Ty mul = float4_mul(_a, _b); + const Ty result = float4_sub(_c, mul); return result; } - BX_FLOAT4_INLINE float4_t float4_div_nr_ni(float4_t _a, float4_t _b) + template + BX_FLOAT4_INLINE Ty float4_div_nr_ni(Ty _a, Ty _b) { - const float4_t oneish = float4_isplat(0x3f800001); - const float4_t est = float4_rcp_est(_b); - const float4_t iter0 = float4_mul(_a, est); - const float4_t tmp1 = float4_nmsub(_b, est, oneish); - const float4_t result = float4_madd(tmp1, iter0, iter0); + const Ty oneish = float4_isplat(0x3f800001); + const Ty est = float4_rcp_est(_b); + const Ty iter0 = float4_mul(_a, est); + const Ty tmp1 = float4_nmsub(_b, est, oneish); + const Ty result = float4_madd(tmp1, iter0, iter0); return result; } - BX_FLOAT4_INLINE float4_t float4_rcp_ni(float4_t _a) + template + BX_FLOAT4_INLINE Ty float4_rcp_ni(Ty _a) { - const float4_t one = float4_splat(1.0f); - const float4_t result = float4_div(one, _a); + const Ty one = float4_splat(1.0f); + const Ty result = float4_div(one, _a); return result; } - BX_FLOAT4_INLINE float4_t float4_orx_ni(float4_t _a) + template + BX_FLOAT4_INLINE Ty float4_orx_ni(Ty _a) { - const float4_t zwxy = float4_swiz_zwxy(_a); - const float4_t tmp0 = float4_or(_a, zwxy); - const float4_t tmp1 = float4_swiz_yyyy(_a); - const float4_t tmp2 = float4_or(tmp0, tmp1); - const float4_t mf000 = float4_ild(UINT32_MAX, 0, 0, 0); - const float4_t result = float4_and(tmp2, mf000); + const Ty zwxy = float4_swiz_zwxy(_a); + const Ty tmp0 = float4_or(_a, zwxy); + const Ty tmp1 = float4_swiz_yyyy(_a); + const Ty tmp2 = float4_or(tmp0, tmp1); + const Ty mf000 = float4_ild(UINT32_MAX, 0, 0, 0); + const Ty result = float4_and(tmp2, mf000); return result; } - BX_FLOAT4_INLINE float4_t float4_orc_ni(float4_t _a, float4_t _b) + template + BX_FLOAT4_INLINE Ty float4_orc_ni(Ty _a, Ty _b) { - const float4_t aorb = float4_or(_a, _b); - const float4_t mffff = float4_isplat(UINT32_MAX); - const float4_t result = float4_xor(aorb, mffff); + const Ty aorb = float4_or(_a, _b); + const Ty mffff = float4_isplat(UINT32_MAX); + const Ty result = float4_xor(aorb, mffff); return result; } - BX_FLOAT4_INLINE float4_t float4_neg_ni(float4_t _a) + template + BX_FLOAT4_INLINE Ty float4_neg_ni(Ty _a) { - const float4_t zero = float4_zero(); - const float4_t result = float4_sub(zero, _a); + const Ty zero = float4_zero(); + const Ty result = float4_sub(zero, _a); return result; } - BX_FLOAT4_INLINE float4_t float4_selb_ni(float4_t _mask, float4_t _a, float4_t _b) + template + BX_FLOAT4_INLINE Ty float4_selb_ni(Ty _mask, Ty _a, Ty _b) { - const float4_t sel_a = float4_and(_a, _mask); - const float4_t sel_b = float4_andc(_b, _mask); - const float4_t result = float4_or(sel_a, sel_b); + const Ty sel_a = float4_and(_a, _mask); + const Ty sel_b = float4_andc(_b, _mask); + const Ty result = float4_or(sel_a, sel_b); return result; } - BX_FLOAT4_INLINE float4_t float4_sels_ni(float4_t _test, float4_t _a, float4_t _b) + template + BX_FLOAT4_INLINE Ty float4_sels_ni(Ty _test, Ty _a, Ty _b) { - const float4_t mask = float4_sra(_test, 31); - const float4_t result = float4_selb(mask, _a, _b); + const Ty mask = float4_sra(_test, 31); + const Ty result = float4_selb(mask, _a, _b); return result; } - BX_FLOAT4_INLINE float4_t float4_not_ni(float4_t _a) + template + BX_FLOAT4_INLINE Ty float4_not_ni(Ty _a) { - const float4_t mffff = float4_isplat(UINT32_MAX); - const float4_t result = float4_xor(_a, mffff); + const Ty mffff = float4_isplat(UINT32_MAX); + const Ty result = float4_xor(_a, mffff); return result; } - BX_FLOAT4_INLINE float4_t float4_min_ni(float4_t _a, float4_t _b) + template + BX_FLOAT4_INLINE Ty float4_min_ni(Ty _a, Ty _b) { - const float4_t mask = float4_cmplt(_a, _b); - const float4_t result = float4_selb(mask, _a, _b); + const Ty mask = float4_cmplt(_a, _b); + const Ty result = float4_selb(mask, _a, _b); return result; } - BX_FLOAT4_INLINE float4_t float4_max_ni(float4_t _a, float4_t _b) + template + BX_FLOAT4_INLINE Ty float4_max_ni(Ty _a, Ty _b) { - const float4_t mask = float4_cmpgt(_a, _b); - const float4_t result = float4_selb(mask, _a, _b); + const Ty mask = float4_cmpgt(_a, _b); + const Ty result = float4_selb(mask, _a, _b); return result; } - BX_FLOAT4_INLINE float4_t float4_abs_ni(float4_t _a) + template + BX_FLOAT4_INLINE Ty float4_abs_ni(Ty _a) { - const float4_t a_neg = float4_neg(_a); - const float4_t result = float4_max(a_neg, _a); + const Ty a_neg = float4_neg(_a); + const Ty result = float4_max(a_neg, _a); return result; } - BX_FLOAT4_INLINE float4_t float4_imin_ni(float4_t _a, float4_t _b) + template + BX_FLOAT4_INLINE Ty float4_imin_ni(Ty _a, Ty _b) { - const float4_t mask = float4_icmplt(_a, _b); - const float4_t result = float4_selb(mask, _a, _b); + const Ty mask = float4_icmplt(_a, _b); + const Ty result = float4_selb(mask, _a, _b); return result; } - BX_FLOAT4_INLINE float4_t float4_imax_ni(float4_t _a, float4_t _b) + template + BX_FLOAT4_INLINE Ty float4_imax_ni(Ty _a, Ty _b) { - const float4_t mask = float4_icmpgt(_a, _b); - const float4_t result = float4_selb(mask, _a, _b); + const Ty mask = float4_icmpgt(_a, _b); + const Ty result = float4_selb(mask, _a, _b); return result; } - BX_FLOAT4_INLINE float4_t float4_clamp_ni(float4_t _a, float4_t _min, float4_t _max) + template + BX_FLOAT4_INLINE Ty float4_clamp_ni(Ty _a, Ty _min, Ty _max) { - const float4_t tmp = float4_min(_a, _max); - const float4_t result = float4_max(tmp, _min); + const Ty tmp = float4_min(_a, _max); + const Ty result = float4_max(tmp, _min); return result; } - BX_FLOAT4_INLINE float4_t float4_lerp_ni(float4_t _a, float4_t _b, float4_t _s) + template + BX_FLOAT4_INLINE Ty float4_lerp_ni(Ty _a, Ty _b, Ty _s) { - const float4_t ba = float4_sub(_b, _a); - const float4_t result = float4_madd(_s, ba, _a); + const Ty ba = float4_sub(_b, _a); + const Ty result = float4_madd(_s, ba, _a); return result; } - BX_FLOAT4_INLINE float4_t float4_sqrt_nr_ni(float4_t _a) + template + BX_FLOAT4_INLINE Ty float4_sqrt_nr_ni(Ty _a) { - const float4_t half = float4_splat(0.5f); - const float4_t one = float4_splat(1.0f); - const float4_t tmp0 = float4_rsqrt_est(_a); - const float4_t tmp1 = float4_mul(tmp0, _a); - const float4_t tmp2 = float4_mul(tmp1, half); - const float4_t tmp3 = float4_nmsub(tmp0, tmp1, one); - const float4_t result = float4_madd(tmp3, tmp2, tmp1); + const Ty half = float4_splat(0.5f); + const Ty one = float4_splat(1.0f); + const Ty tmp0 = float4_rsqrt_est(_a); + const Ty tmp1 = float4_mul(tmp0, _a); + const Ty tmp2 = float4_mul(tmp1, half); + const Ty tmp3 = float4_nmsub(tmp0, tmp1, one); + const Ty result = float4_madd(tmp3, tmp2, tmp1); return result; } - BX_FLOAT4_INLINE float4_t float4_sqrt_nr1_ni(float4_t _a) + template + BX_FLOAT4_INLINE Ty float4_sqrt_nr1_ni(Ty _a) { - const float4_t half = float4_splat(0.5f); + const Ty half = float4_splat(0.5f); - float4_t result = _a; + Ty result = _a; for (uint32_t ii = 0; ii < 11; ++ii) { - const float4_t tmp1 = float4_div(_a, result); - const float4_t tmp2 = float4_add(tmp1, result); + const Ty tmp1 = float4_div(_a, result); + const Ty tmp2 = float4_add(tmp1, result); result = float4_mul(tmp2, half); } return result; } - BX_FLOAT4_INLINE float4_t float4_rsqrt_ni(float4_t _a) + template + BX_FLOAT4_INLINE Ty float4_rsqrt_ni(Ty _a) { - const float4_t one = float4_splat(1.0f); - const float4_t sqrt = float4_sqrt(_a); - const float4_t result = float4_div(one, sqrt); + const Ty one = float4_splat(1.0f); + const Ty sqrt = float4_sqrt(_a); + const Ty result = float4_div(one, sqrt); return result; } - BX_FLOAT4_INLINE float4_t float4_rsqrt_nr_ni(float4_t _a) + template + BX_FLOAT4_INLINE Ty float4_rsqrt_nr_ni(Ty _a) { - const float4_t rsqrt = float4_rsqrt_est(_a); - const float4_t iter0 = float4_mul(_a, rsqrt); - const float4_t iter1 = float4_mul(iter0, rsqrt); - const float4_t half = float4_splat(0.5f); - const float4_t half_rsqrt = float4_mul(half, rsqrt); - const float4_t three = float4_splat(3.0f); - const float4_t three_sub_iter1 = float4_sub(three, iter1); - const float4_t result = float4_mul(half_rsqrt, three_sub_iter1); + const Ty rsqrt = float4_rsqrt_est(_a); + const Ty iter0 = float4_mul(_a, rsqrt); + const Ty iter1 = float4_mul(iter0, rsqrt); + const Ty half = float4_splat(0.5f); + const Ty half_rsqrt = float4_mul(half, rsqrt); + const Ty three = float4_splat(3.0f); + const Ty three_sub_iter1 = float4_sub(three, iter1); + const Ty result = float4_mul(half_rsqrt, three_sub_iter1); return result; } - BX_FLOAT4_INLINE float4_t float4_rsqrt_carmack_ni(float4_t _a) + template + BX_FLOAT4_INLINE Ty float4_rsqrt_carmack_ni(Ty _a) { - const float4_t half = float4_splat(0.5f); - const float4_t ah = float4_mul(half, _a); - const float4_t ashift = float4_sra(_a, 1); - const float4_t magic = float4_isplat(0x5f3759df); - const float4_t msuba = float4_isub(magic, ashift); - const float4_t msubasq = float4_mul(msuba, msuba); - const float4_t tmp0 = float4_splat(1.5f); - const float4_t tmp1 = float4_mul(ah, msubasq); - const float4_t tmp2 = float4_sub(tmp0, tmp1); - const float4_t result = float4_mul(msuba, tmp2); + const Ty half = float4_splat(0.5f); + const Ty ah = float4_mul(half, _a); + const Ty ashift = float4_sra(_a, 1); + const Ty magic = float4_isplat(0x5f3759df); + const Ty msuba = float4_isub(magic, ashift); + const Ty msubasq = float4_mul(msuba, msuba); + const Ty tmp0 = float4_splat(1.5f); + const Ty tmp1 = float4_mul(ah, msubasq); + const Ty tmp2 = float4_sub(tmp0, tmp1); + const Ty result = float4_mul(msuba, tmp2); return result; } namespace float4_logexp_detail { - BX_FLOAT4_INLINE float4_t float4_poly1(float4_t _a, float _b, float _c) + template + BX_FLOAT4_INLINE Ty float4_poly1(Ty _a, float _b, float _c) { - const float4_t bbbb = float4_splat(_b); - const float4_t cccc = float4_splat(_c); - const float4_t result = float4_madd(cccc, _a, bbbb); + const Ty bbbb = float4_splat(_b); + const Ty cccc = float4_splat(_c); + const Ty result = float4_madd(cccc, _a, bbbb); return result; } - BX_FLOAT4_INLINE float4_t float4_poly2(float4_t _a, float _b, float _c, float _d) + template + BX_FLOAT4_INLINE Ty float4_poly2(Ty _a, float _b, float _c, float _d) { - const float4_t bbbb = float4_splat(_b); - const float4_t poly = float4_poly1(_a, _c, _d); - const float4_t result = float4_madd(poly, _a, bbbb); + const Ty bbbb = float4_splat(_b); + const Ty poly = float4_poly1(_a, _c, _d); + const Ty result = float4_madd(poly, _a, bbbb); return result; } - BX_FLOAT4_INLINE float4_t float4_poly3(float4_t _a, float _b, float _c, float _d, float _e) + template + BX_FLOAT4_INLINE Ty float4_poly3(Ty _a, float _b, float _c, float _d, float _e) { - const float4_t bbbb = float4_splat(_b); - const float4_t poly = float4_poly2(_a, _c, _d, _e); - const float4_t result = float4_madd(poly, _a, bbbb); + const Ty bbbb = float4_splat(_b); + const Ty poly = float4_poly2(_a, _c, _d, _e); + const Ty result = float4_madd(poly, _a, bbbb); return result; } - BX_FLOAT4_INLINE float4_t float4_poly4(float4_t _a, float _b, float _c, float _d, float _e, float _f) + template + BX_FLOAT4_INLINE Ty float4_poly4(Ty _a, float _b, float _c, float _d, float _e, float _f) { - const float4_t bbbb = float4_splat(_b); - const float4_t poly = float4_poly3(_a, _c, _d, _e, _f); - const float4_t result = float4_madd(poly, _a, bbbb); + const Ty bbbb = float4_splat(_b); + const Ty poly = float4_poly3(_a, _c, _d, _e, _f); + const Ty result = float4_madd(poly, _a, bbbb); return result; } - BX_FLOAT4_INLINE float4_t float4_poly5(float4_t _a, float _b, float _c, float _d, float _e, float _f, float _g) + template + BX_FLOAT4_INLINE Ty float4_poly5(Ty _a, float _b, float _c, float _d, float _e, float _f, float _g) { - const float4_t bbbb = float4_splat(_b); - const float4_t poly = float4_poly4(_a, _c, _d, _e, _f, _g); - const float4_t result = float4_madd(poly, _a, bbbb); + const Ty bbbb = float4_splat(_b); + const Ty poly = float4_poly4(_a, _c, _d, _e, _f, _g); + const Ty result = float4_madd(poly, _a, bbbb); return result; } - BX_FLOAT4_INLINE float4_t float4_logpoly(float4_t _a) + template + BX_FLOAT4_INLINE Ty float4_logpoly(Ty _a) { #if 1 - const float4_t result = float4_poly5(_a + const Ty result = float4_poly5(_a , 3.11578814719469302614f, -3.32419399085241980044f , 2.59883907202499966007f, -1.23152682416275988241f , 0.318212422185251071475f, -0.0344359067839062357313f ); #elif 0 - const float4_t result = float4_poly4(_a + const Ty result = float4_poly4(_a , 2.8882704548164776201f, -2.52074962577807006663f , 1.48116647521213171641f, -0.465725644288844778798f , 0.0596515482674574969533f ); #elif 0 - const float4_t result = float4_poly3(_a + const Ty result = float4_poly3(_a , 2.61761038894603480148f, -1.75647175389045657003f , 0.688243882994381274313f, -0.107254423828329604454f ); #else - const float4_t result = float4_poly2(_a + const Ty result = float4_poly2(_a , 2.28330284476918490682f, -1.04913055217340124191f , 0.204446009836232697516f ); @@ -316,27 +344,28 @@ namespace bx return result; } - BX_FLOAT4_INLINE float4_t float4_exppoly(float4_t _a) + template + BX_FLOAT4_INLINE Ty float4_exppoly(Ty _a) { #if 1 - const float4_t result = float4_poly5(_a + const Ty result = float4_poly5(_a , 9.9999994e-1f, 6.9315308e-1f , 2.4015361e-1f, 5.5826318e-2f , 8.9893397e-3f, 1.8775767e-3f ); #elif 0 - const float4_t result = float4_poly4(_a + const Ty result = float4_poly4(_a , 1.0000026f, 6.9300383e-1f , 2.4144275e-1f, 5.2011464e-2f , 1.3534167e-2f ); #elif 0 - const float4_t result = float4_poly3(_a + const Ty result = float4_poly3(_a , 9.9992520e-1f, 6.9583356e-1f , 2.2606716e-1f, 7.8024521e-2f ); #else - const float4_t result = float4_poly2(_a + const Ty result = float4_poly2(_a , 1.0017247f, 6.5763628e-1f , 3.3718944e-1f ); @@ -346,159 +375,179 @@ namespace bx } } // namespace float4_internal - BX_FLOAT4_INLINE float4_t float4_log2_ni(float4_t _a) + template + BX_FLOAT4_INLINE Ty float4_log2_ni(Ty _a) { - const float4_t expmask = float4_isplat(0x7f800000); - const float4_t mantmask = float4_isplat(0x007fffff); - const float4_t one = float4_splat(1.0f); + const Ty expmask = float4_isplat(0x7f800000); + const Ty mantmask = float4_isplat(0x007fffff); + const Ty one = float4_splat(1.0f); - const float4_t c127 = float4_isplat(127); - const float4_t aexp = float4_and(_a, expmask); - const float4_t aexpsr = float4_srl(aexp, 23); - const float4_t tmp0 = float4_isub(aexpsr, c127); - const float4_t exp = float4_itof(tmp0); + const Ty c127 = float4_isplat(127); + const Ty aexp = float4_and(_a, expmask); + const Ty aexpsr = float4_srl(aexp, 23); + const Ty tmp0 = float4_isub(aexpsr, c127); + const Ty exp = float4_itof(tmp0); - const float4_t amask = float4_and(_a, mantmask); - const float4_t mant = float4_or(amask, one); + const Ty amask = float4_and(_a, mantmask); + const Ty mant = float4_or(amask, one); - const float4_t poly = float4_logexp_detail::float4_logpoly(mant); + const Ty poly = float4_logexp_detail::float4_logpoly(mant); - const float4_t mandiff = float4_sub(mant, one); - const float4_t result = float4_madd(poly, mandiff, exp); + const Ty mandiff = float4_sub(mant, one); + const Ty result = float4_madd(poly, mandiff, exp); return result; } - BX_FLOAT4_INLINE float4_t float4_exp2_ni(float4_t _a) + template + BX_FLOAT4_INLINE Ty float4_exp2_ni(Ty _a) { - const float4_t min = float4_splat( 129.0f); - const float4_t max = float4_splat(-126.99999f); - const float4_t tmp0 = float4_min(_a, min); - const float4_t aaaa = float4_max(tmp0, max); + const Ty min = float4_splat( 129.0f); + const Ty max = float4_splat(-126.99999f); + const Ty tmp0 = float4_min(_a, min); + const Ty aaaa = float4_max(tmp0, max); - const float4_t half = float4_splat(0.5f); - const float4_t tmp2 = float4_sub(aaaa, half); - const float4_t ipart = float4_ftoi(tmp2); - const float4_t iround = float4_itof(ipart); - const float4_t fpart = float4_sub(aaaa, iround); + const Ty half = float4_splat(0.5f); + const Ty tmp2 = float4_sub(aaaa, half); + const Ty ipart = float4_ftoi(tmp2); + const Ty iround = float4_itof(ipart); + const Ty fpart = float4_sub(aaaa, iround); - const float4_t c127 = float4_isplat(127); - const float4_t tmp5 = float4_iadd(ipart, c127); - const float4_t expipart = float4_sll(tmp5, 23); + const Ty c127 = float4_isplat(127); + const Ty tmp5 = float4_iadd(ipart, c127); + const Ty expipart = float4_sll(tmp5, 23); - const float4_t expfpart = float4_logexp_detail::float4_exppoly(fpart); + const Ty expfpart = float4_logexp_detail::float4_exppoly(fpart); - const float4_t result = float4_mul(expipart, expfpart); + const Ty result = float4_mul(expipart, expfpart); return result; } - BX_FLOAT4_INLINE float4_t float4_pow_ni(float4_t _a, float4_t _b) + template + BX_FLOAT4_INLINE Ty float4_pow_ni(Ty _a, Ty _b) { - const float4_t alog2 = float4_log2(_a); - const float4_t alog2b = float4_mul(alog2, _b); - const float4_t result = float4_exp2(alog2b); + const Ty alog2 = float4_log2(_a); + const Ty alog2b = float4_mul(alog2, _b); + const Ty result = float4_exp2(alog2b); return result; } - BX_FLOAT4_INLINE float4_t float4_dot3_ni(float4_t _a, float4_t _b) + template + BX_FLOAT4_INLINE Ty float4_dot3_ni(Ty _a, Ty _b) { - const float4_t xyzw = float4_mul(_a, _b); - const float4_t xxxx = float4_swiz_xxxx(xyzw); - const float4_t yyyy = float4_swiz_yyyy(xyzw); - const float4_t zzzz = float4_swiz_zzzz(xyzw); - const float4_t tmp1 = float4_add(xxxx, yyyy); - const float4_t result = float4_add(zzzz, tmp1); + const Ty xyzw = float4_mul(_a, _b); + const Ty xxxx = float4_swiz_xxxx(xyzw); + const Ty yyyy = float4_swiz_yyyy(xyzw); + const Ty zzzz = float4_swiz_zzzz(xyzw); + const Ty tmp1 = float4_add(xxxx, yyyy); + const Ty result = float4_add(zzzz, tmp1); return result; } - BX_FLOAT4_INLINE float4_t float4_cross3_ni(float4_t _a, float4_t _b) + template + BX_FLOAT4_INLINE Ty float4_cross3_ni(Ty _a, Ty _b) { // a.yzx * b.zxy - a.zxy * b.yzx == (a * b.yzx - a.yzx * b).yzx #if 0 - const float4_t a_yzxw = float4_swiz_yzxw(_a); - const float4_t a_zxyw = float4_swiz_zxyw(_a); - const float4_t b_zxyw = float4_swiz_zxyw(_b); - const float4_t b_yzxw = float4_swiz_yzxw(_b); - const float4_t tmp = float4_mul(a_yzxw, b_zxyw); - const float4_t result = float4_nmsub(a_zxyw, b_yzxw, tmp); + const Ty a_yzxw = float4_swiz_yzxw(_a); + const Ty a_zxyw = float4_swiz_zxyw(_a); + const Ty b_zxyw = float4_swiz_zxyw(_b); + const Ty b_yzxw = float4_swiz_yzxw(_b); + const Ty tmp = float4_mul(a_yzxw, b_zxyw); + const Ty result = float4_nmsub(a_zxyw, b_yzxw, tmp); #else - const float4_t a_yzxw = float4_swiz_yzxw(_a); - const float4_t b_yzxw = float4_swiz_yzxw(_b); - const float4_t tmp0 = float4_mul(_a, b_yzxw); - const float4_t tmp1 = float4_nmsub(a_yzxw, _b, tmp0); - const float4_t result = float4_swiz_yzxw(tmp1); + const Ty a_yzxw = float4_swiz_yzxw(_a); + const Ty b_yzxw = float4_swiz_yzxw(_b); + const Ty tmp0 = float4_mul(_a, b_yzxw); + const Ty tmp1 = float4_nmsub(a_yzxw, _b, tmp0); + const Ty result = float4_swiz_yzxw(tmp1); #endif return result; } - BX_FLOAT4_INLINE float4_t float4_normalize3_ni(float4_t _a) + template + BX_FLOAT4_INLINE Ty float4_normalize3_ni(Ty _a) { - const float4_t dot3 = float4_dot3(_a, _a); - const float4_t invSqrt = float4_rsqrt(dot3); - const float4_t result = float4_mul(_a, invSqrt); + const Ty dot3 = float4_dot3(_a, _a); + const Ty invSqrt = float4_rsqrt(dot3); + const Ty result = float4_mul(_a, invSqrt); return result; } - BX_FLOAT4_INLINE float4_t float4_dot_ni(float4_t _a, float4_t _b) + template + BX_FLOAT4_INLINE Ty float4_dot_ni(Ty _a, Ty _b) { - const float4_t xyzw = float4_mul(_a, _b); - const float4_t yzwx = float4_swiz_yzwx(xyzw); - const float4_t tmp0 = float4_add(xyzw, yzwx); - const float4_t zwxy = float4_swiz_zwxy(tmp0); - const float4_t result = float4_add(tmp0, zwxy); + const Ty xyzw = float4_mul(_a, _b); + const Ty yzwx = float4_swiz_yzwx(xyzw); + const Ty tmp0 = float4_add(xyzw, yzwx); + const Ty zwxy = float4_swiz_zwxy(tmp0); + const Ty result = float4_add(tmp0, zwxy); return result; } - BX_FLOAT4_INLINE float4_t float4_ceil_ni(float4_t _a) + template + BX_FLOAT4_INLINE Ty float4_ceil_ni(Ty _a) { - const float4_t tmp0 = float4_ftoi(_a); - const float4_t tmp1 = float4_itof(tmp0); - const float4_t mask = float4_cmplt(tmp1, _a); - const float4_t one = float4_splat(1.0f); - const float4_t tmp2 = float4_and(one, mask); - const float4_t result = float4_add(tmp1, tmp2); + const Ty tmp0 = float4_ftoi(_a); + const Ty tmp1 = float4_itof(tmp0); + const Ty mask = float4_cmplt(tmp1, _a); + const Ty one = float4_splat(1.0f); + const Ty tmp2 = float4_and(one, mask); + const Ty result = float4_add(tmp1, tmp2); return result; } - BX_FLOAT4_INLINE float4_t float4_floor_ni(float4_t _a) + template + BX_FLOAT4_INLINE Ty float4_floor_ni(Ty _a) { - const float4_t tmp0 = float4_ftoi(_a); - const float4_t tmp1 = float4_itof(tmp0); - const float4_t mask = float4_cmpgt(tmp1, _a); - const float4_t one = float4_splat(1.0f); - const float4_t tmp2 = float4_and(one, mask); - const float4_t result = float4_sub(tmp1, tmp2); + const Ty tmp0 = float4_ftoi(_a); + const Ty tmp1 = float4_itof(tmp0); + const Ty mask = float4_cmpgt(tmp1, _a); + const Ty one = float4_splat(1.0f); + const Ty tmp2 = float4_and(one, mask); + const Ty result = float4_sub(tmp1, tmp2); return result; } - BX_FLOAT4_INLINE bool float4_test_any_ni(float4_t _a) + template + BX_FLOAT4_FORCE_INLINE Ty float4_round_ni(Ty _a) { - const float4_t mask = float4_sra(_a, 31); - const float4_t zwxy = float4_swiz_zwxy(mask); - const float4_t tmp0 = float4_or(mask, zwxy); - const float4_t tmp1 = float4_swiz_yyyy(tmp0); - const float4_t tmp2 = float4_or(tmp0, tmp1); + const Ty tmp = float4_ftoi(_a); + const Ty result = float4_itof(tmp); + + return result; + } + + template + BX_FLOAT4_INLINE bool float4_test_any_ni(Ty _a) + { + const Ty mask = float4_sra(_a, 31); + const Ty zwxy = float4_swiz_zwxy(mask); + const Ty tmp0 = float4_or(mask, zwxy); + const Ty tmp1 = float4_swiz_yyyy(tmp0); + const Ty tmp2 = float4_or(tmp0, tmp1); int res; float4_stx(&res, tmp2); return 0 != res; } - BX_FLOAT4_INLINE bool float4_test_all_ni(float4_t _a) + template + BX_FLOAT4_INLINE bool float4_test_all_ni(Ty _a) { - const float4_t bits = float4_sra(_a, 31); - const float4_t m1248 = float4_ild(1, 2, 4, 8); - const float4_t mask = float4_and(bits, m1248); - const float4_t zwxy = float4_swiz_zwxy(mask); - const float4_t tmp0 = float4_or(mask, zwxy); - const float4_t tmp1 = float4_swiz_yyyy(tmp0); - const float4_t tmp2 = float4_or(tmp0, tmp1); + const Ty bits = float4_sra(_a, 31); + const Ty m1248 = float4_ild(1, 2, 4, 8); + const Ty mask = float4_and(bits, m1248); + const Ty zwxy = float4_swiz_zwxy(mask); + const Ty tmp0 = float4_or(mask, zwxy); + const Ty tmp1 = float4_swiz_yyyy(tmp0); + const Ty tmp2 = float4_or(tmp0, tmp1); int res; float4_stx(&res, tmp2); return 0xf == res; diff --git a/include/bx/float4_ref.h b/include/bx/float4_ref.h index e54862c..0253d65 100644 --- a/include/bx/float4_ref.h +++ b/include/bx/float4_ref.h @@ -8,569 +8,6 @@ #include // sqrtf -namespace bx -{ - typedef union float4_t - { - float fxyzw[4]; - int32_t ixyzw[4]; - uint32_t uxyzw[4]; - - } float4_t; - -#define ELEMx 0 -#define ELEMy 1 -#define ELEMz 2 -#define ELEMw 3 -#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ - BX_FLOAT4_FORCE_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \ - { \ - float4_t result; \ - result.ixyzw[0] = _a.ixyzw[ELEM##_x]; \ - result.ixyzw[1] = _a.ixyzw[ELEM##_y]; \ - result.ixyzw[2] = _a.ixyzw[ELEM##_z]; \ - result.ixyzw[3] = _a.ixyzw[ELEM##_w]; \ - return result; \ - } - -#include "float4_swizzle.inl" - -#undef IMPLEMENT_SWIZZLE -#undef ELEMw -#undef ELEMz -#undef ELEMy -#undef ELEMx - -#define IMPLEMENT_TEST(_xyzw, _mask) \ - BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_t _test) \ - { \ - uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \ - | ( (_test.uxyzw[2]>>31)<<2) \ - | ( (_test.uxyzw[1]>>31)<<1) \ - | ( _test.uxyzw[0]>>31) \ - ; \ - return 0 != (tmp&(_mask) ); \ - } \ - \ - BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_t _test) \ - { \ - uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \ - | ( (_test.uxyzw[2]>>31)<<2) \ - | ( (_test.uxyzw[1]>>31)<<1) \ - | ( _test.uxyzw[0]>>31) \ - ; \ - return (_mask) == (tmp&(_mask) ); \ - } - -IMPLEMENT_TEST(x , 0x1); -IMPLEMENT_TEST(y , 0x2); -IMPLEMENT_TEST(xy , 0x3); -IMPLEMENT_TEST(z , 0x4); -IMPLEMENT_TEST(xz , 0x5); -IMPLEMENT_TEST(yz , 0x6); -IMPLEMENT_TEST(xyz , 0x7); -IMPLEMENT_TEST(w , 0x8); -IMPLEMENT_TEST(xw , 0x9); -IMPLEMENT_TEST(yw , 0xa); -IMPLEMENT_TEST(xyw , 0xb); -IMPLEMENT_TEST(zw , 0xc); -IMPLEMENT_TEST(xzw , 0xd); -IMPLEMENT_TEST(yzw , 0xe); -IMPLEMENT_TEST(xyzw , 0xf); - -#undef IMPLEMENT_TEST - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b) - { - float4_t result; - result.uxyzw[0] = _a.uxyzw[0]; - result.uxyzw[1] = _a.uxyzw[1]; - result.uxyzw[2] = _b.uxyzw[0]; - result.uxyzw[3] = _b.uxyzw[1]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b) - { - float4_t result; - result.uxyzw[0] = _b.uxyzw[0]; - result.uxyzw[1] = _b.uxyzw[1]; - result.uxyzw[2] = _a.uxyzw[0]; - result.uxyzw[3] = _a.uxyzw[1]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b) - { - float4_t result; - result.uxyzw[0] = _b.uxyzw[2]; - result.uxyzw[1] = _b.uxyzw[3]; - result.uxyzw[2] = _a.uxyzw[2]; - result.uxyzw[3] = _a.uxyzw[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b) - { - float4_t result; - result.uxyzw[0] = _a.uxyzw[2]; - result.uxyzw[1] = _a.uxyzw[3]; - result.uxyzw[2] = _b.uxyzw[2]; - result.uxyzw[3] = _b.uxyzw[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b) - { - float4_t result; - result.uxyzw[0] = _a.uxyzw[0]; - result.uxyzw[1] = _b.uxyzw[0]; - result.uxyzw[2] = _a.uxyzw[1]; - result.uxyzw[3] = _b.uxyzw[1]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b) - { - float4_t result; - result.uxyzw[0] = _a.uxyzw[1]; - result.uxyzw[1] = _b.uxyzw[1]; - result.uxyzw[2] = _a.uxyzw[0]; - result.uxyzw[3] = _b.uxyzw[0]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b) - { - float4_t result; - result.uxyzw[0] = _a.uxyzw[2]; - result.uxyzw[1] = _b.uxyzw[2]; - result.uxyzw[2] = _a.uxyzw[3]; - result.uxyzw[3] = _b.uxyzw[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b) - { - float4_t result; - result.uxyzw[0] = _b.uxyzw[2]; - result.uxyzw[1] = _a.uxyzw[2]; - result.uxyzw[2] = _b.uxyzw[3]; - result.uxyzw[3] = _a.uxyzw[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float float4_x(float4_t _a) - { - return _a.fxyzw[0]; - } - - BX_FLOAT4_FORCE_INLINE float float4_y(float4_t _a) - { - return _a.fxyzw[1]; - } - - BX_FLOAT4_FORCE_INLINE float float4_z(float4_t _a) - { - return _a.fxyzw[2]; - } - - BX_FLOAT4_FORCE_INLINE float float4_w(float4_t _a) - { - return _a.fxyzw[3]; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_ld(const void* _ptr) - { - const uint32_t* input = reinterpret_cast(_ptr); - float4_t result; - result.uxyzw[0] = input[0]; - result.uxyzw[1] = input[1]; - result.uxyzw[2] = input[2]; - result.uxyzw[3] = input[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_t _a) - { - uint32_t* result = reinterpret_cast(_ptr); - result[0] = _a.uxyzw[0]; - result[1] = _a.uxyzw[1]; - result[2] = _a.uxyzw[2]; - result[3] = _a.uxyzw[3]; - } - - BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_t _a) - { - uint32_t* result = reinterpret_cast(_ptr); - result[0] = _a.uxyzw[0]; - } - - BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_t _a) - { - uint32_t* result = reinterpret_cast(_ptr); - result[0] = _a.uxyzw[0]; - result[1] = _a.uxyzw[1]; - result[2] = _a.uxyzw[2]; - result[3] = _a.uxyzw[3]; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w) - { - float4_t result; - result.fxyzw[0] = _x; - result.fxyzw[1] = _y; - result.fxyzw[2] = _z; - result.fxyzw[3] = _w; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) - { - float4_t result; - result.uxyzw[0] = _x; - result.uxyzw[1] = _y; - result.uxyzw[2] = _z; - result.uxyzw[3] = _w; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_splat(const void* _ptr) - { - const uint32_t val = *reinterpret_cast(_ptr); - float4_t result; - result.uxyzw[0] = val; - result.uxyzw[1] = val; - result.uxyzw[2] = val; - result.uxyzw[3] = val; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_splat(float _a) - { - return float4_ld(_a, _a, _a, _a); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_isplat(uint32_t _a) - { - return float4_ild(_a, _a, _a, _a); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_zero() - { - return float4_ild(0, 0, 0, 0); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_itof(float4_t _a) - { - float4_t result; - result.fxyzw[0] = (float)_a.ixyzw[0]; - result.fxyzw[1] = (float)_a.ixyzw[1]; - result.fxyzw[2] = (float)_a.ixyzw[2]; - result.fxyzw[3] = (float)_a.ixyzw[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_ftoi(float4_t _a) - { - float4_t result; - result.ixyzw[0] = (int)_a.fxyzw[0]; - result.ixyzw[1] = (int)_a.fxyzw[1]; - result.ixyzw[2] = (int)_a.fxyzw[2]; - result.ixyzw[3] = (int)_a.fxyzw[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_round(float4_t _a) - { - const float4_t tmp = float4_ftoi(_a); - const float4_t result = float4_itof(tmp); - - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_add(float4_t _a, float4_t _b) - { - float4_t result; - result.fxyzw[0] = _a.fxyzw[0] + _b.fxyzw[0]; - result.fxyzw[1] = _a.fxyzw[1] + _b.fxyzw[1]; - result.fxyzw[2] = _a.fxyzw[2] + _b.fxyzw[2]; - result.fxyzw[3] = _a.fxyzw[3] + _b.fxyzw[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_sub(float4_t _a, float4_t _b) - { - float4_t result; - result.fxyzw[0] = _a.fxyzw[0] - _b.fxyzw[0]; - result.fxyzw[1] = _a.fxyzw[1] - _b.fxyzw[1]; - result.fxyzw[2] = _a.fxyzw[2] - _b.fxyzw[2]; - result.fxyzw[3] = _a.fxyzw[3] - _b.fxyzw[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_mul(float4_t _a, float4_t _b) - { - float4_t result; - result.fxyzw[0] = _a.fxyzw[0] * _b.fxyzw[0]; - result.fxyzw[1] = _a.fxyzw[1] * _b.fxyzw[1]; - result.fxyzw[2] = _a.fxyzw[2] * _b.fxyzw[2]; - result.fxyzw[3] = _a.fxyzw[3] * _b.fxyzw[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_div(float4_t _a, float4_t _b) - { - float4_t result; - result.fxyzw[0] = _a.fxyzw[0] / _b.fxyzw[0]; - result.fxyzw[1] = _a.fxyzw[1] / _b.fxyzw[1]; - result.fxyzw[2] = _a.fxyzw[2] / _b.fxyzw[2]; - result.fxyzw[3] = _a.fxyzw[3] / _b.fxyzw[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_rcp_est(float4_t _a) - { - float4_t result; - result.fxyzw[0] = 1.0f / _a.fxyzw[0]; - result.fxyzw[1] = 1.0f / _a.fxyzw[1]; - result.fxyzw[2] = 1.0f / _a.fxyzw[2]; - result.fxyzw[3] = 1.0f / _a.fxyzw[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_sqrt(float4_t _a) - { - float4_t result; - result.fxyzw[0] = sqrtf(_a.fxyzw[0]); - result.fxyzw[1] = sqrtf(_a.fxyzw[1]); - result.fxyzw[2] = sqrtf(_a.fxyzw[2]); - result.fxyzw[3] = sqrtf(_a.fxyzw[3]); - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_rsqrt_est(float4_t _a) - { - float4_t result; - result.fxyzw[0] = 1.0f / sqrtf(_a.fxyzw[0]); - result.fxyzw[1] = 1.0f / sqrtf(_a.fxyzw[1]); - result.fxyzw[2] = 1.0f / sqrtf(_a.fxyzw[2]); - result.fxyzw[3] = 1.0f / sqrtf(_a.fxyzw[3]); - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_cmpeq(float4_t _a, float4_t _b) - { - float4_t result; - result.ixyzw[0] = _a.fxyzw[0] == _b.fxyzw[0] ? 0xffffffff : 0x0; - result.ixyzw[1] = _a.fxyzw[1] == _b.fxyzw[1] ? 0xffffffff : 0x0; - result.ixyzw[2] = _a.fxyzw[2] == _b.fxyzw[2] ? 0xffffffff : 0x0; - result.ixyzw[3] = _a.fxyzw[3] == _b.fxyzw[3] ? 0xffffffff : 0x0; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_cmplt(float4_t _a, float4_t _b) - { - float4_t result; - result.ixyzw[0] = _a.fxyzw[0] < _b.fxyzw[0] ? 0xffffffff : 0x0; - result.ixyzw[1] = _a.fxyzw[1] < _b.fxyzw[1] ? 0xffffffff : 0x0; - result.ixyzw[2] = _a.fxyzw[2] < _b.fxyzw[2] ? 0xffffffff : 0x0; - result.ixyzw[3] = _a.fxyzw[3] < _b.fxyzw[3] ? 0xffffffff : 0x0; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_cmple(float4_t _a, float4_t _b) - { - float4_t result; - result.ixyzw[0] = _a.fxyzw[0] <= _b.fxyzw[0] ? 0xffffffff : 0x0; - result.ixyzw[1] = _a.fxyzw[1] <= _b.fxyzw[1] ? 0xffffffff : 0x0; - result.ixyzw[2] = _a.fxyzw[2] <= _b.fxyzw[2] ? 0xffffffff : 0x0; - result.ixyzw[3] = _a.fxyzw[3] <= _b.fxyzw[3] ? 0xffffffff : 0x0; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_cmpgt(float4_t _a, float4_t _b) - { - float4_t result; - result.ixyzw[0] = _a.fxyzw[0] > _b.fxyzw[0] ? 0xffffffff : 0x0; - result.ixyzw[1] = _a.fxyzw[1] > _b.fxyzw[1] ? 0xffffffff : 0x0; - result.ixyzw[2] = _a.fxyzw[2] > _b.fxyzw[2] ? 0xffffffff : 0x0; - result.ixyzw[3] = _a.fxyzw[3] > _b.fxyzw[3] ? 0xffffffff : 0x0; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_cmpge(float4_t _a, float4_t _b) - { - float4_t result; - result.ixyzw[0] = _a.fxyzw[0] >= _b.fxyzw[0] ? 0xffffffff : 0x0; - result.ixyzw[1] = _a.fxyzw[1] >= _b.fxyzw[1] ? 0xffffffff : 0x0; - result.ixyzw[2] = _a.fxyzw[2] >= _b.fxyzw[2] ? 0xffffffff : 0x0; - result.ixyzw[3] = _a.fxyzw[3] >= _b.fxyzw[3] ? 0xffffffff : 0x0; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_min(float4_t _a, float4_t _b) - { - float4_t result; - result.fxyzw[0] = _a.fxyzw[0] < _b.fxyzw[0] ? _a.fxyzw[0] : _b.fxyzw[0]; - result.fxyzw[1] = _a.fxyzw[1] < _b.fxyzw[1] ? _a.fxyzw[1] : _b.fxyzw[1]; - result.fxyzw[2] = _a.fxyzw[2] < _b.fxyzw[2] ? _a.fxyzw[2] : _b.fxyzw[2]; - result.fxyzw[3] = _a.fxyzw[3] < _b.fxyzw[3] ? _a.fxyzw[3] : _b.fxyzw[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_max(float4_t _a, float4_t _b) - { - float4_t result; - result.fxyzw[0] = _a.fxyzw[0] > _b.fxyzw[0] ? _a.fxyzw[0] : _b.fxyzw[0]; - result.fxyzw[1] = _a.fxyzw[1] > _b.fxyzw[1] ? _a.fxyzw[1] : _b.fxyzw[1]; - result.fxyzw[2] = _a.fxyzw[2] > _b.fxyzw[2] ? _a.fxyzw[2] : _b.fxyzw[2]; - result.fxyzw[3] = _a.fxyzw[3] > _b.fxyzw[3] ? _a.fxyzw[3] : _b.fxyzw[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_and(float4_t _a, float4_t _b) - { - float4_t result; - result.uxyzw[0] = _a.uxyzw[0] & _b.uxyzw[0]; - result.uxyzw[1] = _a.uxyzw[1] & _b.uxyzw[1]; - result.uxyzw[2] = _a.uxyzw[2] & _b.uxyzw[2]; - result.uxyzw[3] = _a.uxyzw[3] & _b.uxyzw[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_andc(float4_t _a, float4_t _b) - { - float4_t result; - result.uxyzw[0] = _a.uxyzw[0] & ~_b.uxyzw[0]; - result.uxyzw[1] = _a.uxyzw[1] & ~_b.uxyzw[1]; - result.uxyzw[2] = _a.uxyzw[2] & ~_b.uxyzw[2]; - result.uxyzw[3] = _a.uxyzw[3] & ~_b.uxyzw[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_or(float4_t _a, float4_t _b) - { - float4_t result; - result.uxyzw[0] = _a.uxyzw[0] | _b.uxyzw[0]; - result.uxyzw[1] = _a.uxyzw[1] | _b.uxyzw[1]; - result.uxyzw[2] = _a.uxyzw[2] | _b.uxyzw[2]; - result.uxyzw[3] = _a.uxyzw[3] | _b.uxyzw[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_xor(float4_t _a, float4_t _b) - { - float4_t result; - result.uxyzw[0] = _a.uxyzw[0] ^ _b.uxyzw[0]; - result.uxyzw[1] = _a.uxyzw[1] ^ _b.uxyzw[1]; - result.uxyzw[2] = _a.uxyzw[2] ^ _b.uxyzw[2]; - result.uxyzw[3] = _a.uxyzw[3] ^ _b.uxyzw[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_sll(float4_t _a, int _count) - { - float4_t result; - result.uxyzw[0] = _a.uxyzw[0] << _count; - result.uxyzw[1] = _a.uxyzw[1] << _count; - result.uxyzw[2] = _a.uxyzw[2] << _count; - result.uxyzw[3] = _a.uxyzw[3] << _count; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_srl(float4_t _a, int _count) - { - float4_t result; - result.uxyzw[0] = _a.uxyzw[0] >> _count; - result.uxyzw[1] = _a.uxyzw[1] >> _count; - result.uxyzw[2] = _a.uxyzw[2] >> _count; - result.uxyzw[3] = _a.uxyzw[3] >> _count; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_sra(float4_t _a, int _count) - { - float4_t result; - result.ixyzw[0] = _a.ixyzw[0] >> _count; - result.ixyzw[1] = _a.ixyzw[1] >> _count; - result.ixyzw[2] = _a.ixyzw[2] >> _count; - result.ixyzw[3] = _a.ixyzw[3] >> _count; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_icmpeq(float4_t _a, float4_t _b) - { - float4_t result; - result.ixyzw[0] = _a.ixyzw[0] == _b.ixyzw[0] ? 0xffffffff : 0x0; - result.ixyzw[1] = _a.ixyzw[1] == _b.ixyzw[1] ? 0xffffffff : 0x0; - result.ixyzw[2] = _a.ixyzw[2] == _b.ixyzw[2] ? 0xffffffff : 0x0; - result.ixyzw[3] = _a.ixyzw[3] == _b.ixyzw[3] ? 0xffffffff : 0x0; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_icmplt(float4_t _a, float4_t _b) - { - float4_t result; - result.ixyzw[0] = _a.ixyzw[0] < _b.ixyzw[0] ? 0xffffffff : 0x0; - result.ixyzw[1] = _a.ixyzw[1] < _b.ixyzw[1] ? 0xffffffff : 0x0; - result.ixyzw[2] = _a.ixyzw[2] < _b.ixyzw[2] ? 0xffffffff : 0x0; - result.ixyzw[3] = _a.ixyzw[3] < _b.ixyzw[3] ? 0xffffffff : 0x0; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_icmpgt(float4_t _a, float4_t _b) - { - float4_t result; - result.ixyzw[0] = _a.ixyzw[0] > _b.ixyzw[0] ? 0xffffffff : 0x0; - result.ixyzw[1] = _a.ixyzw[1] > _b.ixyzw[1] ? 0xffffffff : 0x0; - result.ixyzw[2] = _a.ixyzw[2] > _b.ixyzw[2] ? 0xffffffff : 0x0; - result.ixyzw[3] = _a.ixyzw[3] > _b.ixyzw[3] ? 0xffffffff : 0x0; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_imin(float4_t _a, float4_t _b) - { - float4_t result; - result.ixyzw[0] = _a.ixyzw[0] < _b.ixyzw[0] ? _a.ixyzw[0] : _b.ixyzw[0]; - result.ixyzw[1] = _a.ixyzw[1] < _b.ixyzw[1] ? _a.ixyzw[1] : _b.ixyzw[1]; - result.ixyzw[2] = _a.ixyzw[2] < _b.ixyzw[2] ? _a.ixyzw[2] : _b.ixyzw[2]; - result.ixyzw[3] = _a.ixyzw[3] < _b.ixyzw[3] ? _a.ixyzw[3] : _b.ixyzw[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_imax(float4_t _a, float4_t _b) - { - float4_t result; - result.ixyzw[0] = _a.ixyzw[0] > _b.ixyzw[0] ? _a.ixyzw[0] : _b.ixyzw[0]; - result.ixyzw[1] = _a.ixyzw[1] > _b.ixyzw[1] ? _a.ixyzw[1] : _b.ixyzw[1]; - result.ixyzw[2] = _a.ixyzw[2] > _b.ixyzw[2] ? _a.ixyzw[2] : _b.ixyzw[2]; - result.ixyzw[3] = _a.ixyzw[3] > _b.ixyzw[3] ? _a.ixyzw[3] : _b.ixyzw[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_iadd(float4_t _a, float4_t _b) - { - float4_t result; - result.ixyzw[0] = _a.ixyzw[0] + _b.ixyzw[0]; - result.ixyzw[1] = _a.ixyzw[1] + _b.ixyzw[1]; - result.ixyzw[2] = _a.ixyzw[2] + _b.ixyzw[2]; - result.ixyzw[3] = _a.ixyzw[3] + _b.ixyzw[3]; - return result; - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_isub(float4_t _a, float4_t _b) - { - float4_t result; - result.ixyzw[0] = _a.ixyzw[0] - _b.ixyzw[0]; - result.ixyzw[1] = _a.ixyzw[1] - _b.ixyzw[1]; - result.ixyzw[2] = _a.ixyzw[2] - _b.ixyzw[2]; - result.ixyzw[3] = _a.ixyzw[3] - _b.ixyzw[3]; - return result; - } - -} // namespace bx - #define float4_shuf_xAzC float4_shuf_xAzC_ni #define float4_shuf_yBwD float4_shuf_yBwD_ni #define float4_rcp float4_rcp_ni @@ -599,6 +36,615 @@ IMPLEMENT_TEST(xyzw , 0xf); #define float4_dot float4_dot_ni #define float4_ceil float4_ceil_ni #define float4_floor float4_floor_ni + #include "float4_ni.h" +namespace bx +{ +#define ELEMx 0 +#define ELEMy 1 +#define ELEMz 2 +#define ELEMw 3 +#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ + template<> \ + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_swiz_##_x##_y##_z##_w(float4_ref_t _a) \ + { \ + float4_ref_t result; \ + result.ixyzw[0] = _a.ixyzw[ELEM##_x]; \ + result.ixyzw[1] = _a.ixyzw[ELEM##_y]; \ + result.ixyzw[2] = _a.ixyzw[ELEM##_z]; \ + result.ixyzw[3] = _a.ixyzw[ELEM##_w]; \ + return result; \ + } + +#include "float4_swizzle.inl" + +#undef IMPLEMENT_SWIZZLE +#undef ELEMw +#undef ELEMz +#undef ELEMy +#undef ELEMx + +#define IMPLEMENT_TEST(_xyzw, _mask) \ + template<> \ + BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_ref_t _test) \ + { \ + uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \ + | ( (_test.uxyzw[2]>>31)<<2) \ + | ( (_test.uxyzw[1]>>31)<<1) \ + | ( _test.uxyzw[0]>>31) \ + ; \ + return 0 != (tmp&(_mask) ); \ + } \ + \ + template<> \ + BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_ref_t _test) \ + { \ + uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \ + | ( (_test.uxyzw[2]>>31)<<2) \ + | ( (_test.uxyzw[1]>>31)<<1) \ + | ( _test.uxyzw[0]>>31) \ + ; \ + return (_mask) == (tmp&(_mask) ); \ + } + +IMPLEMENT_TEST(x , 0x1); +IMPLEMENT_TEST(y , 0x2); +IMPLEMENT_TEST(xy , 0x3); +IMPLEMENT_TEST(z , 0x4); +IMPLEMENT_TEST(xz , 0x5); +IMPLEMENT_TEST(yz , 0x6); +IMPLEMENT_TEST(xyz , 0x7); +IMPLEMENT_TEST(w , 0x8); +IMPLEMENT_TEST(xw , 0x9); +IMPLEMENT_TEST(yw , 0xa); +IMPLEMENT_TEST(xyw , 0xb); +IMPLEMENT_TEST(zw , 0xc); +IMPLEMENT_TEST(xzw , 0xd); +IMPLEMENT_TEST(yzw , 0xe); +IMPLEMENT_TEST(xyzw , 0xf); + +#undef IMPLEMENT_TEST + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_xyAB(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.uxyzw[0] = _a.uxyzw[0]; + result.uxyzw[1] = _a.uxyzw[1]; + result.uxyzw[2] = _b.uxyzw[0]; + result.uxyzw[3] = _b.uxyzw[1]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_ABxy(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.uxyzw[0] = _b.uxyzw[0]; + result.uxyzw[1] = _b.uxyzw[1]; + result.uxyzw[2] = _a.uxyzw[0]; + result.uxyzw[3] = _a.uxyzw[1]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_CDzw(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.uxyzw[0] = _b.uxyzw[2]; + result.uxyzw[1] = _b.uxyzw[3]; + result.uxyzw[2] = _a.uxyzw[2]; + result.uxyzw[3] = _a.uxyzw[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_zwCD(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.uxyzw[0] = _a.uxyzw[2]; + result.uxyzw[1] = _a.uxyzw[3]; + result.uxyzw[2] = _b.uxyzw[2]; + result.uxyzw[3] = _b.uxyzw[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_xAyB(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.uxyzw[0] = _a.uxyzw[0]; + result.uxyzw[1] = _b.uxyzw[0]; + result.uxyzw[2] = _a.uxyzw[1]; + result.uxyzw[3] = _b.uxyzw[1]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_yBxA(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.uxyzw[0] = _a.uxyzw[1]; + result.uxyzw[1] = _b.uxyzw[1]; + result.uxyzw[2] = _a.uxyzw[0]; + result.uxyzw[3] = _b.uxyzw[0]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_zCwD(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.uxyzw[0] = _a.uxyzw[2]; + result.uxyzw[1] = _b.uxyzw[2]; + result.uxyzw[2] = _a.uxyzw[3]; + result.uxyzw[3] = _b.uxyzw[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_CzDw(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.uxyzw[0] = _b.uxyzw[2]; + result.uxyzw[1] = _a.uxyzw[2]; + result.uxyzw[2] = _b.uxyzw[3]; + result.uxyzw[3] = _a.uxyzw[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float float4_x(float4_ref_t _a) + { + return _a.fxyzw[0]; + } + + template<> + BX_FLOAT4_FORCE_INLINE float float4_y(float4_ref_t _a) + { + return _a.fxyzw[1]; + } + + template<> + BX_FLOAT4_FORCE_INLINE float float4_z(float4_ref_t _a) + { + return _a.fxyzw[2]; + } + + template<> + BX_FLOAT4_FORCE_INLINE float float4_w(float4_ref_t _a) + { + return _a.fxyzw[3]; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_ld(const void* _ptr) + { + const uint32_t* input = reinterpret_cast(_ptr); + float4_ref_t result; + result.uxyzw[0] = input[0]; + result.uxyzw[1] = input[1]; + result.uxyzw[2] = input[2]; + result.uxyzw[3] = input[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_ref_t _a) + { + uint32_t* result = reinterpret_cast(_ptr); + result[0] = _a.uxyzw[0]; + result[1] = _a.uxyzw[1]; + result[2] = _a.uxyzw[2]; + result[3] = _a.uxyzw[3]; + } + + template<> + BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_ref_t _a) + { + uint32_t* result = reinterpret_cast(_ptr); + result[0] = _a.uxyzw[0]; + } + + template<> + BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_ref_t _a) + { + uint32_t* result = reinterpret_cast(_ptr); + result[0] = _a.uxyzw[0]; + result[1] = _a.uxyzw[1]; + result[2] = _a.uxyzw[2]; + result[3] = _a.uxyzw[3]; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_ld(float _x, float _y, float _z, float _w) + { + float4_ref_t result; + result.fxyzw[0] = _x; + result.fxyzw[1] = _y; + result.fxyzw[2] = _z; + result.fxyzw[3] = _w; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) + { + float4_ref_t result; + result.uxyzw[0] = _x; + result.uxyzw[1] = _y; + result.uxyzw[2] = _z; + result.uxyzw[3] = _w; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_splat(const void* _ptr) + { + const uint32_t val = *reinterpret_cast(_ptr); + float4_ref_t result; + result.uxyzw[0] = val; + result.uxyzw[1] = val; + result.uxyzw[2] = val; + result.uxyzw[3] = val; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_splat(float _a) + { + return float4_ld(_a, _a, _a, _a); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_isplat(uint32_t _a) + { + return float4_ild(_a, _a, _a, _a); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_zero() + { + return float4_ild(0, 0, 0, 0); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_itof(float4_ref_t _a) + { + float4_ref_t result; + result.fxyzw[0] = (float)_a.ixyzw[0]; + result.fxyzw[1] = (float)_a.ixyzw[1]; + result.fxyzw[2] = (float)_a.ixyzw[2]; + result.fxyzw[3] = (float)_a.ixyzw[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_ftoi(float4_ref_t _a) + { + float4_ref_t result; + result.ixyzw[0] = (int)_a.fxyzw[0]; + result.ixyzw[1] = (int)_a.fxyzw[1]; + result.ixyzw[2] = (int)_a.fxyzw[2]; + result.ixyzw[3] = (int)_a.fxyzw[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_round(float4_ref_t _a) + { + return float4_round_ni(_a); + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_add(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.fxyzw[0] = _a.fxyzw[0] + _b.fxyzw[0]; + result.fxyzw[1] = _a.fxyzw[1] + _b.fxyzw[1]; + result.fxyzw[2] = _a.fxyzw[2] + _b.fxyzw[2]; + result.fxyzw[3] = _a.fxyzw[3] + _b.fxyzw[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_sub(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.fxyzw[0] = _a.fxyzw[0] - _b.fxyzw[0]; + result.fxyzw[1] = _a.fxyzw[1] - _b.fxyzw[1]; + result.fxyzw[2] = _a.fxyzw[2] - _b.fxyzw[2]; + result.fxyzw[3] = _a.fxyzw[3] - _b.fxyzw[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_mul(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.fxyzw[0] = _a.fxyzw[0] * _b.fxyzw[0]; + result.fxyzw[1] = _a.fxyzw[1] * _b.fxyzw[1]; + result.fxyzw[2] = _a.fxyzw[2] * _b.fxyzw[2]; + result.fxyzw[3] = _a.fxyzw[3] * _b.fxyzw[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_div(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.fxyzw[0] = _a.fxyzw[0] / _b.fxyzw[0]; + result.fxyzw[1] = _a.fxyzw[1] / _b.fxyzw[1]; + result.fxyzw[2] = _a.fxyzw[2] / _b.fxyzw[2]; + result.fxyzw[3] = _a.fxyzw[3] / _b.fxyzw[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_rcp_est(float4_ref_t _a) + { + float4_ref_t result; + result.fxyzw[0] = 1.0f / _a.fxyzw[0]; + result.fxyzw[1] = 1.0f / _a.fxyzw[1]; + result.fxyzw[2] = 1.0f / _a.fxyzw[2]; + result.fxyzw[3] = 1.0f / _a.fxyzw[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_sqrt(float4_ref_t _a) + { + float4_ref_t result; + result.fxyzw[0] = sqrtf(_a.fxyzw[0]); + result.fxyzw[1] = sqrtf(_a.fxyzw[1]); + result.fxyzw[2] = sqrtf(_a.fxyzw[2]); + result.fxyzw[3] = sqrtf(_a.fxyzw[3]); + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_rsqrt_est(float4_ref_t _a) + { + float4_ref_t result; + result.fxyzw[0] = 1.0f / sqrtf(_a.fxyzw[0]); + result.fxyzw[1] = 1.0f / sqrtf(_a.fxyzw[1]); + result.fxyzw[2] = 1.0f / sqrtf(_a.fxyzw[2]); + result.fxyzw[3] = 1.0f / sqrtf(_a.fxyzw[3]); + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_cmpeq(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.ixyzw[0] = _a.fxyzw[0] == _b.fxyzw[0] ? 0xffffffff : 0x0; + result.ixyzw[1] = _a.fxyzw[1] == _b.fxyzw[1] ? 0xffffffff : 0x0; + result.ixyzw[2] = _a.fxyzw[2] == _b.fxyzw[2] ? 0xffffffff : 0x0; + result.ixyzw[3] = _a.fxyzw[3] == _b.fxyzw[3] ? 0xffffffff : 0x0; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_cmplt(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.ixyzw[0] = _a.fxyzw[0] < _b.fxyzw[0] ? 0xffffffff : 0x0; + result.ixyzw[1] = _a.fxyzw[1] < _b.fxyzw[1] ? 0xffffffff : 0x0; + result.ixyzw[2] = _a.fxyzw[2] < _b.fxyzw[2] ? 0xffffffff : 0x0; + result.ixyzw[3] = _a.fxyzw[3] < _b.fxyzw[3] ? 0xffffffff : 0x0; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_cmple(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.ixyzw[0] = _a.fxyzw[0] <= _b.fxyzw[0] ? 0xffffffff : 0x0; + result.ixyzw[1] = _a.fxyzw[1] <= _b.fxyzw[1] ? 0xffffffff : 0x0; + result.ixyzw[2] = _a.fxyzw[2] <= _b.fxyzw[2] ? 0xffffffff : 0x0; + result.ixyzw[3] = _a.fxyzw[3] <= _b.fxyzw[3] ? 0xffffffff : 0x0; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_cmpgt(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.ixyzw[0] = _a.fxyzw[0] > _b.fxyzw[0] ? 0xffffffff : 0x0; + result.ixyzw[1] = _a.fxyzw[1] > _b.fxyzw[1] ? 0xffffffff : 0x0; + result.ixyzw[2] = _a.fxyzw[2] > _b.fxyzw[2] ? 0xffffffff : 0x0; + result.ixyzw[3] = _a.fxyzw[3] > _b.fxyzw[3] ? 0xffffffff : 0x0; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_cmpge(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.ixyzw[0] = _a.fxyzw[0] >= _b.fxyzw[0] ? 0xffffffff : 0x0; + result.ixyzw[1] = _a.fxyzw[1] >= _b.fxyzw[1] ? 0xffffffff : 0x0; + result.ixyzw[2] = _a.fxyzw[2] >= _b.fxyzw[2] ? 0xffffffff : 0x0; + result.ixyzw[3] = _a.fxyzw[3] >= _b.fxyzw[3] ? 0xffffffff : 0x0; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_min(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.fxyzw[0] = _a.fxyzw[0] < _b.fxyzw[0] ? _a.fxyzw[0] : _b.fxyzw[0]; + result.fxyzw[1] = _a.fxyzw[1] < _b.fxyzw[1] ? _a.fxyzw[1] : _b.fxyzw[1]; + result.fxyzw[2] = _a.fxyzw[2] < _b.fxyzw[2] ? _a.fxyzw[2] : _b.fxyzw[2]; + result.fxyzw[3] = _a.fxyzw[3] < _b.fxyzw[3] ? _a.fxyzw[3] : _b.fxyzw[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_max(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.fxyzw[0] = _a.fxyzw[0] > _b.fxyzw[0] ? _a.fxyzw[0] : _b.fxyzw[0]; + result.fxyzw[1] = _a.fxyzw[1] > _b.fxyzw[1] ? _a.fxyzw[1] : _b.fxyzw[1]; + result.fxyzw[2] = _a.fxyzw[2] > _b.fxyzw[2] ? _a.fxyzw[2] : _b.fxyzw[2]; + result.fxyzw[3] = _a.fxyzw[3] > _b.fxyzw[3] ? _a.fxyzw[3] : _b.fxyzw[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_and(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.uxyzw[0] = _a.uxyzw[0] & _b.uxyzw[0]; + result.uxyzw[1] = _a.uxyzw[1] & _b.uxyzw[1]; + result.uxyzw[2] = _a.uxyzw[2] & _b.uxyzw[2]; + result.uxyzw[3] = _a.uxyzw[3] & _b.uxyzw[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_andc(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.uxyzw[0] = _a.uxyzw[0] & ~_b.uxyzw[0]; + result.uxyzw[1] = _a.uxyzw[1] & ~_b.uxyzw[1]; + result.uxyzw[2] = _a.uxyzw[2] & ~_b.uxyzw[2]; + result.uxyzw[3] = _a.uxyzw[3] & ~_b.uxyzw[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_or(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.uxyzw[0] = _a.uxyzw[0] | _b.uxyzw[0]; + result.uxyzw[1] = _a.uxyzw[1] | _b.uxyzw[1]; + result.uxyzw[2] = _a.uxyzw[2] | _b.uxyzw[2]; + result.uxyzw[3] = _a.uxyzw[3] | _b.uxyzw[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_xor(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.uxyzw[0] = _a.uxyzw[0] ^ _b.uxyzw[0]; + result.uxyzw[1] = _a.uxyzw[1] ^ _b.uxyzw[1]; + result.uxyzw[2] = _a.uxyzw[2] ^ _b.uxyzw[2]; + result.uxyzw[3] = _a.uxyzw[3] ^ _b.uxyzw[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_sll(float4_ref_t _a, int _count) + { + float4_ref_t result; + result.uxyzw[0] = _a.uxyzw[0] << _count; + result.uxyzw[1] = _a.uxyzw[1] << _count; + result.uxyzw[2] = _a.uxyzw[2] << _count; + result.uxyzw[3] = _a.uxyzw[3] << _count; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_srl(float4_ref_t _a, int _count) + { + float4_ref_t result; + result.uxyzw[0] = _a.uxyzw[0] >> _count; + result.uxyzw[1] = _a.uxyzw[1] >> _count; + result.uxyzw[2] = _a.uxyzw[2] >> _count; + result.uxyzw[3] = _a.uxyzw[3] >> _count; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_sra(float4_ref_t _a, int _count) + { + float4_ref_t result; + result.ixyzw[0] = _a.ixyzw[0] >> _count; + result.ixyzw[1] = _a.ixyzw[1] >> _count; + result.ixyzw[2] = _a.ixyzw[2] >> _count; + result.ixyzw[3] = _a.ixyzw[3] >> _count; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_icmpeq(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.ixyzw[0] = _a.ixyzw[0] == _b.ixyzw[0] ? 0xffffffff : 0x0; + result.ixyzw[1] = _a.ixyzw[1] == _b.ixyzw[1] ? 0xffffffff : 0x0; + result.ixyzw[2] = _a.ixyzw[2] == _b.ixyzw[2] ? 0xffffffff : 0x0; + result.ixyzw[3] = _a.ixyzw[3] == _b.ixyzw[3] ? 0xffffffff : 0x0; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_icmplt(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.ixyzw[0] = _a.ixyzw[0] < _b.ixyzw[0] ? 0xffffffff : 0x0; + result.ixyzw[1] = _a.ixyzw[1] < _b.ixyzw[1] ? 0xffffffff : 0x0; + result.ixyzw[2] = _a.ixyzw[2] < _b.ixyzw[2] ? 0xffffffff : 0x0; + result.ixyzw[3] = _a.ixyzw[3] < _b.ixyzw[3] ? 0xffffffff : 0x0; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_icmpgt(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.ixyzw[0] = _a.ixyzw[0] > _b.ixyzw[0] ? 0xffffffff : 0x0; + result.ixyzw[1] = _a.ixyzw[1] > _b.ixyzw[1] ? 0xffffffff : 0x0; + result.ixyzw[2] = _a.ixyzw[2] > _b.ixyzw[2] ? 0xffffffff : 0x0; + result.ixyzw[3] = _a.ixyzw[3] > _b.ixyzw[3] ? 0xffffffff : 0x0; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_imin(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.ixyzw[0] = _a.ixyzw[0] < _b.ixyzw[0] ? _a.ixyzw[0] : _b.ixyzw[0]; + result.ixyzw[1] = _a.ixyzw[1] < _b.ixyzw[1] ? _a.ixyzw[1] : _b.ixyzw[1]; + result.ixyzw[2] = _a.ixyzw[2] < _b.ixyzw[2] ? _a.ixyzw[2] : _b.ixyzw[2]; + result.ixyzw[3] = _a.ixyzw[3] < _b.ixyzw[3] ? _a.ixyzw[3] : _b.ixyzw[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_imax(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.ixyzw[0] = _a.ixyzw[0] > _b.ixyzw[0] ? _a.ixyzw[0] : _b.ixyzw[0]; + result.ixyzw[1] = _a.ixyzw[1] > _b.ixyzw[1] ? _a.ixyzw[1] : _b.ixyzw[1]; + result.ixyzw[2] = _a.ixyzw[2] > _b.ixyzw[2] ? _a.ixyzw[2] : _b.ixyzw[2]; + result.ixyzw[3] = _a.ixyzw[3] > _b.ixyzw[3] ? _a.ixyzw[3] : _b.ixyzw[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_iadd(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.ixyzw[0] = _a.ixyzw[0] + _b.ixyzw[0]; + result.ixyzw[1] = _a.ixyzw[1] + _b.ixyzw[1]; + result.ixyzw[2] = _a.ixyzw[2] + _b.ixyzw[2]; + result.ixyzw[3] = _a.ixyzw[3] + _b.ixyzw[3]; + return result; + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_ref_t float4_isub(float4_ref_t _a, float4_ref_t _b) + { + float4_ref_t result; + result.ixyzw[0] = _a.ixyzw[0] - _b.ixyzw[0]; + result.ixyzw[1] = _a.ixyzw[1] - _b.ixyzw[1]; + result.ixyzw[2] = _a.ixyzw[2] - _b.ixyzw[2]; + result.ixyzw[3] = _a.ixyzw[3] - _b.ixyzw[3]; + return result; + } + +} // namespace bx + #endif // BX_FLOAT4_REF_H_HEADER_GUARD diff --git a/include/bx/float4_sse.h b/include/bx/float4_sse.h index 7327251..f5e91fe 100644 --- a/include/bx/float4_sse.h +++ b/include/bx/float4_sse.h @@ -6,22 +6,17 @@ #ifndef BX_FLOAT4_SSE_H_HEADER_GUARD #define BX_FLOAT4_SSE_H_HEADER_GUARD -#include // __m128i -#if defined(__SSE4_1__) -# include -#endif // defined(__SSE4_1__) -#include // __m128 +#include "float4_ni.h" namespace bx { - typedef __m128 float4_t; - #define ELEMx 0 #define ELEMy 1 #define ELEMz 2 #define ELEMw 3 #define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ - BX_FLOAT4_FORCE_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \ + template<> \ + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_swiz_##_x##_y##_z##_w(float4_sse_t _a) \ { \ return _mm_shuffle_ps( _a, _a, _MM_SHUFFLE(ELEM##_w, ELEM##_z, ELEM##_y, ELEM##_x ) ); \ } @@ -35,12 +30,14 @@ namespace bx #undef ELEMx #define IMPLEMENT_TEST(_xyzw, _mask) \ - BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_t _test) \ + template<> \ + BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_sse_t _test) \ { \ return 0x0 != (_mm_movemask_ps(_test)&(_mask) ); \ } \ \ - BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_t _test) \ + template<> \ + BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_sse_t _test) \ { \ return (_mask) == (_mm_movemask_ps(_test)&(_mask) ); \ } @@ -63,399 +60,588 @@ IMPLEMENT_TEST(xyzw , 0xf); #undef IMPLEMENT_TEST - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_xyAB(float4_sse_t _a, float4_sse_t _b) { return _mm_movelh_ps(_a, _b); } - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_ABxy(float4_sse_t _a, float4_sse_t _b) { return _mm_movelh_ps(_b, _a); } - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_CDzw(float4_sse_t _a, float4_sse_t _b) { return _mm_movehl_ps(_a, _b); } - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_zwCD(float4_sse_t _a, float4_sse_t _b) { return _mm_movehl_ps(_b, _a); } - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_xAyB(float4_sse_t _a, float4_sse_t _b) { return _mm_unpacklo_ps(_a, _b); } - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_yBxA(float4_sse_t _a, float4_sse_t _b) { return _mm_unpacklo_ps(_b, _a); } - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_zCwD(float4_sse_t _a, float4_sse_t _b) { return _mm_unpackhi_ps(_a, _b); } - BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_CzDw(float4_sse_t _a, float4_sse_t _b) { return _mm_unpackhi_ps(_b, _a); } - BX_FLOAT4_FORCE_INLINE float float4_x(float4_t _a) + template<> + BX_FLOAT4_FORCE_INLINE float float4_x(float4_sse_t _a) { return _mm_cvtss_f32(_a); } - BX_FLOAT4_FORCE_INLINE float float4_y(float4_t _a) + template<> + BX_FLOAT4_FORCE_INLINE float float4_y(float4_sse_t _a) { - const float4_t yyyy = float4_swiz_yyyy(_a); + const float4_sse_t yyyy = float4_swiz_yyyy(_a); const float result = _mm_cvtss_f32(yyyy); return result; } - BX_FLOAT4_FORCE_INLINE float float4_z(float4_t _a) + template<> + BX_FLOAT4_FORCE_INLINE float float4_z(float4_sse_t _a) { - const float4_t zzzz = float4_swiz_zzzz(_a); + const float4_sse_t zzzz = float4_swiz_zzzz(_a); const float result = _mm_cvtss_f32(zzzz); return result; } - BX_FLOAT4_FORCE_INLINE float float4_w(float4_t _a) + template<> + BX_FLOAT4_FORCE_INLINE float float4_w(float4_sse_t _a) { - const float4_t wwww = float4_swiz_wwww(_a); + const float4_sse_t wwww = float4_swiz_wwww(_a); const float result = _mm_cvtss_f32(wwww); return result; } - BX_FLOAT4_FORCE_INLINE float4_t float4_ld(const void* _ptr) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_ld(const void* _ptr) { return _mm_load_ps(reinterpret_cast(_ptr) ); } - BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_t _a) + template<> + BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_sse_t _a) { _mm_store_ps(reinterpret_cast(_ptr), _a); } - BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_t _a) + template<> + BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_sse_t _a) { _mm_store_ss(reinterpret_cast(_ptr), _a); } - BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_t _a) + template<> + BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_sse_t _a) { _mm_stream_ps(reinterpret_cast(_ptr), _a); } - BX_FLOAT4_FORCE_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_ld(float _x, float _y, float _z, float _w) { return _mm_set_ps(_w, _z, _y, _x); } - BX_FLOAT4_FORCE_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) { const __m128i set = _mm_set_epi32(_w, _z, _y, _x); - const float4_t result = _mm_castsi128_ps(set); - + const float4_sse_t result = _mm_castsi128_ps(set); + return result; } - BX_FLOAT4_FORCE_INLINE float4_t float4_splat(const void* _ptr) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_splat(const void* _ptr) { - const float4_t x___ = _mm_load_ss(reinterpret_cast(_ptr) ); - const float4_t result = float4_swiz_xxxx(x___); + const float4_sse_t x___ = _mm_load_ss(reinterpret_cast(_ptr) ); + const float4_sse_t result = float4_swiz_xxxx(x___); return result; } - BX_FLOAT4_FORCE_INLINE float4_t float4_splat(float _a) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_splat(float _a) { return _mm_set1_ps(_a); } - BX_FLOAT4_FORCE_INLINE float4_t float4_isplat(uint32_t _a) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_isplat(uint32_t _a) { const __m128i splat = _mm_set1_epi32(_a); - const float4_t result = _mm_castsi128_ps(splat); + const float4_sse_t result = _mm_castsi128_ps(splat); return result; } - BX_FLOAT4_FORCE_INLINE float4_t float4_zero() + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_zero() { return _mm_setzero_ps(); } - BX_FLOAT4_FORCE_INLINE float4_t float4_itof(float4_t _a) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_itof(float4_sse_t _a) { const __m128i itof = _mm_castps_si128(_a); - const float4_t result = _mm_cvtepi32_ps(itof); + const float4_sse_t result = _mm_cvtepi32_ps(itof); return result; } - BX_FLOAT4_FORCE_INLINE float4_t float4_ftoi(float4_t _a) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_ftoi(float4_sse_t _a) { const __m128i ftoi = _mm_cvtps_epi32(_a); - const float4_t result = _mm_castsi128_ps(ftoi); + const float4_sse_t result = _mm_castsi128_ps(ftoi); return result; } - BX_FLOAT4_FORCE_INLINE float4_t float4_round(float4_t _a) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_round(float4_sse_t _a) { #if defined(__SSE4_1__) return _mm_round_ps(_a, _MM_FROUND_NINT); #else const __m128i round = _mm_cvtps_epi32(_a); - const float4_t result = _mm_cvtepi32_ps(round); + const float4_sse_t result = _mm_cvtepi32_ps(round); return result; #endif // defined(__SSE4_1__) } - BX_FLOAT4_FORCE_INLINE float4_t float4_add(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_add(float4_sse_t _a, float4_sse_t _b) { return _mm_add_ps(_a, _b); } - BX_FLOAT4_FORCE_INLINE float4_t float4_sub(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_sub(float4_sse_t _a, float4_sse_t _b) { return _mm_sub_ps(_a, _b); } - BX_FLOAT4_FORCE_INLINE float4_t float4_mul(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_mul(float4_sse_t _a, float4_sse_t _b) { return _mm_mul_ps(_a, _b); } - BX_FLOAT4_FORCE_INLINE float4_t float4_div(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_div(float4_sse_t _a, float4_sse_t _b) { return _mm_div_ps(_a, _b); } - BX_FLOAT4_FORCE_INLINE float4_t float4_rcp_est(float4_t _a) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_rcp_est(float4_sse_t _a) { return _mm_rcp_ps(_a); } - BX_FLOAT4_FORCE_INLINE float4_t float4_sqrt(float4_t _a) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_sqrt(float4_sse_t _a) { return _mm_sqrt_ps(_a); } - BX_FLOAT4_FORCE_INLINE float4_t float4_rsqrt_est(float4_t _a) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_rsqrt_est(float4_sse_t _a) { return _mm_rsqrt_ps(_a); } + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_dot3(float4_sse_t _a, float4_sse_t _b) + { #if defined(__SSE4_1__) - BX_FLOAT4_FORCE_INLINE float4_t float4_dot3(float4_t _a, float4_t _b) - { return _mm_dp_ps(_a, _b, 0x77); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_dot(float4_t _a, float4_t _b) - { - return _mm_dp_ps(_a, _b, 0xFF); - } +#else + return float4_dot3_ni(_a, _b); #endif // defined(__SSE4__) + } - BX_FLOAT4_FORCE_INLINE float4_t float4_cmpeq(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_dot(float4_sse_t _a, float4_sse_t _b) + { +#if defined(__SSE4_1__) + return _mm_dp_ps(_a, _b, 0xFF); +#else + return float4_dot_ni(_a, _b); +#endif // defined(__SSE4__) + } + + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_cmpeq(float4_sse_t _a, float4_sse_t _b) { return _mm_cmpeq_ps(_a, _b); } - BX_FLOAT4_FORCE_INLINE float4_t float4_cmplt(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_cmplt(float4_sse_t _a, float4_sse_t _b) { return _mm_cmplt_ps(_a, _b); } - BX_FLOAT4_FORCE_INLINE float4_t float4_cmple(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_cmple(float4_sse_t _a, float4_sse_t _b) { return _mm_cmple_ps(_a, _b); } - BX_FLOAT4_FORCE_INLINE float4_t float4_cmpgt(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_cmpgt(float4_sse_t _a, float4_sse_t _b) { return _mm_cmpgt_ps(_a, _b); } - BX_FLOAT4_FORCE_INLINE float4_t float4_cmpge(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_cmpge(float4_sse_t _a, float4_sse_t _b) { return _mm_cmpge_ps(_a, _b); } - BX_FLOAT4_FORCE_INLINE float4_t float4_min(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_min(float4_sse_t _a, float4_sse_t _b) { return _mm_min_ps(_a, _b); } - BX_FLOAT4_FORCE_INLINE float4_t float4_max(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_max(float4_sse_t _a, float4_sse_t _b) { return _mm_max_ps(_a, _b); } - BX_FLOAT4_FORCE_INLINE float4_t float4_and(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_and(float4_sse_t _a, float4_sse_t _b) { return _mm_and_ps(_a, _b); } - BX_FLOAT4_FORCE_INLINE float4_t float4_andc(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_andc(float4_sse_t _a, float4_sse_t _b) { return _mm_andnot_ps(_b, _a); } - BX_FLOAT4_FORCE_INLINE float4_t float4_or(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_or(float4_sse_t _a, float4_sse_t _b) { return _mm_or_ps(_a, _b); } - BX_FLOAT4_FORCE_INLINE float4_t float4_xor(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_xor(float4_sse_t _a, float4_sse_t _b) { return _mm_xor_ps(_a, _b); } - BX_FLOAT4_FORCE_INLINE float4_t float4_sll(float4_t _a, int _count) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_sll(float4_sse_t _a, int _count) { const __m128i a = _mm_castps_si128(_a); const __m128i shift = _mm_slli_epi32(a, _count); - const float4_t result = _mm_castsi128_ps(shift); + const float4_sse_t result = _mm_castsi128_ps(shift); return result; } - BX_FLOAT4_FORCE_INLINE float4_t float4_srl(float4_t _a, int _count) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_srl(float4_sse_t _a, int _count) { const __m128i a = _mm_castps_si128(_a); const __m128i shift = _mm_srli_epi32(a, _count); - const float4_t result = _mm_castsi128_ps(shift); + const float4_sse_t result = _mm_castsi128_ps(shift); return result; } - BX_FLOAT4_FORCE_INLINE float4_t float4_sra(float4_t _a, int _count) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_sra(float4_sse_t _a, int _count) { const __m128i a = _mm_castps_si128(_a); const __m128i shift = _mm_srai_epi32(a, _count); - const float4_t result = _mm_castsi128_ps(shift); + const float4_sse_t result = _mm_castsi128_ps(shift); return result; } - BX_FLOAT4_FORCE_INLINE float4_t float4_icmpeq(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_icmpeq(float4_sse_t _a, float4_sse_t _b) { const __m128i tmp0 = _mm_castps_si128(_a); const __m128i tmp1 = _mm_castps_si128(_b); const __m128i tmp2 = _mm_cmpeq_epi32(tmp0, tmp1); - const float4_t result = _mm_castsi128_ps(tmp2); + const float4_sse_t result = _mm_castsi128_ps(tmp2); return result; } - BX_FLOAT4_FORCE_INLINE float4_t float4_icmplt(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_icmplt(float4_sse_t _a, float4_sse_t _b) { const __m128i tmp0 = _mm_castps_si128(_a); const __m128i tmp1 = _mm_castps_si128(_b); const __m128i tmp2 = _mm_cmplt_epi32(tmp0, tmp1); - const float4_t result = _mm_castsi128_ps(tmp2); + const float4_sse_t result = _mm_castsi128_ps(tmp2); return result; } - BX_FLOAT4_FORCE_INLINE float4_t float4_icmpgt(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_icmpgt(float4_sse_t _a, float4_sse_t _b) { const __m128i tmp0 = _mm_castps_si128(_a); const __m128i tmp1 = _mm_castps_si128(_b); const __m128i tmp2 = _mm_cmpgt_epi32(tmp0, tmp1); - const float4_t result = _mm_castsi128_ps(tmp2); + const float4_sse_t result = _mm_castsi128_ps(tmp2); return result; } -#if defined(__SSE4_1__) - BX_FLOAT4_FORCE_INLINE float4_t float4_imin(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_imin(float4_sse_t _a, float4_sse_t _b) { +#if defined(__SSE4_1__) const __m128i tmp0 = _mm_castps_si128(_a); const __m128i tmp1 = _mm_castps_si128(_b); const __m128i tmp2 = _mm_min_epi32(tmp0, tmp1); - const float4_t result = _mm_castsi128_ps(tmp2); + const float4_sse_t result = _mm_castsi128_ps(tmp2); return result; +#else + return float4_imin_ni(_a, _b); +#endif // defined(__SSE4_1__) } - BX_FLOAT4_FORCE_INLINE float4_t float4_imax(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_imax(float4_sse_t _a, float4_sse_t _b) { +#if defined(__SSE4_1__) const __m128i tmp0 = _mm_castps_si128(_a); const __m128i tmp1 = _mm_castps_si128(_b); const __m128i tmp2 = _mm_max_epi32(tmp0, tmp1); - const float4_t result = _mm_castsi128_ps(tmp2); + const float4_sse_t result = _mm_castsi128_ps(tmp2); return result; - } +#else + return float4_imax_ni(_a, _b); #endif // defined(__SSE4_1__) + } - BX_FLOAT4_FORCE_INLINE float4_t float4_iadd(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_iadd(float4_sse_t _a, float4_sse_t _b) { const __m128i a = _mm_castps_si128(_a); const __m128i b = _mm_castps_si128(_b); const __m128i add = _mm_add_epi32(a, b); - const float4_t result = _mm_castsi128_ps(add); + const float4_sse_t result = _mm_castsi128_ps(add); return result; } - BX_FLOAT4_FORCE_INLINE float4_t float4_isub(float4_t _a, float4_t _b) + template<> + BX_FLOAT4_FORCE_INLINE float4_sse_t float4_isub(float4_sse_t _a, float4_sse_t _b) { const __m128i a = _mm_castps_si128(_a); const __m128i b = _mm_castps_si128(_b); const __m128i sub = _mm_sub_epi32(a, b); - const float4_t result = _mm_castsi128_ps(sub); + const float4_sse_t result = _mm_castsi128_ps(sub); return result; } + template<> + BX_FLOAT4_INLINE float4_sse_t float4_shuf_xAzC(float4_sse_t _a, float4_sse_t _b) + { + return float4_shuf_xAzC_ni(_a, _b); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_shuf_yBwD(float4_sse_t _a, float4_sse_t _b) + { + return float4_shuf_yBwD_ni(_a, _b); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_rcp(float4_sse_t _a) + { + return float4_rcp_ni(_a); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_orx(float4_sse_t _a) + { + return float4_orx_ni(_a); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_orc(float4_sse_t _a, float4_sse_t _b) + { + return float4_orc_ni(_a, _b); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_neg(float4_sse_t _a) + { + return float4_neg_ni(_a); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_madd(float4_sse_t _a, float4_sse_t _b, float4_sse_t _c) + { + return float4_madd_ni(_a, _b, _c); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_nmsub(float4_sse_t _a, float4_sse_t _b, float4_sse_t _c) + { + return float4_nmsub_ni(_a, _b, _c); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_div_nr(float4_sse_t _a, float4_sse_t _b) + { + return float4_div_nr_ni(_a, _b); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_selb(float4_sse_t _mask, float4_sse_t _a, float4_sse_t _b) + { + return float4_selb_ni(_mask, _a, _b); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_sels(float4_sse_t _test, float4_sse_t _a, float4_sse_t _b) + { + return float4_sels_ni(_test, _a, _b); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_not(float4_sse_t _a) + { + return float4_not_ni(_a); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_abs(float4_sse_t _a) + { + return float4_abs_ni(_a); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_clamp(float4_sse_t _a, float4_sse_t _min, float4_sse_t _max) + { + return float4_clamp_ni(_a, _min, _max); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_lerp(float4_sse_t _a, float4_sse_t _b, float4_sse_t _s) + { + return float4_lerp_ni(_a, _b, _s); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_rsqrt(float4_sse_t _a) + { + return float4_rsqrt_ni(_a); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_rsqrt_nr(float4_sse_t _a) + { + return float4_rsqrt_nr_ni(_a); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_rsqrt_carmack(float4_sse_t _a) + { + return float4_rsqrt_carmack_ni(_a); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_sqrt_nr(float4_sse_t _a) + { + return float4_sqrt_nr_ni(_a); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_log2(float4_sse_t _a) + { + return float4_log2_ni(_a); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_exp2(float4_sse_t _a) + { + return float4_exp2_ni(_a); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_pow(float4_sse_t _a, float4_sse_t _b) + { + return float4_pow_ni(_a, _b); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_cross3(float4_sse_t _a, float4_sse_t _b) + { + return float4_cross3_ni(_a, _b); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_normalize3(float4_sse_t _a) + { + return float4_normalize3_ni(_a); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_ceil(float4_sse_t _a) + { + return float4_ceil_ni(_a); + } + + template<> + BX_FLOAT4_INLINE float4_sse_t float4_floor(float4_sse_t _a) + { + return float4_floor_ni(_a); + } + + typedef float4_sse_t float4_t; + } // namespace bx -#define float4_shuf_xAzC float4_shuf_xAzC_ni -#define float4_shuf_yBwD float4_shuf_yBwD_ni -#define float4_rcp float4_rcp_ni -#define float4_orx float4_orx_ni -#define float4_orc float4_orc_ni -#define float4_neg float4_neg_ni -#define float4_madd float4_madd_ni -#define float4_nmsub float4_nmsub_ni -#define float4_div_nr float4_div_nr_ni -#define float4_selb float4_selb_ni -#define float4_sels float4_sels_ni -#define float4_not float4_not_ni -#define float4_abs float4_abs_ni -#define float4_clamp float4_clamp_ni -#define float4_lerp float4_lerp_ni -#define float4_rsqrt float4_rsqrt_ni -#define float4_rsqrt_nr float4_rsqrt_nr_ni -#define float4_rsqrt_carmack float4_rsqrt_carmack_ni -#define float4_sqrt_nr float4_sqrt_nr_ni -#define float4_log2 float4_log2_ni -#define float4_exp2 float4_exp2_ni -#define float4_pow float4_pow_ni -#define float4_cross3 float4_cross3_ni -#define float4_normalize3 float4_normalize3_ni -#define float4_ceil float4_ceil_ni -#define float4_floor float4_floor_ni - -#if !defined(__SSE4_1__) -# define float4_dot3 float4_dot3_ni -# define float4_dot float4_dot_ni -# define float4_imin float4_imin_ni -# define float4_imax float4_imax_ni -#endif // defined(__SSE4_1__) - -#include "float4_ni.h" - #endif // BX_FLOAT4_SSE_H_HEADER_GUARD diff --git a/include/bx/float4_t.h b/include/bx/float4_t.h index 78fd4e8..70bad40 100644 --- a/include/bx/float4_t.h +++ b/include/bx/float4_t.h @@ -9,7 +9,347 @@ #include "bx.h" #define BX_FLOAT4_FORCE_INLINE BX_FORCE_INLINE -#define BX_FLOAT4_INLINE static inline +#define BX_FLOAT4_INLINE inline + +#if defined(__SSE2__) || (BX_COMPILER_MSVC && (BX_ARCH_64BIT || _M_IX86_FP >= 2) ) +# include // __m128i +# if defined(__SSE4_1__) +# include +# endif // defined(__SSE4_1__) +# include // __m128 + +namespace bx +{ + typedef __m128 float4_sse_t; + +} // namespace bx + +#elif defined(__ARM_NEON__) && !BX_COMPILER_CLANG +# include + +namespace bx +{ + typedef float32x4_t float4_neon_t; + +} // namespace bx + +#elif BX_COMPILER_CLANG \ + && !BX_PLATFORM_EMSCRIPTEN \ + && !BX_PLATFORM_IOS \ + && BX_CLANG_HAS_EXTENSION(attribute_ext_vector_type) +# include + +namespace bx +{ + typedef union float4_langext_t + { + float __attribute__((vector_size(16))) vf; + int32_t __attribute__((vector_size(16))) vi; + uint32_t __attribute__((vector_size(16))) vu; + float fxyzw[4]; + int32_t ixyzw[4]; + uint32_t uxyzw[4]; + + } float4_langext_t; +} // namespace bx +#endif // + +namespace bx +{ + typedef union float4_ref_t + { + float fxyzw[4]; + int32_t ixyzw[4]; + uint32_t uxyzw[4]; + + } float4_ref_t; +} // namespace bx + +namespace bx +{ +#define ELEMx 0 +#define ELEMy 1 +#define ELEMz 2 +#define ELEMw 3 +#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ + template \ + BX_FLOAT4_FORCE_INLINE Ty float4_swiz_##_x##_y##_z##_w(Ty _a); +#include "float4_swizzle.inl" + +#undef IMPLEMENT_SWIZZLE +#undef ELEMw +#undef ELEMz +#undef ELEMy +#undef ELEMx + +#define IMPLEMENT_TEST(_xyzw) \ + template \ + BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(Ty _test); \ + \ + template \ + BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(Ty _test) + +IMPLEMENT_TEST(x ); +IMPLEMENT_TEST(y ); +IMPLEMENT_TEST(xy ); +IMPLEMENT_TEST(z ); +IMPLEMENT_TEST(xz ); +IMPLEMENT_TEST(yz ); +IMPLEMENT_TEST(xyz ); +IMPLEMENT_TEST(w ); +IMPLEMENT_TEST(xw ); +IMPLEMENT_TEST(yw ); +IMPLEMENT_TEST(xyw ); +IMPLEMENT_TEST(zw ); +IMPLEMENT_TEST(xzw ); +IMPLEMENT_TEST(yzw ); +IMPLEMENT_TEST(xyzw); +#undef IMPLEMENT_TEST + + template + BX_FLOAT4_FORCE_INLINE Ty float4_shuf_xyAB(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_shuf_ABxy(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_shuf_CDzw(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_shuf_zwCD(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_shuf_xAyB(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_shuf_yBxA(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_shuf_zCwD(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_shuf_CzDw(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE float float4_x(Ty _a); + + template + BX_FLOAT4_FORCE_INLINE float float4_y(Ty _a); + + template + BX_FLOAT4_FORCE_INLINE float float4_z(Ty _a); + + template + BX_FLOAT4_FORCE_INLINE float float4_w(Ty _a); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_ld(const void* _ptr); + + template + BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, Ty _a); + + template + BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, Ty _a); + + template + BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, Ty _a); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_ld(float _x, float _y, float _z, float _w); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_splat(const void* _ptr); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_splat(float _a); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_isplat(uint32_t _a); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_zero(); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_itof(Ty _a); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_ftoi(Ty _a); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_round(Ty _a); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_add(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_sub(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_mul(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_div(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_rcp_est(Ty _a); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_sqrt(Ty _a); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_rsqrt_est(Ty _a); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_dot3(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_dot(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_cmpeq(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_cmplt(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_cmple(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_cmpgt(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_cmpge(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_min(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_max(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_and(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_andc(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_or(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_xor(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_sll(Ty _a, int _count); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_srl(Ty _a, int _count); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_sra(Ty _a, int _count); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_icmpeq(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_icmplt(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_icmpgt(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_imin(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_imax(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_iadd(Ty _a, Ty _b); + + template + BX_FLOAT4_FORCE_INLINE Ty float4_isub(Ty _a, Ty _b); + + template + BX_FLOAT4_INLINE Ty float4_shuf_xAzC(Ty _a, Ty _b); + + template + BX_FLOAT4_INLINE Ty float4_shuf_yBwD(Ty _a, Ty _b); + + template + BX_FLOAT4_INLINE Ty float4_rcp(Ty _a); + + template + BX_FLOAT4_INLINE Ty float4_orx(Ty _a); + + template + BX_FLOAT4_INLINE Ty float4_orc(Ty _a, Ty _b); + + template + BX_FLOAT4_INLINE Ty float4_neg(Ty _a); + + template + BX_FLOAT4_INLINE Ty float4_madd(Ty _a, Ty _b, Ty _c); + + template + BX_FLOAT4_INLINE Ty float4_nmsub(Ty _a, Ty _b, Ty _c); + + template + BX_FLOAT4_INLINE Ty float4_div_nr(Ty _a, Ty _b); + + template + BX_FLOAT4_INLINE Ty float4_selb(Ty _mask, Ty _a, Ty _b); + + template + BX_FLOAT4_INLINE Ty float4_sels(Ty _test, Ty _a, Ty _b); + + template + BX_FLOAT4_INLINE Ty float4_not(Ty _a); + + template + BX_FLOAT4_INLINE Ty float4_abs(Ty _a); + + template + BX_FLOAT4_INLINE Ty float4_clamp(Ty _a, Ty _min, Ty _max); + + template + BX_FLOAT4_INLINE Ty float4_lerp(Ty _a, Ty _b, Ty _s); + + template + BX_FLOAT4_INLINE Ty float4_rsqrt(Ty _a); + + template + BX_FLOAT4_INLINE Ty float4_rsqrt_nr(Ty _a); + + template + BX_FLOAT4_INLINE Ty float4_rsqrt_carmack(Ty _a); + + template + BX_FLOAT4_INLINE Ty float4_sqrt_nr(Ty _a); + + template + BX_FLOAT4_INLINE Ty float4_log2(Ty _a); + + template + BX_FLOAT4_INLINE Ty float4_exp2(Ty _a); + + template + BX_FLOAT4_INLINE Ty float4_pow(Ty _a, Ty _b); + + template + BX_FLOAT4_INLINE Ty float4_cross3(Ty _a, Ty _b); + + template + BX_FLOAT4_INLINE Ty float4_normalize3(Ty _a); + + template + BX_FLOAT4_INLINE Ty float4_ceil(Ty _a); + + template + BX_FLOAT4_INLINE Ty float4_floor(Ty _a); + +} // namespace bx #if defined(__SSE2__) || (BX_COMPILER_MSVC && (BX_ARCH_64BIT || _M_IX86_FP >= 2) ) # include "float4_sse.h" @@ -29,7 +369,50 @@ # pragma message("************************************\nUsing SIMD reference implementation!\n************************************") # endif // BX_FLOAT4_WARN_REFERENCE_IMPL -# include "float4_ref.h" +namespace bx +{ + typedef float4_ref_t float4_t; +} #endif // +#include "float4_ref.h" + +namespace bx +{ + BX_FLOAT4_FORCE_INLINE float4_t float4_zero() + { + return float4_zero(); + } + + BX_FLOAT4_FORCE_INLINE float4_t float4_ld(const void* _ptr) + { + return float4_ld(_ptr); + } + + BX_FLOAT4_FORCE_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w) + { + return float4_ld(_x, _y, _z, _w); + } + + BX_FLOAT4_FORCE_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) + { + return float4_ild(_x, _y, _z, _w); + } + + BX_FLOAT4_FORCE_INLINE float4_t float4_splat(const void* _ptr) + { + return float4_splat(_ptr); + } + + BX_FLOAT4_FORCE_INLINE float4_t float4_splat(float _a) + { + return float4_splat(_a); + } + + BX_FLOAT4_FORCE_INLINE float4_t float4_isplat(uint32_t _a) + { + return float4_isplat(_a); + } +} + #endif // BX_FLOAT4_T_H_HEADER_GUARD diff --git a/include/bx/macros.h b/include/bx/macros.h index e8780b1..870f872 100644 --- a/include/bx/macros.h +++ b/include/bx/macros.h @@ -63,7 +63,7 @@ #if BX_COMPILER_GCC || BX_COMPILER_CLANG # define BX_ALIGN_DECL(_align, _decl) _decl __attribute__( (aligned(_align) ) ) # define BX_ALLOW_UNUSED __attribute__( (unused) ) -# define BX_FORCE_INLINE __extension__ static __inline __attribute__( (__always_inline__) ) +# define BX_FORCE_INLINE inline __attribute__( (__always_inline__) ) # define BX_FUNCTION __PRETTY_FUNCTION__ # define BX_LIKELY(_x) __builtin_expect(!!(_x), 1) # define BX_UNLIKELY(_x) __builtin_expect(!!(_x), 0) @@ -71,7 +71,7 @@ # define BX_NO_RETURN __attribute__( (noreturn) ) # define BX_NO_VTABLE # define BX_OVERRIDE -# define BX_PRINTF_ARGS(_format, _args) __attribute__ ( (format(__printf__, _format, _args) ) ) +# define BX_PRINTF_ARGS(_format, _args) __attribute__( (format(__printf__, _format, _args) ) ) # if BX_CLANG_HAS_FEATURE(cxx_thread_local) # define BX_THREAD_LOCAL __thread # endif // BX_COMPILER_CLANG