mirror of
https://github.com/bkaradzic/bx.git
synced 2026-02-17 20:52:37 +01:00
Refactoring SIMD code.
This commit is contained in:
@@ -6,446 +6,6 @@
|
||||
#ifndef BX_FLOAT4_LANGEXT_H_HEADER_GUARD
|
||||
#define BX_FLOAT4_LANGEXT_H_HEADER_GUARD
|
||||
|
||||
#include <math.h>
|
||||
|
||||
namespace bx
|
||||
{
|
||||
typedef union float4_t
|
||||
{
|
||||
float __attribute__((vector_size(16))) vf;
|
||||
int32_t __attribute__((vector_size(16))) vi;
|
||||
uint32_t __attribute__((vector_size(16))) vu;
|
||||
float fxyzw[4];
|
||||
int32_t ixyzw[4];
|
||||
uint32_t uxyzw[4];
|
||||
|
||||
} float4_t;
|
||||
|
||||
#define ELEMx 0
|
||||
#define ELEMy 1
|
||||
#define ELEMz 2
|
||||
#define ELEMw 3
|
||||
#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \
|
||||
{ \
|
||||
float4_t result; \
|
||||
result.vf = __builtin_shufflevector(_a.vf, _a.vf, ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w); \
|
||||
return result; \
|
||||
}
|
||||
|
||||
#include "float4_swizzle.inl"
|
||||
|
||||
#undef IMPLEMENT_SWIZZLE
|
||||
#undef ELEMw
|
||||
#undef ELEMz
|
||||
#undef ELEMy
|
||||
#undef ELEMx
|
||||
|
||||
#define IMPLEMENT_TEST(_xyzw, _mask) \
|
||||
BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_t _test) \
|
||||
{ \
|
||||
uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \
|
||||
| ( (_test.uxyzw[2]>>31)<<2) \
|
||||
| ( (_test.uxyzw[1]>>31)<<1) \
|
||||
| ( _test.uxyzw[0]>>31) \
|
||||
; \
|
||||
return 0 != (tmp&(_mask) ); \
|
||||
} \
|
||||
\
|
||||
BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_t _test) \
|
||||
{ \
|
||||
uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \
|
||||
| ( (_test.uxyzw[2]>>31)<<2) \
|
||||
| ( (_test.uxyzw[1]>>31)<<1) \
|
||||
| ( _test.uxyzw[0]>>31) \
|
||||
; \
|
||||
return (_mask) == (tmp&(_mask) ); \
|
||||
}
|
||||
|
||||
IMPLEMENT_TEST(x , 0x1);
|
||||
IMPLEMENT_TEST(y , 0x2);
|
||||
IMPLEMENT_TEST(xy , 0x3);
|
||||
IMPLEMENT_TEST(z , 0x4);
|
||||
IMPLEMENT_TEST(xz , 0x5);
|
||||
IMPLEMENT_TEST(yz , 0x6);
|
||||
IMPLEMENT_TEST(xyz , 0x7);
|
||||
IMPLEMENT_TEST(w , 0x8);
|
||||
IMPLEMENT_TEST(xw , 0x9);
|
||||
IMPLEMENT_TEST(yw , 0xa);
|
||||
IMPLEMENT_TEST(xyw , 0xb);
|
||||
IMPLEMENT_TEST(zw , 0xc);
|
||||
IMPLEMENT_TEST(xzw , 0xd);
|
||||
IMPLEMENT_TEST(yzw , 0xe);
|
||||
IMPLEMENT_TEST(xyzw , 0xf);
|
||||
|
||||
#undef IMPLEMENT_TEST
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 1, 4, 5);
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_shufflevector(_a.vf, _b.vf, 4, 5, 0, 1);
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_shufflevector(_a.vf, _b.vf, 5, 7, 2, 3);
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_shufflevector(_a.vf, _b.vf, 2, 3, 5, 7);
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 4, 1, 5);
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_shufflevector(_a.vf, _b.vf, 1, 5, 0, 4);
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_shufflevector(_a.vf, _b.vf, 2, 6, 3, 7);
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_shufflevector(_a.vf, _b.vf, 6, 2, 7, 3);
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xAzC(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 4, 2, 6);
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_yBwD(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_shufflevector(_a.vf, _b.vf, 1, 5, 3, 7);
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float float4_x(float4_t _a)
|
||||
{
|
||||
return _a.fxyzw[0];
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float float4_y(float4_t _a)
|
||||
{
|
||||
return _a.fxyzw[1];
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float float4_z(float4_t _a)
|
||||
{
|
||||
return _a.fxyzw[2];
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float float4_w(float4_t _a)
|
||||
{
|
||||
return _a.fxyzw[3];
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_ld(const void* _ptr)
|
||||
{
|
||||
const uint32_t* input = reinterpret_cast<const uint32_t*>(_ptr);
|
||||
float4_t result;
|
||||
result.uxyzw[0] = input[0];
|
||||
result.uxyzw[1] = input[1];
|
||||
result.uxyzw[2] = input[2];
|
||||
result.uxyzw[3] = input[3];
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_t _a)
|
||||
{
|
||||
uint32_t* result = reinterpret_cast<uint32_t*>(_ptr);
|
||||
result[0] = _a.uxyzw[0];
|
||||
result[1] = _a.uxyzw[1];
|
||||
result[2] = _a.uxyzw[2];
|
||||
result[3] = _a.uxyzw[3];
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_t _a)
|
||||
{
|
||||
uint32_t* result = reinterpret_cast<uint32_t*>(_ptr);
|
||||
result[0] = _a.uxyzw[0];
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_t _a)
|
||||
{
|
||||
uint32_t* result = reinterpret_cast<uint32_t*>(_ptr);
|
||||
result[0] = _a.uxyzw[0];
|
||||
result[1] = _a.uxyzw[1];
|
||||
result[2] = _a.uxyzw[2];
|
||||
result[3] = _a.uxyzw[3];
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = { _x, _y, _z, _w };
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
|
||||
{
|
||||
float4_t result;
|
||||
result.vu = { _x, _y, _z, _w };
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_splat(const void* _ptr)
|
||||
{
|
||||
const uint32_t val = *reinterpret_cast<const uint32_t*>(_ptr);
|
||||
float4_t result;
|
||||
result.vu = { val, val, val, val };
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_splat(float _a)
|
||||
{
|
||||
return float4_ld(_a, _a, _a, _a);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_isplat(uint32_t _a)
|
||||
{
|
||||
return float4_ild(_a, _a, _a, _a);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_zero()
|
||||
{
|
||||
return float4_ild(0, 0, 0, 0);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_itof(float4_t _a)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_convertvector(_a.vi, float __attribute__((vector_size(16))) );
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_ftoi(float4_t _a)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = __builtin_convertvector(_a.vf, int32_t __attribute__((vector_size(16))) );
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_round(float4_t _a)
|
||||
{
|
||||
const float4_t tmp = float4_ftoi(_a);
|
||||
const float4_t result = float4_itof(tmp);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_add(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = _a.vf + _b.vf;
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_sub(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = _a.vf - _b.vf;
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_mul(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = _a.vf * _b.vf;
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_div(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = _a.vf / _b.vf;
|
||||
return result;
|
||||
}
|
||||
|
||||
#if 0
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_rcp_est(float4_t _a)
|
||||
{
|
||||
float4_t result;
|
||||
const float4_t one = float4_splat(1.0f);
|
||||
result.vf = one / _a.vf;
|
||||
return result;
|
||||
}
|
||||
#endif // 0
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_sqrt(float4_t _a)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf[0] = sqrtf(_a.vf[0]);
|
||||
result.vf[1] = sqrtf(_a.vf[1]);
|
||||
result.vf[2] = sqrtf(_a.vf[2]);
|
||||
result.vf[3] = sqrtf(_a.vf[3]);
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_rsqrt_est(float4_t _a)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf[0] = 1.0f / sqrtf(_a.vf[0]);
|
||||
result.vf[1] = 1.0f / sqrtf(_a.vf[1]);
|
||||
result.vf[2] = 1.0f / sqrtf(_a.vf[2]);
|
||||
result.vf[3] = 1.0f / sqrtf(_a.vf[3]);
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_cmpeq(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = _a.vf == _b.vf;
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_cmplt(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = _a.vf < _b.vf;
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_cmple(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = _a.vf <= _b.vf;
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_cmpgt(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = _a.vf > _b.vf;
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_cmpge(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = _a.vf >= _b.vf;
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_and(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vu = _a.vu & _b.vu;
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_andc(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vu = _a.vu & ~_b.vu;
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_or(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vu = _a.vu | _b.vu;
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_xor(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vu = _a.vu ^ _b.vu;
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_sll(float4_t _a, int _count)
|
||||
{
|
||||
float4_t result;
|
||||
const float4_t count = float4_isplat(_count);
|
||||
result.vu = _a.vu << count.vi;
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_srl(float4_t _a, int _count)
|
||||
{
|
||||
float4_t result;
|
||||
const float4_t count = float4_isplat(_count);
|
||||
result.vu = _a.vu >> count.vi;
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_sra(float4_t _a, int _count)
|
||||
{
|
||||
float4_t result;
|
||||
const float4_t count = float4_isplat(_count);
|
||||
result.vi = _a.vi >> count.vi;
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_icmpeq(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = _a.vi == _b.vi;
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_icmplt(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = _a.vi < _b.vi;
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_icmpgt(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = _a.vi > _b.vi;
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_iadd(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = _a.vi + _b.vi;
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_isub(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = _a.vi - _b.vi;
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace bx
|
||||
|
||||
#define float4_rcp float4_rcp_ni
|
||||
#define float4_orx float4_orx_ni
|
||||
#define float4_orc float4_orc_ni
|
||||
@@ -479,4 +39,474 @@ IMPLEMENT_TEST(xyzw , 0xf);
|
||||
#define float4_imax float4_imax_ni
|
||||
#include "float4_ni.h"
|
||||
|
||||
namespace bx
|
||||
{
|
||||
#define ELEMx 0
|
||||
#define ELEMy 1
|
||||
#define ELEMz 2
|
||||
#define ELEMw 3
|
||||
#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
|
||||
template<> \
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \
|
||||
{ \
|
||||
float4_t result; \
|
||||
result.vf = __builtin_shufflevector(_a.vf, _a.vf, ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w); \
|
||||
return result; \
|
||||
}
|
||||
|
||||
#include "float4_swizzle.inl"
|
||||
|
||||
#undef IMPLEMENT_SWIZZLE
|
||||
#undef ELEMw
|
||||
#undef ELEMz
|
||||
#undef ELEMy
|
||||
#undef ELEMx
|
||||
|
||||
#define IMPLEMENT_TEST(_xyzw, _mask) \
|
||||
template<> \
|
||||
BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_t _test) \
|
||||
{ \
|
||||
uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \
|
||||
| ( (_test.uxyzw[2]>>31)<<2) \
|
||||
| ( (_test.uxyzw[1]>>31)<<1) \
|
||||
| ( _test.uxyzw[0]>>31) \
|
||||
; \
|
||||
return 0 != (tmp&(_mask) ); \
|
||||
} \
|
||||
\
|
||||
template<> \
|
||||
BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_t _test) \
|
||||
{ \
|
||||
uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \
|
||||
| ( (_test.uxyzw[2]>>31)<<2) \
|
||||
| ( (_test.uxyzw[1]>>31)<<1) \
|
||||
| ( _test.uxyzw[0]>>31) \
|
||||
; \
|
||||
return (_mask) == (tmp&(_mask) ); \
|
||||
}
|
||||
|
||||
IMPLEMENT_TEST(x , 0x1);
|
||||
IMPLEMENT_TEST(y , 0x2);
|
||||
IMPLEMENT_TEST(xy , 0x3);
|
||||
IMPLEMENT_TEST(z , 0x4);
|
||||
IMPLEMENT_TEST(xz , 0x5);
|
||||
IMPLEMENT_TEST(yz , 0x6);
|
||||
IMPLEMENT_TEST(xyz , 0x7);
|
||||
IMPLEMENT_TEST(w , 0x8);
|
||||
IMPLEMENT_TEST(xw , 0x9);
|
||||
IMPLEMENT_TEST(yw , 0xa);
|
||||
IMPLEMENT_TEST(xyw , 0xb);
|
||||
IMPLEMENT_TEST(zw , 0xc);
|
||||
IMPLEMENT_TEST(xzw , 0xd);
|
||||
IMPLEMENT_TEST(yzw , 0xe);
|
||||
IMPLEMENT_TEST(xyzw , 0xf);
|
||||
|
||||
#undef IMPLEMENT_TEST
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 1, 4, 5);
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_shufflevector(_a.vf, _b.vf, 4, 5, 0, 1);
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_shufflevector(_a.vf, _b.vf, 5, 7, 2, 3);
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_shufflevector(_a.vf, _b.vf, 2, 3, 5, 7);
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 4, 1, 5);
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_shufflevector(_a.vf, _b.vf, 1, 5, 0, 4);
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_shufflevector(_a.vf, _b.vf, 2, 6, 3, 7);
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_shufflevector(_a.vf, _b.vf, 6, 2, 7, 3);
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xAzC(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 4, 2, 6);
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_yBwD(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_shufflevector(_a.vf, _b.vf, 1, 5, 3, 7);
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float float4_x(float4_t _a)
|
||||
{
|
||||
return _a.fxyzw[0];
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float float4_y(float4_t _a)
|
||||
{
|
||||
return _a.fxyzw[1];
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float float4_z(float4_t _a)
|
||||
{
|
||||
return _a.fxyzw[2];
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float float4_w(float4_t _a)
|
||||
{
|
||||
return _a.fxyzw[3];
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_ld(const void* _ptr)
|
||||
{
|
||||
const uint32_t* input = reinterpret_cast<const uint32_t*>(_ptr);
|
||||
float4_t result;
|
||||
result.uxyzw[0] = input[0];
|
||||
result.uxyzw[1] = input[1];
|
||||
result.uxyzw[2] = input[2];
|
||||
result.uxyzw[3] = input[3];
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_t _a)
|
||||
{
|
||||
uint32_t* result = reinterpret_cast<uint32_t*>(_ptr);
|
||||
result[0] = _a.uxyzw[0];
|
||||
result[1] = _a.uxyzw[1];
|
||||
result[2] = _a.uxyzw[2];
|
||||
result[3] = _a.uxyzw[3];
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_t _a)
|
||||
{
|
||||
uint32_t* result = reinterpret_cast<uint32_t*>(_ptr);
|
||||
result[0] = _a.uxyzw[0];
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_t _a)
|
||||
{
|
||||
uint32_t* result = reinterpret_cast<uint32_t*>(_ptr);
|
||||
result[0] = _a.uxyzw[0];
|
||||
result[1] = _a.uxyzw[1];
|
||||
result[2] = _a.uxyzw[2];
|
||||
result[3] = _a.uxyzw[3];
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = { _x, _y, _z, _w };
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
|
||||
{
|
||||
float4_t result;
|
||||
result.vu = { _x, _y, _z, _w };
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_splat(const void* _ptr)
|
||||
{
|
||||
const uint32_t val = *reinterpret_cast<const uint32_t*>(_ptr);
|
||||
float4_t result;
|
||||
result.vu = { val, val, val, val };
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_splat(float _a)
|
||||
{
|
||||
return float4_ld(_a, _a, _a, _a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_isplat(uint32_t _a)
|
||||
{
|
||||
return float4_ild(_a, _a, _a, _a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_zero()
|
||||
{
|
||||
return float4_ild(0, 0, 0, 0);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_itof(float4_t _a)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = __builtin_convertvector(_a.vi, float __attribute__((vector_size(16))) );
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_ftoi(float4_t _a)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = __builtin_convertvector(_a.vf, int32_t __attribute__((vector_size(16))) );
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_round(float4_t _a)
|
||||
{
|
||||
const float4_t tmp = float4_ftoi(_a);
|
||||
const float4_t result = float4_itof(tmp);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_add(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = _a.vf + _b.vf;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_sub(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = _a.vf - _b.vf;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_mul(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = _a.vf * _b.vf;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_div(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf = _a.vf / _b.vf;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_sqrt(float4_t _a)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf[0] = sqrtf(_a.vf[0]);
|
||||
result.vf[1] = sqrtf(_a.vf[1]);
|
||||
result.vf[2] = sqrtf(_a.vf[2]);
|
||||
result.vf[3] = sqrtf(_a.vf[3]);
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_rsqrt_est(float4_t _a)
|
||||
{
|
||||
float4_t result;
|
||||
result.vf[0] = 1.0f / sqrtf(_a.vf[0]);
|
||||
result.vf[1] = 1.0f / sqrtf(_a.vf[1]);
|
||||
result.vf[2] = 1.0f / sqrtf(_a.vf[2]);
|
||||
result.vf[3] = 1.0f / sqrtf(_a.vf[3]);
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_cmpeq(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = _a.vf == _b.vf;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_cmplt(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = _a.vf < _b.vf;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_cmple(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = _a.vf <= _b.vf;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_cmpgt(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = _a.vf > _b.vf;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_cmpge(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = _a.vf >= _b.vf;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_and(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vu = _a.vu & _b.vu;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_andc(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vu = _a.vu & ~_b.vu;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_or(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vu = _a.vu | _b.vu;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_xor(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vu = _a.vu ^ _b.vu;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_sll(float4_t _a, int _count)
|
||||
{
|
||||
float4_t result;
|
||||
const float4_t count = float4_isplat(_count);
|
||||
result.vu = _a.vu << count.vi;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_srl(float4_t _a, int _count)
|
||||
{
|
||||
float4_t result;
|
||||
const float4_t count = float4_isplat(_count);
|
||||
result.vu = _a.vu >> count.vi;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_sra(float4_t _a, int _count)
|
||||
{
|
||||
float4_t result;
|
||||
const float4_t count = float4_isplat(_count);
|
||||
result.vi = _a.vi >> count.vi;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_icmpeq(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = _a.vi == _b.vi;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_icmplt(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = _a.vi < _b.vi;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_icmpgt(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = _a.vi > _b.vi;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_iadd(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = _a.vi + _b.vi;
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_isub(float4_t _a, float4_t _b)
|
||||
{
|
||||
float4_t result;
|
||||
result.vi = _a.vi - _b.vi;
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace bx
|
||||
|
||||
#endif // BX_FLOAT4_LANGEXT_H_HEADER_GUARD
|
||||
|
||||
@@ -6,447 +6,6 @@
|
||||
#ifndef BX_FLOAT4_NEON_H_HEADER_GUARD
|
||||
#define BX_FLOAT4_NEON_H_HEADER_GUARD
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
namespace bx
|
||||
{
|
||||
typedef float32x4_t float4_t;
|
||||
|
||||
|
||||
#define ELEMx 0
|
||||
#define ELEMy 1
|
||||
#define ELEMz 2
|
||||
#define ELEMw 3
|
||||
#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \
|
||||
{ \
|
||||
return __builtin_shuffle(_a, (uint32x4_t){ ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w }); \
|
||||
}
|
||||
|
||||
#include "float4_swizzle.inl"
|
||||
|
||||
#undef IMPLEMENT_SWIZZLE
|
||||
#undef ELEMw
|
||||
#undef ELEMz
|
||||
#undef ELEMy
|
||||
#undef ELEMx
|
||||
|
||||
#define IMPLEMENT_TEST(_xyzw, _swizzle) \
|
||||
BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_t _test); \
|
||||
BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_t _test);
|
||||
|
||||
IMPLEMENT_TEST(x , xxxx);
|
||||
IMPLEMENT_TEST(y , yyyy);
|
||||
IMPLEMENT_TEST(xy , xyyy);
|
||||
IMPLEMENT_TEST(z , zzzz);
|
||||
IMPLEMENT_TEST(xz , xzzz);
|
||||
IMPLEMENT_TEST(yz , yzzz);
|
||||
IMPLEMENT_TEST(xyz , xyzz);
|
||||
IMPLEMENT_TEST(w , wwww);
|
||||
IMPLEMENT_TEST(xw , xwww);
|
||||
IMPLEMENT_TEST(yw , ywww);
|
||||
IMPLEMENT_TEST(xyw , xyww);
|
||||
IMPLEMENT_TEST(zw , zwww);
|
||||
IMPLEMENT_TEST(xzw , xzww);
|
||||
IMPLEMENT_TEST(yzw , yzww);
|
||||
IMPLEMENT_TEST(xyzw , xyzw);
|
||||
|
||||
#undef IMPLEMENT_TEST
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b)
|
||||
{
|
||||
return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 1, 4, 5 });
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b)
|
||||
{
|
||||
return __builtin_shuffle(_a, _b, (uint32x4_t){ 4, 5, 0, 1 });
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b)
|
||||
{
|
||||
return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 7, 2, 3 });
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b)
|
||||
{
|
||||
return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 3, 6, 7 });
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b)
|
||||
{
|
||||
return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 4, 1, 5 });
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b)
|
||||
{
|
||||
return __builtin_shuffle(_a, _b, (uint32x4_t){ 1, 5, 0, 4 });
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b)
|
||||
{
|
||||
return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 6, 3, 7 });
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b)
|
||||
{
|
||||
return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 2, 7, 3 });
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float float4_x(float4_t _a)
|
||||
{
|
||||
return vgetq_lane_f32(_a, 0);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float float4_y(float4_t _a)
|
||||
{
|
||||
return vgetq_lane_f32(_a, 1);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float float4_z(float4_t _a)
|
||||
{
|
||||
return vgetq_lane_f32(_a, 2);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float float4_w(float4_t _a)
|
||||
{
|
||||
return vgetq_lane_f32(_a, 3);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_ld(const void* _ptr)
|
||||
{
|
||||
return vld1q_f32( (const float32_t*)_ptr);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_t _a)
|
||||
{
|
||||
vst1q_f32( (float32_t*)_ptr, _a);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_t _a)
|
||||
{
|
||||
vst1q_lane_f32( (float32_t*)_ptr, _a, 0);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_t _a)
|
||||
{
|
||||
vst1q_f32( (float32_t*)_ptr, _a);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w)
|
||||
{
|
||||
const float32_t val[4] = {_x, _y, _z, _w};
|
||||
return float4_ld(val);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
|
||||
{
|
||||
const uint32_t val[4] = {_x, _y, _z, _w};
|
||||
const uint32x4_t tmp = vld1q_u32(val);
|
||||
const float4_t result = vreinterpretq_f32_u32(tmp);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_splat(const void* _ptr)
|
||||
{
|
||||
const float4_t tmp0 = vld1q_f32( (const float32_t*)_ptr);
|
||||
const float32x2_t tmp1 = vget_low_f32(tmp0);
|
||||
const float4_t result = vdupq_lane_f32(tmp1, 0);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_splat(float _a)
|
||||
{
|
||||
return vdupq_n_f32(_a);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_isplat(uint32_t _a)
|
||||
{
|
||||
const int32x4_t tmp = vdupq_n_s32(_a);
|
||||
const float4_t result = vreinterpretq_f32_s32(tmp);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_zero()
|
||||
{
|
||||
return float4_isplat(0);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_itof(float4_t _a)
|
||||
{
|
||||
const int32x4_t itof = vreinterpretq_s32_f32(_a);
|
||||
const float4_t result = vcvtq_f32_s32(itof);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_ftoi(float4_t _a)
|
||||
{
|
||||
const int32x4_t ftoi = vcvtq_s32_f32(_a);
|
||||
const float4_t result = vreinterpretq_f32_s32(ftoi);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_add(float4_t _a, float4_t _b)
|
||||
{
|
||||
return vaddq_f32(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_sub(float4_t _a, float4_t _b)
|
||||
{
|
||||
return vsubq_f32(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_mul(float4_t _a, float4_t _b)
|
||||
{
|
||||
return vmulq_f32(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_rcp_est(float4_t _a)
|
||||
{
|
||||
return vrecpeq_f32(_a);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_rsqrt_est(float4_t _a)
|
||||
{
|
||||
return vrsqrteq_f32(_a);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_cmpeq(float4_t _a, float4_t _b)
|
||||
{
|
||||
const uint32x4_t tmp = vceqq_f32(_a, _b);
|
||||
const float4_t result = vreinterpretq_f32_u32(tmp);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_cmplt(float4_t _a, float4_t _b)
|
||||
{
|
||||
const uint32x4_t tmp = vcltq_f32(_a, _b);
|
||||
const float4_t result = vreinterpretq_f32_u32(tmp);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_cmple(float4_t _a, float4_t _b)
|
||||
{
|
||||
const uint32x4_t tmp = vcleq_f32(_a, _b);
|
||||
const float4_t result = vreinterpretq_f32_u32(tmp);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_cmpgt(float4_t _a, float4_t _b)
|
||||
{
|
||||
const uint32x4_t tmp = vcgtq_f32(_a, _b);
|
||||
const float4_t result = vreinterpretq_f32_u32(tmp);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_cmpge(float4_t _a, float4_t _b)
|
||||
{
|
||||
const uint32x4_t tmp = vcgeq_f32(_a, _b);
|
||||
const float4_t result = vreinterpretq_f32_u32(tmp);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_min(float4_t _a, float4_t _b)
|
||||
{
|
||||
return vminq_f32(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_max(float4_t _a, float4_t _b)
|
||||
{
|
||||
return vmaxq_f32(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_and(float4_t _a, float4_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const int32x4_t tmp2 = vandq_s32(tmp0, tmp1);
|
||||
const float4_t result = vreinterpretq_f32_s32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_andc(float4_t _a, float4_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const int32x4_t tmp2 = vbicq_s32(tmp0, tmp1);
|
||||
const float4_t result = vreinterpretq_f32_s32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_or(float4_t _a, float4_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const int32x4_t tmp2 = vorrq_s32(tmp0, tmp1);
|
||||
const float4_t result = vreinterpretq_f32_s32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_xor(float4_t _a, float4_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const int32x4_t tmp2 = veorq_s32(tmp0, tmp1);
|
||||
const float4_t result = vreinterpretq_f32_s32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_sll(float4_t _a, int _count)
|
||||
{
|
||||
if (__builtin_constant_p(_count) )
|
||||
{
|
||||
const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
|
||||
const uint32x4_t tmp1 = vshlq_n_u32(tmp0, _count);
|
||||
const float4_t result = vreinterpretq_f32_u32(tmp1);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
|
||||
const int32x4_t shift = vdupq_n_s32(_count);
|
||||
const uint32x4_t tmp1 = vshlq_u32(tmp0, shift);
|
||||
const float4_t result = vreinterpretq_f32_u32(tmp1);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_srl(float4_t _a, int _count)
|
||||
{
|
||||
if (__builtin_constant_p(_count) )
|
||||
{
|
||||
const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
|
||||
const uint32x4_t tmp1 = vshrq_n_u32(tmp0, _count);
|
||||
const float4_t result = vreinterpretq_f32_u32(tmp1);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
|
||||
const int32x4_t shift = vdupq_n_s32(-_count);
|
||||
const uint32x4_t tmp1 = vshlq_u32(tmp0, shift);
|
||||
const float4_t result = vreinterpretq_f32_u32(tmp1);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_sra(float4_t _a, int _count)
|
||||
{
|
||||
if (__builtin_constant_p(_count) )
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vshrq_n_s32(tmp0, _count);
|
||||
const float4_t result = vreinterpretq_f32_s32(tmp1);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t shift = vdupq_n_s32(-_count);
|
||||
const int32x4_t tmp1 = vshlq_s32(tmp0, shift);
|
||||
const float4_t result = vreinterpretq_f32_s32(tmp1);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_madd(float4_t _a, float4_t _b, float4_t _c)
|
||||
{
|
||||
return vmlaq_f32(_c, _a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_nmsub(float4_t _a, float4_t _b, float4_t _c)
|
||||
{
|
||||
return vmlsq_f32(_c, _a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_icmpeq(float4_t _a, float4_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const uint32x4_t tmp2 = vceqq_s32(tmp0, tmp1);
|
||||
const float4_t result = vreinterpretq_f32_u32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_icmplt(float4_t _a, float4_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const uint32x4_t tmp2 = vcltq_s32(tmp0, tmp1);
|
||||
const float4_t result = vreinterpretq_f32_u32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_icmpgt(float4_t _a, float4_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const uint32x4_t tmp2 = vcgtq_s32(tmp0, tmp1);
|
||||
const float4_t result = vreinterpretq_f32_u32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_imin(float4_t _a, float4_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const int32x4_t tmp2 = vminq_s32(tmp0, tmp1);
|
||||
const float4_t result = vreinterpretq_f32_s32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_imax(float4_t _a, float4_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const int32x4_t tmp2 = vmaxq_s32(tmp0, tmp1);
|
||||
const float4_t result = vreinterpretq_f32_s32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_iadd(float4_t _a, float4_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const int32x4_t tmp2 = vaddq_s32(tmp0, tmp1);
|
||||
const float4_t result = vreinterpretq_f32_s32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_isub(float4_t _a, float4_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const int32x4_t tmp2 = vsubq_s32(tmp0, tmp1);
|
||||
const float4_t result = vreinterpretq_f32_s32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace bx
|
||||
|
||||
#define float4_shuf_xAzC float4_shuf_xAzC_ni
|
||||
#define float4_shuf_yBwD float4_shuf_yBwD_ni
|
||||
#define float4_rcp float4_rcp_ni
|
||||
#define float4_orx float4_orx_ni
|
||||
#define float4_orc float4_orc_ni
|
||||
@@ -480,45 +39,524 @@ IMPLEMENT_TEST(xyzw , xyzw);
|
||||
|
||||
namespace bx
|
||||
{
|
||||
#define IMPLEMENT_TEST(_xyzw, _swizzle) \
|
||||
BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_t _test) \
|
||||
#define ELEMx 0
|
||||
#define ELEMy 1
|
||||
#define ELEMz 2
|
||||
#define ELEMw 3
|
||||
#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
|
||||
template<> \
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_swiz_##_x##_y##_z##_w(float4_neon_t _a) \
|
||||
{ \
|
||||
const float4_t tmp0 = float4_swiz_##_swizzle(_test); \
|
||||
return __builtin_shuffle(_a, (uint32x4_t){ ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w }); \
|
||||
}
|
||||
|
||||
#include "float4_swizzle.inl"
|
||||
|
||||
#undef IMPLEMENT_SWIZZLE
|
||||
#undef ELEMw
|
||||
#undef ELEMz
|
||||
#undef ELEMy
|
||||
#undef ELEMx
|
||||
|
||||
#define IMPLEMENT_TEST(_xyzw, _swizzle) \
|
||||
template<> \
|
||||
BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_neon_t _test) \
|
||||
{ \
|
||||
const float4_neon_t tmp0 = float4_swiz_##_swizzle(_test); \
|
||||
return float4_test_any_ni(tmp0); \
|
||||
} \
|
||||
\
|
||||
BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_t _test) \
|
||||
template<> \
|
||||
BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_neon_t _test) \
|
||||
{ \
|
||||
const float4_t tmp0 = float4_swiz_##_swizzle(_test); \
|
||||
const float4_neon_t tmp0 = float4_swiz_##_swizzle(_test); \
|
||||
return float4_test_all_ni(tmp0); \
|
||||
}
|
||||
|
||||
IMPLEMENT_TEST(x , xxxx);
|
||||
IMPLEMENT_TEST(y , yyyy);
|
||||
IMPLEMENT_TEST(xy , xyyy);
|
||||
IMPLEMENT_TEST(z , zzzz);
|
||||
IMPLEMENT_TEST(xz , xzzz);
|
||||
IMPLEMENT_TEST(yz , yzzz);
|
||||
IMPLEMENT_TEST(xyz , xyzz);
|
||||
IMPLEMENT_TEST(w , wwww);
|
||||
IMPLEMENT_TEST(xw , xwww);
|
||||
IMPLEMENT_TEST(yw , ywww);
|
||||
IMPLEMENT_TEST(xyw , xyww);
|
||||
IMPLEMENT_TEST(zw , zwww);
|
||||
IMPLEMENT_TEST(xzw , xzww);
|
||||
IMPLEMENT_TEST(yzw , yzww);
|
||||
IMPLEMENT_TEST(x, xxxx);
|
||||
IMPLEMENT_TEST(y, yyyy);
|
||||
IMPLEMENT_TEST(xy, xyyy);
|
||||
IMPLEMENT_TEST(z, zzzz);
|
||||
IMPLEMENT_TEST(xz, xzzz);
|
||||
IMPLEMENT_TEST(yz, yzzz);
|
||||
IMPLEMENT_TEST(xyz, xyzz);
|
||||
IMPLEMENT_TEST(w, wwww);
|
||||
IMPLEMENT_TEST(xw, xwww);
|
||||
IMPLEMENT_TEST(yw, ywww);
|
||||
IMPLEMENT_TEST(xyw, xyww);
|
||||
IMPLEMENT_TEST(zw, zwww);
|
||||
IMPLEMENT_TEST(xzw, xzww);
|
||||
IMPLEMENT_TEST(yzw, yzww);
|
||||
#undef IMPLEMENT_TEST
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE bool float4_test_any_xyzw(float4_t _test)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE bool float4_test_any_xyzw(float4_neon_t _test)
|
||||
{
|
||||
return float4_test_any_ni(_test);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE bool float4_test_all_xyzw(float4_t _test)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE bool float4_test_all_xyzw(float4_neon_t _test)
|
||||
{
|
||||
return float4_test_all_ni(_test);
|
||||
}
|
||||
|
||||
#undef IMPLEMENT_TEST
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_xyAB(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 1, 4, 5 });
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_ABxy(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
return __builtin_shuffle(_a, _b, (uint32x4_t){ 4, 5, 0, 1 });
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_CDzw(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 7, 2, 3 });
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_zwCD(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 3, 6, 7 });
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_xAyB(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 4, 1, 5 });
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_yBxA(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
return __builtin_shuffle(_a, _b, (uint32x4_t){ 1, 5, 0, 4 });
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_zCwD(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 6, 3, 7 });
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_CzDw(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 2, 7, 3 });
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float float4_x(float4_neon_t _a)
|
||||
{
|
||||
return vgetq_lane_f32(_a, 0);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float float4_y(float4_neon_t _a)
|
||||
{
|
||||
return vgetq_lane_f32(_a, 1);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float float4_z(float4_neon_t _a)
|
||||
{
|
||||
return vgetq_lane_f32(_a, 2);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float float4_w(float4_neon_t _a)
|
||||
{
|
||||
return vgetq_lane_f32(_a, 3);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_ld(const void* _ptr)
|
||||
{
|
||||
return vld1q_f32( (const float32_t*)_ptr);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_neon_t _a)
|
||||
{
|
||||
vst1q_f32( (float32_t*)_ptr, _a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_neon_t _a)
|
||||
{
|
||||
vst1q_lane_f32( (float32_t*)_ptr, _a, 0);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_neon_t _a)
|
||||
{
|
||||
vst1q_f32( (float32_t*)_ptr, _a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_ld(float _x, float _y, float _z, float _w)
|
||||
{
|
||||
const float32_t val[4] = {_x, _y, _z, _w};
|
||||
return float4_ld<float4_neon_t>(val);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
|
||||
{
|
||||
const uint32_t val[4] = {_x, _y, _z, _w};
|
||||
const uint32x4_t tmp = vld1q_u32(val);
|
||||
const float4_neon_t result = vreinterpretq_f32_u32(tmp);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_splat(const void* _ptr)
|
||||
{
|
||||
const float4_neon_t tmp0 = vld1q_f32( (const float32_t*)_ptr);
|
||||
const float32x2_t tmp1 = vget_low_f32(tmp0);
|
||||
const float4_neon_t result = vdupq_lane_f32(tmp1, 0);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_splat(float _a)
|
||||
{
|
||||
return vdupq_n_f32(_a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_isplat(uint32_t _a)
|
||||
{
|
||||
const int32x4_t tmp = vdupq_n_s32(_a);
|
||||
const float4_neon_t result = vreinterpretq_f32_s32(tmp);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_zero()
|
||||
{
|
||||
return float4_isplat<float4_neon_t>(0);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_itof(float4_neon_t _a)
|
||||
{
|
||||
const int32x4_t itof = vreinterpretq_s32_f32(_a);
|
||||
const float4_neon_t result = vcvtq_f32_s32(itof);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_ftoi(float4_neon_t _a)
|
||||
{
|
||||
const int32x4_t ftoi = vcvtq_s32_f32(_a);
|
||||
const float4_neon_t result = vreinterpretq_f32_s32(ftoi);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_add(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
return vaddq_f32(_a, _b);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_sub(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
return vsubq_f32(_a, _b);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_mul(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
return vmulq_f32(_a, _b);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_rcp_est(float4_neon_t _a)
|
||||
{
|
||||
return vrecpeq_f32(_a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_rsqrt_est(float4_neon_t _a)
|
||||
{
|
||||
return vrsqrteq_f32(_a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_cmpeq(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
const uint32x4_t tmp = vceqq_f32(_a, _b);
|
||||
const float4_neon_t result = vreinterpretq_f32_u32(tmp);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_cmplt(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
const uint32x4_t tmp = vcltq_f32(_a, _b);
|
||||
const float4_neon_t result = vreinterpretq_f32_u32(tmp);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_cmple(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
const uint32x4_t tmp = vcleq_f32(_a, _b);
|
||||
const float4_neon_t result = vreinterpretq_f32_u32(tmp);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_cmpgt(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
const uint32x4_t tmp = vcgtq_f32(_a, _b);
|
||||
const float4_neon_t result = vreinterpretq_f32_u32(tmp);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_cmpge(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
const uint32x4_t tmp = vcgeq_f32(_a, _b);
|
||||
const float4_neon_t result = vreinterpretq_f32_u32(tmp);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_min(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
return vminq_f32(_a, _b);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_max(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
return vmaxq_f32(_a, _b);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_and(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const int32x4_t tmp2 = vandq_s32(tmp0, tmp1);
|
||||
const float4_neon_t result = vreinterpretq_f32_s32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_andc(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const int32x4_t tmp2 = vbicq_s32(tmp0, tmp1);
|
||||
const float4_neon_t result = vreinterpretq_f32_s32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_or(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const int32x4_t tmp2 = vorrq_s32(tmp0, tmp1);
|
||||
const float4_neon_t result = vreinterpretq_f32_s32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_xor(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const int32x4_t tmp2 = veorq_s32(tmp0, tmp1);
|
||||
const float4_neon_t result = vreinterpretq_f32_s32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_sll(float4_neon_t _a, int _count)
|
||||
{
|
||||
if (__builtin_constant_p(_count) )
|
||||
{
|
||||
const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
|
||||
const uint32x4_t tmp1 = vshlq_n_u32(tmp0, _count);
|
||||
const float4_neon_t result = vreinterpretq_f32_u32(tmp1);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
|
||||
const int32x4_t shift = vdupq_n_s32(_count);
|
||||
const uint32x4_t tmp1 = vshlq_u32(tmp0, shift);
|
||||
const float4_neon_t result = vreinterpretq_f32_u32(tmp1);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_srl(float4_neon_t _a, int _count)
|
||||
{
|
||||
if (__builtin_constant_p(_count) )
|
||||
{
|
||||
const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
|
||||
const uint32x4_t tmp1 = vshrq_n_u32(tmp0, _count);
|
||||
const float4_neon_t result = vreinterpretq_f32_u32(tmp1);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
|
||||
const int32x4_t shift = vdupq_n_s32(-_count);
|
||||
const uint32x4_t tmp1 = vshlq_u32(tmp0, shift);
|
||||
const float4_neon_t result = vreinterpretq_f32_u32(tmp1);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_sra(float4_neon_t _a, int _count)
|
||||
{
|
||||
if (__builtin_constant_p(_count) )
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vshrq_n_s32(tmp0, _count);
|
||||
const float4_neon_t result = vreinterpretq_f32_s32(tmp1);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t shift = vdupq_n_s32(-_count);
|
||||
const int32x4_t tmp1 = vshlq_s32(tmp0, shift);
|
||||
const float4_neon_t result = vreinterpretq_f32_s32(tmp1);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_madd(float4_neon_t _a, float4_neon_t _b, float4_neon_t _c)
|
||||
{
|
||||
return vmlaq_f32(_c, _a, _b);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_nmsub(float4_neon_t _a, float4_neon_t _b, float4_neon_t _c)
|
||||
{
|
||||
return vmlsq_f32(_c, _a, _b);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_icmpeq(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const uint32x4_t tmp2 = vceqq_s32(tmp0, tmp1);
|
||||
const float4_neon_t result = vreinterpretq_f32_u32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_icmplt(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const uint32x4_t tmp2 = vcltq_s32(tmp0, tmp1);
|
||||
const float4_neon_t result = vreinterpretq_f32_u32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_icmpgt(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const uint32x4_t tmp2 = vcgtq_s32(tmp0, tmp1);
|
||||
const float4_neon_t result = vreinterpretq_f32_u32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_imin(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const int32x4_t tmp2 = vminq_s32(tmp0, tmp1);
|
||||
const float4_neon_t result = vreinterpretq_f32_s32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_imax(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const int32x4_t tmp2 = vmaxq_s32(tmp0, tmp1);
|
||||
const float4_neon_t result = vreinterpretq_f32_s32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_iadd(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const int32x4_t tmp2 = vaddq_s32(tmp0, tmp1);
|
||||
const float4_neon_t result = vreinterpretq_f32_s32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_neon_t float4_isub(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
|
||||
const int32x4_t tmp1 = vreinterpretq_s32_f32(_b);
|
||||
const int32x4_t tmp2 = vsubq_s32(tmp0, tmp1);
|
||||
const float4_neon_t result = vreinterpretq_f32_s32(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_neon_t float4_shuf_xAzC(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
return float4_shuf_xAzC_ni(_a, _b);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_neon_t float4_shuf_yBwD(float4_neon_t _a, float4_neon_t _b)
|
||||
{
|
||||
return float4_shuf_yBwD_ni(_a, _b);
|
||||
}
|
||||
|
||||
typedef float4_neon_t float4_t;
|
||||
|
||||
} // namespace bx
|
||||
|
||||
#endif // BX_FLOAT4_NEON_H_HEADER_GUARD
|
||||
|
||||
@@ -8,306 +8,334 @@
|
||||
|
||||
namespace bx
|
||||
{
|
||||
BX_FLOAT4_INLINE float4_t float4_rcp_ni(float4_t _a);
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_shuf_xAzC_ni(float4_t _a, float4_t _b)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_shuf_xAzC_ni(Ty _a, Ty _b)
|
||||
{
|
||||
const float4_t xAyB = float4_shuf_xAyB(_a, _b);
|
||||
const float4_t zCwD = float4_shuf_zCwD(_a, _b);
|
||||
const float4_t result = float4_shuf_xyAB(xAyB, zCwD);
|
||||
const Ty xAyB = float4_shuf_xAyB(_a, _b);
|
||||
const Ty zCwD = float4_shuf_zCwD(_a, _b);
|
||||
const Ty result = float4_shuf_xyAB(xAyB, zCwD);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_shuf_yBwD_ni(float4_t _a, float4_t _b)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_shuf_yBwD_ni(Ty _a, Ty _b)
|
||||
{
|
||||
const float4_t xAyB = float4_shuf_xAyB(_a, _b);
|
||||
const float4_t zCwD = float4_shuf_zCwD(_a, _b);
|
||||
const float4_t result = float4_shuf_zwCD(xAyB, zCwD);
|
||||
const Ty xAyB = float4_shuf_xAyB(_a, _b);
|
||||
const Ty zCwD = float4_shuf_zCwD(_a, _b);
|
||||
const Ty result = float4_shuf_zwCD(xAyB, zCwD);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_madd_ni(float4_t _a, float4_t _b, float4_t _c)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_madd_ni(Ty _a, Ty _b, Ty _c)
|
||||
{
|
||||
const float4_t mul = float4_mul(_a, _b);
|
||||
const float4_t result = float4_add(mul, _c);
|
||||
const Ty mul = float4_mul(_a, _b);
|
||||
const Ty result = float4_add(mul, _c);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_nmsub_ni(float4_t _a, float4_t _b, float4_t _c)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_nmsub_ni(Ty _a, Ty _b, Ty _c)
|
||||
{
|
||||
const float4_t mul = float4_mul(_a, _b);
|
||||
const float4_t result = float4_sub(_c, mul);
|
||||
const Ty mul = float4_mul(_a, _b);
|
||||
const Ty result = float4_sub(_c, mul);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_div_nr_ni(float4_t _a, float4_t _b)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_div_nr_ni(Ty _a, Ty _b)
|
||||
{
|
||||
const float4_t oneish = float4_isplat(0x3f800001);
|
||||
const float4_t est = float4_rcp_est(_b);
|
||||
const float4_t iter0 = float4_mul(_a, est);
|
||||
const float4_t tmp1 = float4_nmsub(_b, est, oneish);
|
||||
const float4_t result = float4_madd(tmp1, iter0, iter0);
|
||||
const Ty oneish = float4_isplat<Ty>(0x3f800001);
|
||||
const Ty est = float4_rcp_est(_b);
|
||||
const Ty iter0 = float4_mul(_a, est);
|
||||
const Ty tmp1 = float4_nmsub(_b, est, oneish);
|
||||
const Ty result = float4_madd(tmp1, iter0, iter0);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_rcp_ni(float4_t _a)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_rcp_ni(Ty _a)
|
||||
{
|
||||
const float4_t one = float4_splat(1.0f);
|
||||
const float4_t result = float4_div(one, _a);
|
||||
const Ty one = float4_splat<Ty>(1.0f);
|
||||
const Ty result = float4_div(one, _a);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_orx_ni(float4_t _a)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_orx_ni(Ty _a)
|
||||
{
|
||||
const float4_t zwxy = float4_swiz_zwxy(_a);
|
||||
const float4_t tmp0 = float4_or(_a, zwxy);
|
||||
const float4_t tmp1 = float4_swiz_yyyy(_a);
|
||||
const float4_t tmp2 = float4_or(tmp0, tmp1);
|
||||
const float4_t mf000 = float4_ild(UINT32_MAX, 0, 0, 0);
|
||||
const float4_t result = float4_and(tmp2, mf000);
|
||||
const Ty zwxy = float4_swiz_zwxy(_a);
|
||||
const Ty tmp0 = float4_or(_a, zwxy);
|
||||
const Ty tmp1 = float4_swiz_yyyy(_a);
|
||||
const Ty tmp2 = float4_or(tmp0, tmp1);
|
||||
const Ty mf000 = float4_ild<Ty>(UINT32_MAX, 0, 0, 0);
|
||||
const Ty result = float4_and(tmp2, mf000);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_orc_ni(float4_t _a, float4_t _b)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_orc_ni(Ty _a, Ty _b)
|
||||
{
|
||||
const float4_t aorb = float4_or(_a, _b);
|
||||
const float4_t mffff = float4_isplat(UINT32_MAX);
|
||||
const float4_t result = float4_xor(aorb, mffff);
|
||||
const Ty aorb = float4_or(_a, _b);
|
||||
const Ty mffff = float4_isplat<Ty>(UINT32_MAX);
|
||||
const Ty result = float4_xor(aorb, mffff);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_neg_ni(float4_t _a)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_neg_ni(Ty _a)
|
||||
{
|
||||
const float4_t zero = float4_zero();
|
||||
const float4_t result = float4_sub(zero, _a);
|
||||
const Ty zero = float4_zero<Ty>();
|
||||
const Ty result = float4_sub(zero, _a);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_selb_ni(float4_t _mask, float4_t _a, float4_t _b)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_selb_ni(Ty _mask, Ty _a, Ty _b)
|
||||
{
|
||||
const float4_t sel_a = float4_and(_a, _mask);
|
||||
const float4_t sel_b = float4_andc(_b, _mask);
|
||||
const float4_t result = float4_or(sel_a, sel_b);
|
||||
const Ty sel_a = float4_and(_a, _mask);
|
||||
const Ty sel_b = float4_andc(_b, _mask);
|
||||
const Ty result = float4_or(sel_a, sel_b);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_sels_ni(float4_t _test, float4_t _a, float4_t _b)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_sels_ni(Ty _test, Ty _a, Ty _b)
|
||||
{
|
||||
const float4_t mask = float4_sra(_test, 31);
|
||||
const float4_t result = float4_selb(mask, _a, _b);
|
||||
const Ty mask = float4_sra(_test, 31);
|
||||
const Ty result = float4_selb(mask, _a, _b);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_not_ni(float4_t _a)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_not_ni(Ty _a)
|
||||
{
|
||||
const float4_t mffff = float4_isplat(UINT32_MAX);
|
||||
const float4_t result = float4_xor(_a, mffff);
|
||||
const Ty mffff = float4_isplat<Ty>(UINT32_MAX);
|
||||
const Ty result = float4_xor(_a, mffff);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_min_ni(float4_t _a, float4_t _b)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_min_ni(Ty _a, Ty _b)
|
||||
{
|
||||
const float4_t mask = float4_cmplt(_a, _b);
|
||||
const float4_t result = float4_selb(mask, _a, _b);
|
||||
const Ty mask = float4_cmplt(_a, _b);
|
||||
const Ty result = float4_selb(mask, _a, _b);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_max_ni(float4_t _a, float4_t _b)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_max_ni(Ty _a, Ty _b)
|
||||
{
|
||||
const float4_t mask = float4_cmpgt(_a, _b);
|
||||
const float4_t result = float4_selb(mask, _a, _b);
|
||||
const Ty mask = float4_cmpgt(_a, _b);
|
||||
const Ty result = float4_selb(mask, _a, _b);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_abs_ni(float4_t _a)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_abs_ni(Ty _a)
|
||||
{
|
||||
const float4_t a_neg = float4_neg(_a);
|
||||
const float4_t result = float4_max(a_neg, _a);
|
||||
const Ty a_neg = float4_neg(_a);
|
||||
const Ty result = float4_max(a_neg, _a);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_imin_ni(float4_t _a, float4_t _b)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_imin_ni(Ty _a, Ty _b)
|
||||
{
|
||||
const float4_t mask = float4_icmplt(_a, _b);
|
||||
const float4_t result = float4_selb(mask, _a, _b);
|
||||
const Ty mask = float4_icmplt(_a, _b);
|
||||
const Ty result = float4_selb(mask, _a, _b);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_imax_ni(float4_t _a, float4_t _b)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_imax_ni(Ty _a, Ty _b)
|
||||
{
|
||||
const float4_t mask = float4_icmpgt(_a, _b);
|
||||
const float4_t result = float4_selb(mask, _a, _b);
|
||||
const Ty mask = float4_icmpgt(_a, _b);
|
||||
const Ty result = float4_selb(mask, _a, _b);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_clamp_ni(float4_t _a, float4_t _min, float4_t _max)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_clamp_ni(Ty _a, Ty _min, Ty _max)
|
||||
{
|
||||
const float4_t tmp = float4_min(_a, _max);
|
||||
const float4_t result = float4_max(tmp, _min);
|
||||
const Ty tmp = float4_min(_a, _max);
|
||||
const Ty result = float4_max(tmp, _min);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_lerp_ni(float4_t _a, float4_t _b, float4_t _s)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_lerp_ni(Ty _a, Ty _b, Ty _s)
|
||||
{
|
||||
const float4_t ba = float4_sub(_b, _a);
|
||||
const float4_t result = float4_madd(_s, ba, _a);
|
||||
const Ty ba = float4_sub(_b, _a);
|
||||
const Ty result = float4_madd(_s, ba, _a);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_sqrt_nr_ni(float4_t _a)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_sqrt_nr_ni(Ty _a)
|
||||
{
|
||||
const float4_t half = float4_splat(0.5f);
|
||||
const float4_t one = float4_splat(1.0f);
|
||||
const float4_t tmp0 = float4_rsqrt_est(_a);
|
||||
const float4_t tmp1 = float4_mul(tmp0, _a);
|
||||
const float4_t tmp2 = float4_mul(tmp1, half);
|
||||
const float4_t tmp3 = float4_nmsub(tmp0, tmp1, one);
|
||||
const float4_t result = float4_madd(tmp3, tmp2, tmp1);
|
||||
const Ty half = float4_splat<Ty>(0.5f);
|
||||
const Ty one = float4_splat<Ty>(1.0f);
|
||||
const Ty tmp0 = float4_rsqrt_est(_a);
|
||||
const Ty tmp1 = float4_mul(tmp0, _a);
|
||||
const Ty tmp2 = float4_mul(tmp1, half);
|
||||
const Ty tmp3 = float4_nmsub(tmp0, tmp1, one);
|
||||
const Ty result = float4_madd(tmp3, tmp2, tmp1);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_sqrt_nr1_ni(float4_t _a)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_sqrt_nr1_ni(Ty _a)
|
||||
{
|
||||
const float4_t half = float4_splat(0.5f);
|
||||
const Ty half = float4_splat<Ty>(0.5f);
|
||||
|
||||
float4_t result = _a;
|
||||
Ty result = _a;
|
||||
for (uint32_t ii = 0; ii < 11; ++ii)
|
||||
{
|
||||
const float4_t tmp1 = float4_div(_a, result);
|
||||
const float4_t tmp2 = float4_add(tmp1, result);
|
||||
const Ty tmp1 = float4_div(_a, result);
|
||||
const Ty tmp2 = float4_add(tmp1, result);
|
||||
result = float4_mul(tmp2, half);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_rsqrt_ni(float4_t _a)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_rsqrt_ni(Ty _a)
|
||||
{
|
||||
const float4_t one = float4_splat(1.0f);
|
||||
const float4_t sqrt = float4_sqrt(_a);
|
||||
const float4_t result = float4_div(one, sqrt);
|
||||
const Ty one = float4_splat<Ty>(1.0f);
|
||||
const Ty sqrt = float4_sqrt(_a);
|
||||
const Ty result = float4_div(one, sqrt);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_rsqrt_nr_ni(float4_t _a)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_rsqrt_nr_ni(Ty _a)
|
||||
{
|
||||
const float4_t rsqrt = float4_rsqrt_est(_a);
|
||||
const float4_t iter0 = float4_mul(_a, rsqrt);
|
||||
const float4_t iter1 = float4_mul(iter0, rsqrt);
|
||||
const float4_t half = float4_splat(0.5f);
|
||||
const float4_t half_rsqrt = float4_mul(half, rsqrt);
|
||||
const float4_t three = float4_splat(3.0f);
|
||||
const float4_t three_sub_iter1 = float4_sub(three, iter1);
|
||||
const float4_t result = float4_mul(half_rsqrt, three_sub_iter1);
|
||||
const Ty rsqrt = float4_rsqrt_est(_a);
|
||||
const Ty iter0 = float4_mul(_a, rsqrt);
|
||||
const Ty iter1 = float4_mul(iter0, rsqrt);
|
||||
const Ty half = float4_splat<Ty>(0.5f);
|
||||
const Ty half_rsqrt = float4_mul(half, rsqrt);
|
||||
const Ty three = float4_splat<Ty>(3.0f);
|
||||
const Ty three_sub_iter1 = float4_sub(three, iter1);
|
||||
const Ty result = float4_mul(half_rsqrt, three_sub_iter1);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_rsqrt_carmack_ni(float4_t _a)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_rsqrt_carmack_ni(Ty _a)
|
||||
{
|
||||
const float4_t half = float4_splat(0.5f);
|
||||
const float4_t ah = float4_mul(half, _a);
|
||||
const float4_t ashift = float4_sra(_a, 1);
|
||||
const float4_t magic = float4_isplat(0x5f3759df);
|
||||
const float4_t msuba = float4_isub(magic, ashift);
|
||||
const float4_t msubasq = float4_mul(msuba, msuba);
|
||||
const float4_t tmp0 = float4_splat(1.5f);
|
||||
const float4_t tmp1 = float4_mul(ah, msubasq);
|
||||
const float4_t tmp2 = float4_sub(tmp0, tmp1);
|
||||
const float4_t result = float4_mul(msuba, tmp2);
|
||||
const Ty half = float4_splat<Ty>(0.5f);
|
||||
const Ty ah = float4_mul(half, _a);
|
||||
const Ty ashift = float4_sra(_a, 1);
|
||||
const Ty magic = float4_isplat<Ty>(0x5f3759df);
|
||||
const Ty msuba = float4_isub(magic, ashift);
|
||||
const Ty msubasq = float4_mul(msuba, msuba);
|
||||
const Ty tmp0 = float4_splat<Ty>(1.5f);
|
||||
const Ty tmp1 = float4_mul(ah, msubasq);
|
||||
const Ty tmp2 = float4_sub(tmp0, tmp1);
|
||||
const Ty result = float4_mul(msuba, tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
namespace float4_logexp_detail
|
||||
{
|
||||
BX_FLOAT4_INLINE float4_t float4_poly1(float4_t _a, float _b, float _c)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_poly1(Ty _a, float _b, float _c)
|
||||
{
|
||||
const float4_t bbbb = float4_splat(_b);
|
||||
const float4_t cccc = float4_splat(_c);
|
||||
const float4_t result = float4_madd(cccc, _a, bbbb);
|
||||
const Ty bbbb = float4_splat<Ty>(_b);
|
||||
const Ty cccc = float4_splat<Ty>(_c);
|
||||
const Ty result = float4_madd(cccc, _a, bbbb);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_poly2(float4_t _a, float _b, float _c, float _d)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_poly2(Ty _a, float _b, float _c, float _d)
|
||||
{
|
||||
const float4_t bbbb = float4_splat(_b);
|
||||
const float4_t poly = float4_poly1(_a, _c, _d);
|
||||
const float4_t result = float4_madd(poly, _a, bbbb);
|
||||
const Ty bbbb = float4_splat<Ty>(_b);
|
||||
const Ty poly = float4_poly1(_a, _c, _d);
|
||||
const Ty result = float4_madd(poly, _a, bbbb);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_poly3(float4_t _a, float _b, float _c, float _d, float _e)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_poly3(Ty _a, float _b, float _c, float _d, float _e)
|
||||
{
|
||||
const float4_t bbbb = float4_splat(_b);
|
||||
const float4_t poly = float4_poly2(_a, _c, _d, _e);
|
||||
const float4_t result = float4_madd(poly, _a, bbbb);
|
||||
const Ty bbbb = float4_splat<Ty>(_b);
|
||||
const Ty poly = float4_poly2(_a, _c, _d, _e);
|
||||
const Ty result = float4_madd(poly, _a, bbbb);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_poly4(float4_t _a, float _b, float _c, float _d, float _e, float _f)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_poly4(Ty _a, float _b, float _c, float _d, float _e, float _f)
|
||||
{
|
||||
const float4_t bbbb = float4_splat(_b);
|
||||
const float4_t poly = float4_poly3(_a, _c, _d, _e, _f);
|
||||
const float4_t result = float4_madd(poly, _a, bbbb);
|
||||
const Ty bbbb = float4_splat<Ty>(_b);
|
||||
const Ty poly = float4_poly3(_a, _c, _d, _e, _f);
|
||||
const Ty result = float4_madd(poly, _a, bbbb);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_poly5(float4_t _a, float _b, float _c, float _d, float _e, float _f, float _g)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_poly5(Ty _a, float _b, float _c, float _d, float _e, float _f, float _g)
|
||||
{
|
||||
const float4_t bbbb = float4_splat(_b);
|
||||
const float4_t poly = float4_poly4(_a, _c, _d, _e, _f, _g);
|
||||
const float4_t result = float4_madd(poly, _a, bbbb);
|
||||
const Ty bbbb = float4_splat<Ty>(_b);
|
||||
const Ty poly = float4_poly4(_a, _c, _d, _e, _f, _g);
|
||||
const Ty result = float4_madd(poly, _a, bbbb);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_logpoly(float4_t _a)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_logpoly(Ty _a)
|
||||
{
|
||||
#if 1
|
||||
const float4_t result = float4_poly5(_a
|
||||
const Ty result = float4_poly5(_a
|
||||
, 3.11578814719469302614f, -3.32419399085241980044f
|
||||
, 2.59883907202499966007f, -1.23152682416275988241f
|
||||
, 0.318212422185251071475f, -0.0344359067839062357313f
|
||||
);
|
||||
#elif 0
|
||||
const float4_t result = float4_poly4(_a
|
||||
const Ty result = float4_poly4(_a
|
||||
, 2.8882704548164776201f, -2.52074962577807006663f
|
||||
, 1.48116647521213171641f, -0.465725644288844778798f
|
||||
, 0.0596515482674574969533f
|
||||
);
|
||||
#elif 0
|
||||
const float4_t result = float4_poly3(_a
|
||||
const Ty result = float4_poly3(_a
|
||||
, 2.61761038894603480148f, -1.75647175389045657003f
|
||||
, 0.688243882994381274313f, -0.107254423828329604454f
|
||||
);
|
||||
#else
|
||||
const float4_t result = float4_poly2(_a
|
||||
const Ty result = float4_poly2(_a
|
||||
, 2.28330284476918490682f, -1.04913055217340124191f
|
||||
, 0.204446009836232697516f
|
||||
);
|
||||
@@ -316,27 +344,28 @@ namespace bx
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_exppoly(float4_t _a)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_exppoly(Ty _a)
|
||||
{
|
||||
#if 1
|
||||
const float4_t result = float4_poly5(_a
|
||||
const Ty result = float4_poly5(_a
|
||||
, 9.9999994e-1f, 6.9315308e-1f
|
||||
, 2.4015361e-1f, 5.5826318e-2f
|
||||
, 8.9893397e-3f, 1.8775767e-3f
|
||||
);
|
||||
#elif 0
|
||||
const float4_t result = float4_poly4(_a
|
||||
const Ty result = float4_poly4(_a
|
||||
, 1.0000026f, 6.9300383e-1f
|
||||
, 2.4144275e-1f, 5.2011464e-2f
|
||||
, 1.3534167e-2f
|
||||
);
|
||||
#elif 0
|
||||
const float4_t result = float4_poly3(_a
|
||||
const Ty result = float4_poly3(_a
|
||||
, 9.9992520e-1f, 6.9583356e-1f
|
||||
, 2.2606716e-1f, 7.8024521e-2f
|
||||
);
|
||||
#else
|
||||
const float4_t result = float4_poly2(_a
|
||||
const Ty result = float4_poly2(_a
|
||||
, 1.0017247f, 6.5763628e-1f
|
||||
, 3.3718944e-1f
|
||||
);
|
||||
@@ -346,159 +375,179 @@ namespace bx
|
||||
}
|
||||
} // namespace float4_internal
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_log2_ni(float4_t _a)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_log2_ni(Ty _a)
|
||||
{
|
||||
const float4_t expmask = float4_isplat(0x7f800000);
|
||||
const float4_t mantmask = float4_isplat(0x007fffff);
|
||||
const float4_t one = float4_splat(1.0f);
|
||||
const Ty expmask = float4_isplat<Ty>(0x7f800000);
|
||||
const Ty mantmask = float4_isplat<Ty>(0x007fffff);
|
||||
const Ty one = float4_splat<Ty>(1.0f);
|
||||
|
||||
const float4_t c127 = float4_isplat(127);
|
||||
const float4_t aexp = float4_and(_a, expmask);
|
||||
const float4_t aexpsr = float4_srl(aexp, 23);
|
||||
const float4_t tmp0 = float4_isub(aexpsr, c127);
|
||||
const float4_t exp = float4_itof(tmp0);
|
||||
const Ty c127 = float4_isplat<Ty>(127);
|
||||
const Ty aexp = float4_and(_a, expmask);
|
||||
const Ty aexpsr = float4_srl(aexp, 23);
|
||||
const Ty tmp0 = float4_isub(aexpsr, c127);
|
||||
const Ty exp = float4_itof(tmp0);
|
||||
|
||||
const float4_t amask = float4_and(_a, mantmask);
|
||||
const float4_t mant = float4_or(amask, one);
|
||||
const Ty amask = float4_and(_a, mantmask);
|
||||
const Ty mant = float4_or(amask, one);
|
||||
|
||||
const float4_t poly = float4_logexp_detail::float4_logpoly(mant);
|
||||
const Ty poly = float4_logexp_detail::float4_logpoly(mant);
|
||||
|
||||
const float4_t mandiff = float4_sub(mant, one);
|
||||
const float4_t result = float4_madd(poly, mandiff, exp);
|
||||
const Ty mandiff = float4_sub(mant, one);
|
||||
const Ty result = float4_madd(poly, mandiff, exp);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_exp2_ni(float4_t _a)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_exp2_ni(Ty _a)
|
||||
{
|
||||
const float4_t min = float4_splat( 129.0f);
|
||||
const float4_t max = float4_splat(-126.99999f);
|
||||
const float4_t tmp0 = float4_min(_a, min);
|
||||
const float4_t aaaa = float4_max(tmp0, max);
|
||||
const Ty min = float4_splat<Ty>( 129.0f);
|
||||
const Ty max = float4_splat<Ty>(-126.99999f);
|
||||
const Ty tmp0 = float4_min(_a, min);
|
||||
const Ty aaaa = float4_max(tmp0, max);
|
||||
|
||||
const float4_t half = float4_splat(0.5f);
|
||||
const float4_t tmp2 = float4_sub(aaaa, half);
|
||||
const float4_t ipart = float4_ftoi(tmp2);
|
||||
const float4_t iround = float4_itof(ipart);
|
||||
const float4_t fpart = float4_sub(aaaa, iround);
|
||||
const Ty half = float4_splat<Ty>(0.5f);
|
||||
const Ty tmp2 = float4_sub(aaaa, half);
|
||||
const Ty ipart = float4_ftoi(tmp2);
|
||||
const Ty iround = float4_itof(ipart);
|
||||
const Ty fpart = float4_sub(aaaa, iround);
|
||||
|
||||
const float4_t c127 = float4_isplat(127);
|
||||
const float4_t tmp5 = float4_iadd(ipart, c127);
|
||||
const float4_t expipart = float4_sll(tmp5, 23);
|
||||
const Ty c127 = float4_isplat<Ty>(127);
|
||||
const Ty tmp5 = float4_iadd(ipart, c127);
|
||||
const Ty expipart = float4_sll(tmp5, 23);
|
||||
|
||||
const float4_t expfpart = float4_logexp_detail::float4_exppoly(fpart);
|
||||
const Ty expfpart = float4_logexp_detail::float4_exppoly(fpart);
|
||||
|
||||
const float4_t result = float4_mul(expipart, expfpart);
|
||||
const Ty result = float4_mul(expipart, expfpart);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_pow_ni(float4_t _a, float4_t _b)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_pow_ni(Ty _a, Ty _b)
|
||||
{
|
||||
const float4_t alog2 = float4_log2(_a);
|
||||
const float4_t alog2b = float4_mul(alog2, _b);
|
||||
const float4_t result = float4_exp2(alog2b);
|
||||
const Ty alog2 = float4_log2(_a);
|
||||
const Ty alog2b = float4_mul(alog2, _b);
|
||||
const Ty result = float4_exp2(alog2b);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_dot3_ni(float4_t _a, float4_t _b)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_dot3_ni(Ty _a, Ty _b)
|
||||
{
|
||||
const float4_t xyzw = float4_mul(_a, _b);
|
||||
const float4_t xxxx = float4_swiz_xxxx(xyzw);
|
||||
const float4_t yyyy = float4_swiz_yyyy(xyzw);
|
||||
const float4_t zzzz = float4_swiz_zzzz(xyzw);
|
||||
const float4_t tmp1 = float4_add(xxxx, yyyy);
|
||||
const float4_t result = float4_add(zzzz, tmp1);
|
||||
const Ty xyzw = float4_mul(_a, _b);
|
||||
const Ty xxxx = float4_swiz_xxxx(xyzw);
|
||||
const Ty yyyy = float4_swiz_yyyy(xyzw);
|
||||
const Ty zzzz = float4_swiz_zzzz(xyzw);
|
||||
const Ty tmp1 = float4_add(xxxx, yyyy);
|
||||
const Ty result = float4_add(zzzz, tmp1);
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_cross3_ni(float4_t _a, float4_t _b)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_cross3_ni(Ty _a, Ty _b)
|
||||
{
|
||||
// a.yzx * b.zxy - a.zxy * b.yzx == (a * b.yzx - a.yzx * b).yzx
|
||||
#if 0
|
||||
const float4_t a_yzxw = float4_swiz_yzxw(_a);
|
||||
const float4_t a_zxyw = float4_swiz_zxyw(_a);
|
||||
const float4_t b_zxyw = float4_swiz_zxyw(_b);
|
||||
const float4_t b_yzxw = float4_swiz_yzxw(_b);
|
||||
const float4_t tmp = float4_mul(a_yzxw, b_zxyw);
|
||||
const float4_t result = float4_nmsub(a_zxyw, b_yzxw, tmp);
|
||||
const Ty a_yzxw = float4_swiz_yzxw(_a);
|
||||
const Ty a_zxyw = float4_swiz_zxyw(_a);
|
||||
const Ty b_zxyw = float4_swiz_zxyw(_b);
|
||||
const Ty b_yzxw = float4_swiz_yzxw(_b);
|
||||
const Ty tmp = float4_mul(a_yzxw, b_zxyw);
|
||||
const Ty result = float4_nmsub(a_zxyw, b_yzxw, tmp);
|
||||
#else
|
||||
const float4_t a_yzxw = float4_swiz_yzxw(_a);
|
||||
const float4_t b_yzxw = float4_swiz_yzxw(_b);
|
||||
const float4_t tmp0 = float4_mul(_a, b_yzxw);
|
||||
const float4_t tmp1 = float4_nmsub(a_yzxw, _b, tmp0);
|
||||
const float4_t result = float4_swiz_yzxw(tmp1);
|
||||
const Ty a_yzxw = float4_swiz_yzxw(_a);
|
||||
const Ty b_yzxw = float4_swiz_yzxw(_b);
|
||||
const Ty tmp0 = float4_mul(_a, b_yzxw);
|
||||
const Ty tmp1 = float4_nmsub(a_yzxw, _b, tmp0);
|
||||
const Ty result = float4_swiz_yzxw(tmp1);
|
||||
#endif
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_normalize3_ni(float4_t _a)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_normalize3_ni(Ty _a)
|
||||
{
|
||||
const float4_t dot3 = float4_dot3(_a, _a);
|
||||
const float4_t invSqrt = float4_rsqrt(dot3);
|
||||
const float4_t result = float4_mul(_a, invSqrt);
|
||||
const Ty dot3 = float4_dot3(_a, _a);
|
||||
const Ty invSqrt = float4_rsqrt(dot3);
|
||||
const Ty result = float4_mul(_a, invSqrt);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_dot_ni(float4_t _a, float4_t _b)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_dot_ni(Ty _a, Ty _b)
|
||||
{
|
||||
const float4_t xyzw = float4_mul(_a, _b);
|
||||
const float4_t yzwx = float4_swiz_yzwx(xyzw);
|
||||
const float4_t tmp0 = float4_add(xyzw, yzwx);
|
||||
const float4_t zwxy = float4_swiz_zwxy(tmp0);
|
||||
const float4_t result = float4_add(tmp0, zwxy);
|
||||
const Ty xyzw = float4_mul(_a, _b);
|
||||
const Ty yzwx = float4_swiz_yzwx(xyzw);
|
||||
const Ty tmp0 = float4_add(xyzw, yzwx);
|
||||
const Ty zwxy = float4_swiz_zwxy(tmp0);
|
||||
const Ty result = float4_add(tmp0, zwxy);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_ceil_ni(float4_t _a)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_ceil_ni(Ty _a)
|
||||
{
|
||||
const float4_t tmp0 = float4_ftoi(_a);
|
||||
const float4_t tmp1 = float4_itof(tmp0);
|
||||
const float4_t mask = float4_cmplt(tmp1, _a);
|
||||
const float4_t one = float4_splat(1.0f);
|
||||
const float4_t tmp2 = float4_and(one, mask);
|
||||
const float4_t result = float4_add(tmp1, tmp2);
|
||||
const Ty tmp0 = float4_ftoi(_a);
|
||||
const Ty tmp1 = float4_itof(tmp0);
|
||||
const Ty mask = float4_cmplt(tmp1, _a);
|
||||
const Ty one = float4_splat<Ty>(1.0f);
|
||||
const Ty tmp2 = float4_and(one, mask);
|
||||
const Ty result = float4_add(tmp1, tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE float4_t float4_floor_ni(float4_t _a)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_floor_ni(Ty _a)
|
||||
{
|
||||
const float4_t tmp0 = float4_ftoi(_a);
|
||||
const float4_t tmp1 = float4_itof(tmp0);
|
||||
const float4_t mask = float4_cmpgt(tmp1, _a);
|
||||
const float4_t one = float4_splat(1.0f);
|
||||
const float4_t tmp2 = float4_and(one, mask);
|
||||
const float4_t result = float4_sub(tmp1, tmp2);
|
||||
const Ty tmp0 = float4_ftoi(_a);
|
||||
const Ty tmp1 = float4_itof(tmp0);
|
||||
const Ty mask = float4_cmpgt(tmp1, _a);
|
||||
const Ty one = float4_splat<Ty>(1.0f);
|
||||
const Ty tmp2 = float4_and(one, mask);
|
||||
const Ty result = float4_sub(tmp1, tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE bool float4_test_any_ni(float4_t _a)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_round_ni(Ty _a)
|
||||
{
|
||||
const float4_t mask = float4_sra(_a, 31);
|
||||
const float4_t zwxy = float4_swiz_zwxy(mask);
|
||||
const float4_t tmp0 = float4_or(mask, zwxy);
|
||||
const float4_t tmp1 = float4_swiz_yyyy(tmp0);
|
||||
const float4_t tmp2 = float4_or(tmp0, tmp1);
|
||||
const Ty tmp = float4_ftoi(_a);
|
||||
const Ty result = float4_itof(tmp);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE bool float4_test_any_ni(Ty _a)
|
||||
{
|
||||
const Ty mask = float4_sra(_a, 31);
|
||||
const Ty zwxy = float4_swiz_zwxy(mask);
|
||||
const Ty tmp0 = float4_or(mask, zwxy);
|
||||
const Ty tmp1 = float4_swiz_yyyy(tmp0);
|
||||
const Ty tmp2 = float4_or(tmp0, tmp1);
|
||||
int res;
|
||||
float4_stx(&res, tmp2);
|
||||
return 0 != res;
|
||||
}
|
||||
|
||||
BX_FLOAT4_INLINE bool float4_test_all_ni(float4_t _a)
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE bool float4_test_all_ni(Ty _a)
|
||||
{
|
||||
const float4_t bits = float4_sra(_a, 31);
|
||||
const float4_t m1248 = float4_ild(1, 2, 4, 8);
|
||||
const float4_t mask = float4_and(bits, m1248);
|
||||
const float4_t zwxy = float4_swiz_zwxy(mask);
|
||||
const float4_t tmp0 = float4_or(mask, zwxy);
|
||||
const float4_t tmp1 = float4_swiz_yyyy(tmp0);
|
||||
const float4_t tmp2 = float4_or(tmp0, tmp1);
|
||||
const Ty bits = float4_sra(_a, 31);
|
||||
const Ty m1248 = float4_ild<Ty>(1, 2, 4, 8);
|
||||
const Ty mask = float4_and(bits, m1248);
|
||||
const Ty zwxy = float4_swiz_zwxy(mask);
|
||||
const Ty tmp0 = float4_or(mask, zwxy);
|
||||
const Ty tmp1 = float4_swiz_yyyy(tmp0);
|
||||
const Ty tmp2 = float4_or(tmp0, tmp1);
|
||||
int res;
|
||||
float4_stx(&res, tmp2);
|
||||
return 0xf == res;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -6,22 +6,17 @@
|
||||
#ifndef BX_FLOAT4_SSE_H_HEADER_GUARD
|
||||
#define BX_FLOAT4_SSE_H_HEADER_GUARD
|
||||
|
||||
#include <emmintrin.h> // __m128i
|
||||
#if defined(__SSE4_1__)
|
||||
# include <smmintrin.h>
|
||||
#endif // defined(__SSE4_1__)
|
||||
#include <xmmintrin.h> // __m128
|
||||
#include "float4_ni.h"
|
||||
|
||||
namespace bx
|
||||
{
|
||||
typedef __m128 float4_t;
|
||||
|
||||
#define ELEMx 0
|
||||
#define ELEMy 1
|
||||
#define ELEMz 2
|
||||
#define ELEMw 3
|
||||
#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \
|
||||
template<> \
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_swiz_##_x##_y##_z##_w(float4_sse_t _a) \
|
||||
{ \
|
||||
return _mm_shuffle_ps( _a, _a, _MM_SHUFFLE(ELEM##_w, ELEM##_z, ELEM##_y, ELEM##_x ) ); \
|
||||
}
|
||||
@@ -35,12 +30,14 @@ namespace bx
|
||||
#undef ELEMx
|
||||
|
||||
#define IMPLEMENT_TEST(_xyzw, _mask) \
|
||||
BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_t _test) \
|
||||
template<> \
|
||||
BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_sse_t _test) \
|
||||
{ \
|
||||
return 0x0 != (_mm_movemask_ps(_test)&(_mask) ); \
|
||||
} \
|
||||
\
|
||||
BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_t _test) \
|
||||
template<> \
|
||||
BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_sse_t _test) \
|
||||
{ \
|
||||
return (_mask) == (_mm_movemask_ps(_test)&(_mask) ); \
|
||||
}
|
||||
@@ -63,399 +60,588 @@ IMPLEMENT_TEST(xyzw , 0xf);
|
||||
|
||||
#undef IMPLEMENT_TEST
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_xyAB(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_movelh_ps(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_ABxy(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_movelh_ps(_b, _a);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_CDzw(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_movehl_ps(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_zwCD(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_movehl_ps(_b, _a);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_xAyB(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_unpacklo_ps(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_yBxA(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_unpacklo_ps(_b, _a);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_zCwD(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_unpackhi_ps(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_CzDw(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_unpackhi_ps(_b, _a);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float float4_x(float4_t _a)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float float4_x(float4_sse_t _a)
|
||||
{
|
||||
return _mm_cvtss_f32(_a);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float float4_y(float4_t _a)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float float4_y(float4_sse_t _a)
|
||||
{
|
||||
const float4_t yyyy = float4_swiz_yyyy(_a);
|
||||
const float4_sse_t yyyy = float4_swiz_yyyy(_a);
|
||||
const float result = _mm_cvtss_f32(yyyy);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float float4_z(float4_t _a)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float float4_z(float4_sse_t _a)
|
||||
{
|
||||
const float4_t zzzz = float4_swiz_zzzz(_a);
|
||||
const float4_sse_t zzzz = float4_swiz_zzzz(_a);
|
||||
const float result = _mm_cvtss_f32(zzzz);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float float4_w(float4_t _a)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float float4_w(float4_sse_t _a)
|
||||
{
|
||||
const float4_t wwww = float4_swiz_wwww(_a);
|
||||
const float4_sse_t wwww = float4_swiz_wwww(_a);
|
||||
const float result = _mm_cvtss_f32(wwww);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_ld(const void* _ptr)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_ld(const void* _ptr)
|
||||
{
|
||||
return _mm_load_ps(reinterpret_cast<const float*>(_ptr) );
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_t _a)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_sse_t _a)
|
||||
{
|
||||
_mm_store_ps(reinterpret_cast<float*>(_ptr), _a);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_t _a)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_sse_t _a)
|
||||
{
|
||||
_mm_store_ss(reinterpret_cast<float*>(_ptr), _a);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_t _a)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_sse_t _a)
|
||||
{
|
||||
_mm_stream_ps(reinterpret_cast<float*>(_ptr), _a);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_ld(float _x, float _y, float _z, float _w)
|
||||
{
|
||||
return _mm_set_ps(_w, _z, _y, _x);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
|
||||
{
|
||||
const __m128i set = _mm_set_epi32(_w, _z, _y, _x);
|
||||
const float4_t result = _mm_castsi128_ps(set);
|
||||
|
||||
const float4_sse_t result = _mm_castsi128_ps(set);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_splat(const void* _ptr)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_splat(const void* _ptr)
|
||||
{
|
||||
const float4_t x___ = _mm_load_ss(reinterpret_cast<const float*>(_ptr) );
|
||||
const float4_t result = float4_swiz_xxxx(x___);
|
||||
const float4_sse_t x___ = _mm_load_ss(reinterpret_cast<const float*>(_ptr) );
|
||||
const float4_sse_t result = float4_swiz_xxxx(x___);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_splat(float _a)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_splat(float _a)
|
||||
{
|
||||
return _mm_set1_ps(_a);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_isplat(uint32_t _a)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_isplat(uint32_t _a)
|
||||
{
|
||||
const __m128i splat = _mm_set1_epi32(_a);
|
||||
const float4_t result = _mm_castsi128_ps(splat);
|
||||
const float4_sse_t result = _mm_castsi128_ps(splat);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_zero()
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_zero()
|
||||
{
|
||||
return _mm_setzero_ps();
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_itof(float4_t _a)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_itof(float4_sse_t _a)
|
||||
{
|
||||
const __m128i itof = _mm_castps_si128(_a);
|
||||
const float4_t result = _mm_cvtepi32_ps(itof);
|
||||
const float4_sse_t result = _mm_cvtepi32_ps(itof);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_ftoi(float4_t _a)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_ftoi(float4_sse_t _a)
|
||||
{
|
||||
const __m128i ftoi = _mm_cvtps_epi32(_a);
|
||||
const float4_t result = _mm_castsi128_ps(ftoi);
|
||||
const float4_sse_t result = _mm_castsi128_ps(ftoi);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_round(float4_t _a)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_round(float4_sse_t _a)
|
||||
{
|
||||
#if defined(__SSE4_1__)
|
||||
return _mm_round_ps(_a, _MM_FROUND_NINT);
|
||||
#else
|
||||
const __m128i round = _mm_cvtps_epi32(_a);
|
||||
const float4_t result = _mm_cvtepi32_ps(round);
|
||||
const float4_sse_t result = _mm_cvtepi32_ps(round);
|
||||
|
||||
return result;
|
||||
#endif // defined(__SSE4_1__)
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_add(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_add(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_add_ps(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_sub(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_sub(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_sub_ps(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_mul(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_mul(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_mul_ps(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_div(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_div(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_div_ps(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_rcp_est(float4_t _a)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_rcp_est(float4_sse_t _a)
|
||||
{
|
||||
return _mm_rcp_ps(_a);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_sqrt(float4_t _a)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_sqrt(float4_sse_t _a)
|
||||
{
|
||||
return _mm_sqrt_ps(_a);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_rsqrt_est(float4_t _a)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_rsqrt_est(float4_sse_t _a)
|
||||
{
|
||||
return _mm_rsqrt_ps(_a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_dot3(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
#if defined(__SSE4_1__)
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_dot3(float4_t _a, float4_t _b)
|
||||
{
|
||||
return _mm_dp_ps(_a, _b, 0x77);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_dot(float4_t _a, float4_t _b)
|
||||
{
|
||||
return _mm_dp_ps(_a, _b, 0xFF);
|
||||
}
|
||||
#else
|
||||
return float4_dot3_ni(_a, _b);
|
||||
#endif // defined(__SSE4__)
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_cmpeq(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_dot(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
#if defined(__SSE4_1__)
|
||||
return _mm_dp_ps(_a, _b, 0xFF);
|
||||
#else
|
||||
return float4_dot_ni(_a, _b);
|
||||
#endif // defined(__SSE4__)
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_cmpeq(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_cmpeq_ps(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_cmplt(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_cmplt(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_cmplt_ps(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_cmple(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_cmple(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_cmple_ps(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_cmpgt(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_cmpgt(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_cmpgt_ps(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_cmpge(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_cmpge(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_cmpge_ps(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_min(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_min(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_min_ps(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_max(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_max(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_max_ps(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_and(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_and(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_and_ps(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_andc(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_andc(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_andnot_ps(_b, _a);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_or(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_or(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_or_ps(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_xor(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_xor(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return _mm_xor_ps(_a, _b);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_sll(float4_t _a, int _count)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_sll(float4_sse_t _a, int _count)
|
||||
{
|
||||
const __m128i a = _mm_castps_si128(_a);
|
||||
const __m128i shift = _mm_slli_epi32(a, _count);
|
||||
const float4_t result = _mm_castsi128_ps(shift);
|
||||
const float4_sse_t result = _mm_castsi128_ps(shift);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_srl(float4_t _a, int _count)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_srl(float4_sse_t _a, int _count)
|
||||
{
|
||||
const __m128i a = _mm_castps_si128(_a);
|
||||
const __m128i shift = _mm_srli_epi32(a, _count);
|
||||
const float4_t result = _mm_castsi128_ps(shift);
|
||||
const float4_sse_t result = _mm_castsi128_ps(shift);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_sra(float4_t _a, int _count)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_sra(float4_sse_t _a, int _count)
|
||||
{
|
||||
const __m128i a = _mm_castps_si128(_a);
|
||||
const __m128i shift = _mm_srai_epi32(a, _count);
|
||||
const float4_t result = _mm_castsi128_ps(shift);
|
||||
const float4_sse_t result = _mm_castsi128_ps(shift);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_icmpeq(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_icmpeq(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
const __m128i tmp0 = _mm_castps_si128(_a);
|
||||
const __m128i tmp1 = _mm_castps_si128(_b);
|
||||
const __m128i tmp2 = _mm_cmpeq_epi32(tmp0, tmp1);
|
||||
const float4_t result = _mm_castsi128_ps(tmp2);
|
||||
const float4_sse_t result = _mm_castsi128_ps(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_icmplt(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_icmplt(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
const __m128i tmp0 = _mm_castps_si128(_a);
|
||||
const __m128i tmp1 = _mm_castps_si128(_b);
|
||||
const __m128i tmp2 = _mm_cmplt_epi32(tmp0, tmp1);
|
||||
const float4_t result = _mm_castsi128_ps(tmp2);
|
||||
const float4_sse_t result = _mm_castsi128_ps(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_icmpgt(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_icmpgt(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
const __m128i tmp0 = _mm_castps_si128(_a);
|
||||
const __m128i tmp1 = _mm_castps_si128(_b);
|
||||
const __m128i tmp2 = _mm_cmpgt_epi32(tmp0, tmp1);
|
||||
const float4_t result = _mm_castsi128_ps(tmp2);
|
||||
const float4_sse_t result = _mm_castsi128_ps(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_imin(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_imin(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
#if defined(__SSE4_1__)
|
||||
const __m128i tmp0 = _mm_castps_si128(_a);
|
||||
const __m128i tmp1 = _mm_castps_si128(_b);
|
||||
const __m128i tmp2 = _mm_min_epi32(tmp0, tmp1);
|
||||
const float4_t result = _mm_castsi128_ps(tmp2);
|
||||
const float4_sse_t result = _mm_castsi128_ps(tmp2);
|
||||
|
||||
return result;
|
||||
#else
|
||||
return float4_imin_ni(_a, _b);
|
||||
#endif // defined(__SSE4_1__)
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_imax(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_imax(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
#if defined(__SSE4_1__)
|
||||
const __m128i tmp0 = _mm_castps_si128(_a);
|
||||
const __m128i tmp1 = _mm_castps_si128(_b);
|
||||
const __m128i tmp2 = _mm_max_epi32(tmp0, tmp1);
|
||||
const float4_t result = _mm_castsi128_ps(tmp2);
|
||||
const float4_sse_t result = _mm_castsi128_ps(tmp2);
|
||||
|
||||
return result;
|
||||
}
|
||||
#else
|
||||
return float4_imax_ni(_a, _b);
|
||||
#endif // defined(__SSE4_1__)
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_iadd(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_iadd(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
const __m128i a = _mm_castps_si128(_a);
|
||||
const __m128i b = _mm_castps_si128(_b);
|
||||
const __m128i add = _mm_add_epi32(a, b);
|
||||
const float4_t result = _mm_castsi128_ps(add);
|
||||
const float4_sse_t result = _mm_castsi128_ps(add);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_isub(float4_t _a, float4_t _b)
|
||||
template<>
|
||||
BX_FLOAT4_FORCE_INLINE float4_sse_t float4_isub(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
const __m128i a = _mm_castps_si128(_a);
|
||||
const __m128i b = _mm_castps_si128(_b);
|
||||
const __m128i sub = _mm_sub_epi32(a, b);
|
||||
const float4_t result = _mm_castsi128_ps(sub);
|
||||
const float4_sse_t result = _mm_castsi128_ps(sub);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_shuf_xAzC(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return float4_shuf_xAzC_ni(_a, _b);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_shuf_yBwD(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return float4_shuf_yBwD_ni(_a, _b);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_rcp(float4_sse_t _a)
|
||||
{
|
||||
return float4_rcp_ni(_a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_orx(float4_sse_t _a)
|
||||
{
|
||||
return float4_orx_ni(_a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_orc(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return float4_orc_ni(_a, _b);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_neg(float4_sse_t _a)
|
||||
{
|
||||
return float4_neg_ni(_a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_madd(float4_sse_t _a, float4_sse_t _b, float4_sse_t _c)
|
||||
{
|
||||
return float4_madd_ni(_a, _b, _c);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_nmsub(float4_sse_t _a, float4_sse_t _b, float4_sse_t _c)
|
||||
{
|
||||
return float4_nmsub_ni(_a, _b, _c);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_div_nr(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return float4_div_nr_ni(_a, _b);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_selb(float4_sse_t _mask, float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return float4_selb_ni(_mask, _a, _b);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_sels(float4_sse_t _test, float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return float4_sels_ni(_test, _a, _b);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_not(float4_sse_t _a)
|
||||
{
|
||||
return float4_not_ni(_a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_abs(float4_sse_t _a)
|
||||
{
|
||||
return float4_abs_ni(_a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_clamp(float4_sse_t _a, float4_sse_t _min, float4_sse_t _max)
|
||||
{
|
||||
return float4_clamp_ni(_a, _min, _max);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_lerp(float4_sse_t _a, float4_sse_t _b, float4_sse_t _s)
|
||||
{
|
||||
return float4_lerp_ni(_a, _b, _s);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_rsqrt(float4_sse_t _a)
|
||||
{
|
||||
return float4_rsqrt_ni(_a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_rsqrt_nr(float4_sse_t _a)
|
||||
{
|
||||
return float4_rsqrt_nr_ni(_a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_rsqrt_carmack(float4_sse_t _a)
|
||||
{
|
||||
return float4_rsqrt_carmack_ni(_a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_sqrt_nr(float4_sse_t _a)
|
||||
{
|
||||
return float4_sqrt_nr_ni(_a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_log2(float4_sse_t _a)
|
||||
{
|
||||
return float4_log2_ni(_a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_exp2(float4_sse_t _a)
|
||||
{
|
||||
return float4_exp2_ni(_a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_pow(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return float4_pow_ni(_a, _b);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_cross3(float4_sse_t _a, float4_sse_t _b)
|
||||
{
|
||||
return float4_cross3_ni(_a, _b);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_normalize3(float4_sse_t _a)
|
||||
{
|
||||
return float4_normalize3_ni(_a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_ceil(float4_sse_t _a)
|
||||
{
|
||||
return float4_ceil_ni(_a);
|
||||
}
|
||||
|
||||
template<>
|
||||
BX_FLOAT4_INLINE float4_sse_t float4_floor(float4_sse_t _a)
|
||||
{
|
||||
return float4_floor_ni(_a);
|
||||
}
|
||||
|
||||
typedef float4_sse_t float4_t;
|
||||
|
||||
} // namespace bx
|
||||
|
||||
#define float4_shuf_xAzC float4_shuf_xAzC_ni
|
||||
#define float4_shuf_yBwD float4_shuf_yBwD_ni
|
||||
#define float4_rcp float4_rcp_ni
|
||||
#define float4_orx float4_orx_ni
|
||||
#define float4_orc float4_orc_ni
|
||||
#define float4_neg float4_neg_ni
|
||||
#define float4_madd float4_madd_ni
|
||||
#define float4_nmsub float4_nmsub_ni
|
||||
#define float4_div_nr float4_div_nr_ni
|
||||
#define float4_selb float4_selb_ni
|
||||
#define float4_sels float4_sels_ni
|
||||
#define float4_not float4_not_ni
|
||||
#define float4_abs float4_abs_ni
|
||||
#define float4_clamp float4_clamp_ni
|
||||
#define float4_lerp float4_lerp_ni
|
||||
#define float4_rsqrt float4_rsqrt_ni
|
||||
#define float4_rsqrt_nr float4_rsqrt_nr_ni
|
||||
#define float4_rsqrt_carmack float4_rsqrt_carmack_ni
|
||||
#define float4_sqrt_nr float4_sqrt_nr_ni
|
||||
#define float4_log2 float4_log2_ni
|
||||
#define float4_exp2 float4_exp2_ni
|
||||
#define float4_pow float4_pow_ni
|
||||
#define float4_cross3 float4_cross3_ni
|
||||
#define float4_normalize3 float4_normalize3_ni
|
||||
#define float4_ceil float4_ceil_ni
|
||||
#define float4_floor float4_floor_ni
|
||||
|
||||
#if !defined(__SSE4_1__)
|
||||
# define float4_dot3 float4_dot3_ni
|
||||
# define float4_dot float4_dot_ni
|
||||
# define float4_imin float4_imin_ni
|
||||
# define float4_imax float4_imax_ni
|
||||
#endif // defined(__SSE4_1__)
|
||||
|
||||
#include "float4_ni.h"
|
||||
|
||||
#endif // BX_FLOAT4_SSE_H_HEADER_GUARD
|
||||
|
||||
@@ -9,7 +9,347 @@
|
||||
#include "bx.h"
|
||||
|
||||
#define BX_FLOAT4_FORCE_INLINE BX_FORCE_INLINE
|
||||
#define BX_FLOAT4_INLINE static inline
|
||||
#define BX_FLOAT4_INLINE inline
|
||||
|
||||
#if defined(__SSE2__) || (BX_COMPILER_MSVC && (BX_ARCH_64BIT || _M_IX86_FP >= 2) )
|
||||
# include <emmintrin.h> // __m128i
|
||||
# if defined(__SSE4_1__)
|
||||
# include <smmintrin.h>
|
||||
# endif // defined(__SSE4_1__)
|
||||
# include <xmmintrin.h> // __m128
|
||||
|
||||
namespace bx
|
||||
{
|
||||
typedef __m128 float4_sse_t;
|
||||
|
||||
} // namespace bx
|
||||
|
||||
#elif defined(__ARM_NEON__) && !BX_COMPILER_CLANG
|
||||
# include <arm_neon.h>
|
||||
|
||||
namespace bx
|
||||
{
|
||||
typedef float32x4_t float4_neon_t;
|
||||
|
||||
} // namespace bx
|
||||
|
||||
#elif BX_COMPILER_CLANG \
|
||||
&& !BX_PLATFORM_EMSCRIPTEN \
|
||||
&& !BX_PLATFORM_IOS \
|
||||
&& BX_CLANG_HAS_EXTENSION(attribute_ext_vector_type)
|
||||
# include <math.h>
|
||||
|
||||
namespace bx
|
||||
{
|
||||
typedef union float4_langext_t
|
||||
{
|
||||
float __attribute__((vector_size(16))) vf;
|
||||
int32_t __attribute__((vector_size(16))) vi;
|
||||
uint32_t __attribute__((vector_size(16))) vu;
|
||||
float fxyzw[4];
|
||||
int32_t ixyzw[4];
|
||||
uint32_t uxyzw[4];
|
||||
|
||||
} float4_langext_t;
|
||||
} // namespace bx
|
||||
#endif //
|
||||
|
||||
namespace bx
|
||||
{
|
||||
typedef union float4_ref_t
|
||||
{
|
||||
float fxyzw[4];
|
||||
int32_t ixyzw[4];
|
||||
uint32_t uxyzw[4];
|
||||
|
||||
} float4_ref_t;
|
||||
} // namespace bx
|
||||
|
||||
namespace bx
|
||||
{
|
||||
#define ELEMx 0
|
||||
#define ELEMy 1
|
||||
#define ELEMz 2
|
||||
#define ELEMw 3
|
||||
#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
|
||||
template<typename Ty> \
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_swiz_##_x##_y##_z##_w(Ty _a);
|
||||
#include "float4_swizzle.inl"
|
||||
|
||||
#undef IMPLEMENT_SWIZZLE
|
||||
#undef ELEMw
|
||||
#undef ELEMz
|
||||
#undef ELEMy
|
||||
#undef ELEMx
|
||||
|
||||
#define IMPLEMENT_TEST(_xyzw) \
|
||||
template<typename Ty> \
|
||||
BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(Ty _test); \
|
||||
\
|
||||
template<typename Ty> \
|
||||
BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(Ty _test)
|
||||
|
||||
IMPLEMENT_TEST(x );
|
||||
IMPLEMENT_TEST(y );
|
||||
IMPLEMENT_TEST(xy );
|
||||
IMPLEMENT_TEST(z );
|
||||
IMPLEMENT_TEST(xz );
|
||||
IMPLEMENT_TEST(yz );
|
||||
IMPLEMENT_TEST(xyz );
|
||||
IMPLEMENT_TEST(w );
|
||||
IMPLEMENT_TEST(xw );
|
||||
IMPLEMENT_TEST(yw );
|
||||
IMPLEMENT_TEST(xyw );
|
||||
IMPLEMENT_TEST(zw );
|
||||
IMPLEMENT_TEST(xzw );
|
||||
IMPLEMENT_TEST(yzw );
|
||||
IMPLEMENT_TEST(xyzw);
|
||||
#undef IMPLEMENT_TEST
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_shuf_xyAB(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_shuf_ABxy(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_shuf_CDzw(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_shuf_zwCD(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_shuf_xAyB(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_shuf_yBxA(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_shuf_zCwD(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_shuf_CzDw(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE float float4_x(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE float float4_y(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE float float4_z(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE float float4_w(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_ld(const void* _ptr);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_ld(float _x, float _y, float _z, float _w);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_splat(const void* _ptr);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_splat(float _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_isplat(uint32_t _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_zero();
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_itof(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_ftoi(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_round(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_add(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_sub(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_mul(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_div(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_rcp_est(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_sqrt(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_rsqrt_est(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_dot3(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_dot(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_cmpeq(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_cmplt(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_cmple(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_cmpgt(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_cmpge(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_min(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_max(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_and(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_andc(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_or(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_xor(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_sll(Ty _a, int _count);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_srl(Ty _a, int _count);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_sra(Ty _a, int _count);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_icmpeq(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_icmplt(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_icmpgt(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_imin(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_imax(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_iadd(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_FORCE_INLINE Ty float4_isub(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_shuf_xAzC(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_shuf_yBwD(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_rcp(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_orx(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_orc(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_neg(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_madd(Ty _a, Ty _b, Ty _c);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_nmsub(Ty _a, Ty _b, Ty _c);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_div_nr(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_selb(Ty _mask, Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_sels(Ty _test, Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_not(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_abs(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_clamp(Ty _a, Ty _min, Ty _max);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_lerp(Ty _a, Ty _b, Ty _s);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_rsqrt(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_rsqrt_nr(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_rsqrt_carmack(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_sqrt_nr(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_log2(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_exp2(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_pow(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_cross3(Ty _a, Ty _b);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_normalize3(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_ceil(Ty _a);
|
||||
|
||||
template<typename Ty>
|
||||
BX_FLOAT4_INLINE Ty float4_floor(Ty _a);
|
||||
|
||||
} // namespace bx
|
||||
|
||||
#if defined(__SSE2__) || (BX_COMPILER_MSVC && (BX_ARCH_64BIT || _M_IX86_FP >= 2) )
|
||||
# include "float4_sse.h"
|
||||
@@ -29,7 +369,50 @@
|
||||
# pragma message("************************************\nUsing SIMD reference implementation!\n************************************")
|
||||
# endif // BX_FLOAT4_WARN_REFERENCE_IMPL
|
||||
|
||||
# include "float4_ref.h"
|
||||
namespace bx
|
||||
{
|
||||
typedef float4_ref_t float4_t;
|
||||
}
|
||||
#endif //
|
||||
|
||||
#include "float4_ref.h"
|
||||
|
||||
namespace bx
|
||||
{
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_zero()
|
||||
{
|
||||
return float4_zero<float4_t>();
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_ld(const void* _ptr)
|
||||
{
|
||||
return float4_ld<float4_t>(_ptr);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w)
|
||||
{
|
||||
return float4_ld<float4_t>(_x, _y, _z, _w);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
|
||||
{
|
||||
return float4_ild<float4_t>(_x, _y, _z, _w);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_splat(const void* _ptr)
|
||||
{
|
||||
return float4_splat<float4_t>(_ptr);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_splat(float _a)
|
||||
{
|
||||
return float4_splat<float4_t>(_a);
|
||||
}
|
||||
|
||||
BX_FLOAT4_FORCE_INLINE float4_t float4_isplat(uint32_t _a)
|
||||
{
|
||||
return float4_isplat<float4_t>(_a);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // BX_FLOAT4_T_H_HEADER_GUARD
|
||||
|
||||
@@ -63,7 +63,7 @@
|
||||
#if BX_COMPILER_GCC || BX_COMPILER_CLANG
|
||||
# define BX_ALIGN_DECL(_align, _decl) _decl __attribute__( (aligned(_align) ) )
|
||||
# define BX_ALLOW_UNUSED __attribute__( (unused) )
|
||||
# define BX_FORCE_INLINE __extension__ static __inline __attribute__( (__always_inline__) )
|
||||
# define BX_FORCE_INLINE inline __attribute__( (__always_inline__) )
|
||||
# define BX_FUNCTION __PRETTY_FUNCTION__
|
||||
# define BX_LIKELY(_x) __builtin_expect(!!(_x), 1)
|
||||
# define BX_UNLIKELY(_x) __builtin_expect(!!(_x), 0)
|
||||
@@ -71,7 +71,7 @@
|
||||
# define BX_NO_RETURN __attribute__( (noreturn) )
|
||||
# define BX_NO_VTABLE
|
||||
# define BX_OVERRIDE
|
||||
# define BX_PRINTF_ARGS(_format, _args) __attribute__ ( (format(__printf__, _format, _args) ) )
|
||||
# define BX_PRINTF_ARGS(_format, _args) __attribute__( (format(__printf__, _format, _args) ) )
|
||||
# if BX_CLANG_HAS_FEATURE(cxx_thread_local)
|
||||
# define BX_THREAD_LOCAL __thread
|
||||
# endif // BX_COMPILER_CLANG
|
||||
|
||||
Reference in New Issue
Block a user