From 224e990ee4cbfdede21daa790970abebe141836c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Branimir=20Karad=C5=BEi=C4=87?= Date: Mon, 18 Jul 2016 19:03:09 -0700 Subject: [PATCH] Renaming float4_t to simd128_t. --- include/bx/float4_neon.h | 562 --------------- include/bx/float4_ni.h | 558 --------------- include/bx/float4_sse.h | 647 ------------------ include/bx/float4_t.h | 436 ------------ include/bx/float4x4_t.h | 208 +++--- .../{float4_langext.h => simd128_langext.inl} | 213 +++--- include/bx/simd128_neon.inl | 562 +++++++++++++++ include/bx/{float4_ref.h => simd128_ref.inl} | 272 ++++---- include/bx/simd128_sse.inl | 647 ++++++++++++++++++ include/bx/simd_ni.inl | 558 +++++++++++++++ .../{float4_swizzle.inl => simd_swizzle.inl} | 532 +++++++------- include/bx/simd_t.h | 436 ++++++++++++ tests/float4_t.cpp | 309 --------- tests/simd_t.cpp | 309 +++++++++ 14 files changed, 3125 insertions(+), 3124 deletions(-) delete mode 100644 include/bx/float4_neon.h delete mode 100644 include/bx/float4_ni.h delete mode 100644 include/bx/float4_sse.h delete mode 100644 include/bx/float4_t.h rename include/bx/{float4_langext.h => simd128_langext.inl} (53%) create mode 100644 include/bx/simd128_neon.inl rename include/bx/{float4_ref.h => simd128_ref.inl} (64%) create mode 100644 include/bx/simd128_sse.inl create mode 100644 include/bx/simd_ni.inl rename include/bx/{float4_swizzle.inl => simd_swizzle.inl} (95%) create mode 100644 include/bx/simd_t.h delete mode 100644 tests/float4_t.cpp create mode 100644 tests/simd_t.cpp diff --git a/include/bx/float4_neon.h b/include/bx/float4_neon.h deleted file mode 100644 index 32bda46..0000000 --- a/include/bx/float4_neon.h +++ /dev/null @@ -1,562 +0,0 @@ -/* - * Copyright 2010-2016 Branimir Karadzic. All rights reserved. - * License: https://github.com/bkaradzic/bx#license-bsd-2-clause - */ - -#ifndef BX_FLOAT4_NEON_H_HEADER_GUARD -#define BX_FLOAT4_NEON_H_HEADER_GUARD - -#define float4_rcp float4_rcp_ni -#define float4_orx float4_orx_ni -#define float4_orc float4_orc_ni -#define float4_neg float4_neg_ni -#define float4_madd float4_madd_ni -#define float4_nmsub float4_nmsub_ni -#define float4_div_nr float4_div_nr_ni -#define float4_div float4_div_nr_ni -#define float4_selb float4_selb_ni -#define float4_sels float4_sels_ni -#define float4_not float4_not_ni -#define float4_abs float4_abs_ni -#define float4_clamp float4_clamp_ni -#define float4_lerp float4_lerp_ni -#define float4_rsqrt float4_rsqrt_ni -#define float4_rsqrt_nr float4_rsqrt_nr_ni -#define float4_rsqrt_carmack float4_rsqrt_carmack_ni -#define float4_sqrt_nr float4_sqrt_nr_ni -#define float4_sqrt float4_sqrt_nr_ni -#define float4_log2 float4_log2_ni -#define float4_exp2 float4_exp2_ni -#define float4_pow float4_pow_ni -#define float4_cross3 float4_cross3_ni -#define float4_normalize3 float4_normalize3_ni -#define float4_dot3 float4_dot3_ni -#define float4_dot float4_dot_ni -#define float4_ceil float4_ceil_ni -#define float4_floor float4_floor_ni - -#include "float4_ni.h" - -namespace bx -{ -#define ELEMx 0 -#define ELEMy 1 -#define ELEMz 2 -#define ELEMw 3 -#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ - template<> \ - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_swiz_##_x##_y##_z##_w(float4_neon_t _a) \ - { \ - return __builtin_shuffle(_a, (uint32x4_t){ ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w }); \ - } - -#include "float4_swizzle.inl" - -#undef IMPLEMENT_SWIZZLE -#undef ELEMw -#undef ELEMz -#undef ELEMy -#undef ELEMx - -#define IMPLEMENT_TEST(_xyzw, _swizzle) \ - template<> \ - BX_FLOAT4_FORCE_INLINE bool 
float4_test_any_##_xyzw(float4_neon_t _test) \ - { \ - const float4_neon_t tmp0 = float4_swiz_##_swizzle(_test); \ - return float4_test_any_ni(tmp0); \ - } \ - \ - template<> \ - BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_neon_t _test) \ - { \ - const float4_neon_t tmp0 = float4_swiz_##_swizzle(_test); \ - return float4_test_all_ni(tmp0); \ - } - -IMPLEMENT_TEST(x, xxxx); -IMPLEMENT_TEST(y, yyyy); -IMPLEMENT_TEST(xy, xyyy); -IMPLEMENT_TEST(z, zzzz); -IMPLEMENT_TEST(xz, xzzz); -IMPLEMENT_TEST(yz, yzzz); -IMPLEMENT_TEST(xyz, xyzz); -IMPLEMENT_TEST(w, wwww); -IMPLEMENT_TEST(xw, xwww); -IMPLEMENT_TEST(yw, ywww); -IMPLEMENT_TEST(xyw, xyww); -IMPLEMENT_TEST(zw, zwww); -IMPLEMENT_TEST(xzw, xzww); -IMPLEMENT_TEST(yzw, yzww); -#undef IMPLEMENT_TEST - - template<> - BX_FLOAT4_FORCE_INLINE bool float4_test_any_xyzw(float4_neon_t _test) - { - return float4_test_any_ni(_test); - } - - template<> - BX_FLOAT4_FORCE_INLINE bool float4_test_all_xyzw(float4_neon_t _test) - { - return float4_test_all_ni(_test); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_xyAB(float4_neon_t _a, float4_neon_t _b) - { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 1, 4, 5 }); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_ABxy(float4_neon_t _a, float4_neon_t _b) - { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 4, 5, 0, 1 }); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_CDzw(float4_neon_t _a, float4_neon_t _b) - { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 7, 2, 3 }); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_zwCD(float4_neon_t _a, float4_neon_t _b) - { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 3, 6, 7 }); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_xAyB(float4_neon_t _a, float4_neon_t _b) - { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 4, 1, 5 }); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_yBxA(float4_neon_t _a, float4_neon_t _b) - { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 1, 5, 0, 4 }); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_zCwD(float4_neon_t _a, float4_neon_t _b) - { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 6, 3, 7 }); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_shuf_CzDw(float4_neon_t _a, float4_neon_t _b) - { - return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 2, 7, 3 }); - } - - template<> - BX_FLOAT4_FORCE_INLINE float float4_x(float4_neon_t _a) - { - return vgetq_lane_f32(_a, 0); - } - - template<> - BX_FLOAT4_FORCE_INLINE float float4_y(float4_neon_t _a) - { - return vgetq_lane_f32(_a, 1); - } - - template<> - BX_FLOAT4_FORCE_INLINE float float4_z(float4_neon_t _a) - { - return vgetq_lane_f32(_a, 2); - } - - template<> - BX_FLOAT4_FORCE_INLINE float float4_w(float4_neon_t _a) - { - return vgetq_lane_f32(_a, 3); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_ld(const void* _ptr) - { - return vld1q_f32( (const float32_t*)_ptr); - } - - template<> - BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_neon_t _a) - { - vst1q_f32( (float32_t*)_ptr, _a); - } - - template<> - BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_neon_t _a) - { - vst1q_lane_f32( (float32_t*)_ptr, _a, 0); - } - - template<> - BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_neon_t _a) - { - vst1q_f32( (float32_t*)_ptr, _a); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_ld(float _x, float _y, float _z, float _w) - { 
- const float32_t val[4] = {_x, _y, _z, _w}; - return float4_ld(val); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) - { - const uint32_t val[4] = {_x, _y, _z, _w}; - const uint32x4_t tmp = vld1q_u32(val); - const float4_neon_t result = vreinterpretq_f32_u32(tmp); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_splat(const void* _ptr) - { - const float4_neon_t tmp0 = vld1q_f32( (const float32_t*)_ptr); - const float32x2_t tmp1 = vget_low_f32(tmp0); - const float4_neon_t result = vdupq_lane_f32(tmp1, 0); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_splat(float _a) - { - return vdupq_n_f32(_a); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_isplat(uint32_t _a) - { - const int32x4_t tmp = vdupq_n_s32(_a); - const float4_neon_t result = vreinterpretq_f32_s32(tmp); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_zero() - { - return float4_isplat(0); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_itof(float4_neon_t _a) - { - const int32x4_t itof = vreinterpretq_s32_f32(_a); - const float4_neon_t result = vcvtq_f32_s32(itof); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_ftoi(float4_neon_t _a) - { - const int32x4_t ftoi = vcvtq_s32_f32(_a); - const float4_neon_t result = vreinterpretq_f32_s32(ftoi); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_add(float4_neon_t _a, float4_neon_t _b) - { - return vaddq_f32(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_sub(float4_neon_t _a, float4_neon_t _b) - { - return vsubq_f32(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_mul(float4_neon_t _a, float4_neon_t _b) - { - return vmulq_f32(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_rcp_est(float4_neon_t _a) - { - return vrecpeq_f32(_a); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_rsqrt_est(float4_neon_t _a) - { - return vrsqrteq_f32(_a); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_cmpeq(float4_neon_t _a, float4_neon_t _b) - { - const uint32x4_t tmp = vceqq_f32(_a, _b); - const float4_neon_t result = vreinterpretq_f32_u32(tmp); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_cmplt(float4_neon_t _a, float4_neon_t _b) - { - const uint32x4_t tmp = vcltq_f32(_a, _b); - const float4_neon_t result = vreinterpretq_f32_u32(tmp); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_cmple(float4_neon_t _a, float4_neon_t _b) - { - const uint32x4_t tmp = vcleq_f32(_a, _b); - const float4_neon_t result = vreinterpretq_f32_u32(tmp); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_cmpgt(float4_neon_t _a, float4_neon_t _b) - { - const uint32x4_t tmp = vcgtq_f32(_a, _b); - const float4_neon_t result = vreinterpretq_f32_u32(tmp); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_cmpge(float4_neon_t _a, float4_neon_t _b) - { - const uint32x4_t tmp = vcgeq_f32(_a, _b); - const float4_neon_t result = vreinterpretq_f32_u32(tmp); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_min(float4_neon_t _a, float4_neon_t _b) - { - return vminq_f32(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_max(float4_neon_t _a, float4_neon_t _b) - 
{ - return vmaxq_f32(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_and(float4_neon_t _a, float4_neon_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = vandq_s32(tmp0, tmp1); - const float4_neon_t result = vreinterpretq_f32_s32(tmp2); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_andc(float4_neon_t _a, float4_neon_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = vbicq_s32(tmp0, tmp1); - const float4_neon_t result = vreinterpretq_f32_s32(tmp2); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_or(float4_neon_t _a, float4_neon_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = vorrq_s32(tmp0, tmp1); - const float4_neon_t result = vreinterpretq_f32_s32(tmp2); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_xor(float4_neon_t _a, float4_neon_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = veorq_s32(tmp0, tmp1); - const float4_neon_t result = vreinterpretq_f32_s32(tmp2); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_sll(float4_neon_t _a, int _count) - { - if (__builtin_constant_p(_count) ) - { - const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); - const uint32x4_t tmp1 = vshlq_n_u32(tmp0, _count); - const float4_neon_t result = vreinterpretq_f32_u32(tmp1); - - return result; - } - - const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); - const int32x4_t shift = vdupq_n_s32(_count); - const uint32x4_t tmp1 = vshlq_u32(tmp0, shift); - const float4_neon_t result = vreinterpretq_f32_u32(tmp1); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_srl(float4_neon_t _a, int _count) - { - if (__builtin_constant_p(_count) ) - { - const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); - const uint32x4_t tmp1 = vshrq_n_u32(tmp0, _count); - const float4_neon_t result = vreinterpretq_f32_u32(tmp1); - - return result; - } - - const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); - const int32x4_t shift = vdupq_n_s32(-_count); - const uint32x4_t tmp1 = vshlq_u32(tmp0, shift); - const float4_neon_t result = vreinterpretq_f32_u32(tmp1); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_sra(float4_neon_t _a, int _count) - { - if (__builtin_constant_p(_count) ) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vshrq_n_s32(tmp0, _count); - const float4_neon_t result = vreinterpretq_f32_s32(tmp1); - - return result; - } - - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t shift = vdupq_n_s32(-_count); - const int32x4_t tmp1 = vshlq_s32(tmp0, shift); - const float4_neon_t result = vreinterpretq_f32_s32(tmp1); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_madd(float4_neon_t _a, float4_neon_t _b, float4_neon_t _c) - { - return vmlaq_f32(_c, _a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_nmsub(float4_neon_t _a, float4_neon_t _b, float4_neon_t _c) - { - return vmlsq_f32(_c, _a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_icmpeq(float4_neon_t _a, float4_neon_t _b) - { - const int32x4_t tmp0 = 
vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const uint32x4_t tmp2 = vceqq_s32(tmp0, tmp1); - const float4_neon_t result = vreinterpretq_f32_u32(tmp2); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_icmplt(float4_neon_t _a, float4_neon_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const uint32x4_t tmp2 = vcltq_s32(tmp0, tmp1); - const float4_neon_t result = vreinterpretq_f32_u32(tmp2); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_icmpgt(float4_neon_t _a, float4_neon_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const uint32x4_t tmp2 = vcgtq_s32(tmp0, tmp1); - const float4_neon_t result = vreinterpretq_f32_u32(tmp2); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_imin(float4_neon_t _a, float4_neon_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = vminq_s32(tmp0, tmp1); - const float4_neon_t result = vreinterpretq_f32_s32(tmp2); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_imax(float4_neon_t _a, float4_neon_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = vmaxq_s32(tmp0, tmp1); - const float4_neon_t result = vreinterpretq_f32_s32(tmp2); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_iadd(float4_neon_t _a, float4_neon_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = vaddq_s32(tmp0, tmp1); - const float4_neon_t result = vreinterpretq_f32_s32(tmp2); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_neon_t float4_isub(float4_neon_t _a, float4_neon_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = vsubq_s32(tmp0, tmp1); - const float4_neon_t result = vreinterpretq_f32_s32(tmp2); - - return result; - } - - template<> - BX_FLOAT4_INLINE float4_neon_t float4_shuf_xAzC(float4_neon_t _a, float4_neon_t _b) - { - return float4_shuf_xAzC_ni(_a, _b); - } - - template<> - BX_FLOAT4_INLINE float4_neon_t float4_shuf_yBwD(float4_neon_t _a, float4_neon_t _b) - { - return float4_shuf_yBwD_ni(_a, _b); - } - - typedef float4_neon_t float4_t; - -} // namespace bx - -#endif // BX_FLOAT4_NEON_H_HEADER_GUARD diff --git a/include/bx/float4_ni.h b/include/bx/float4_ni.h deleted file mode 100644 index a0dc65f..0000000 --- a/include/bx/float4_ni.h +++ /dev/null @@ -1,558 +0,0 @@ -/* - * Copyright 2010-2016 Branimir Karadzic. All rights reserved. 
- * License: https://github.com/bkaradzic/bx#license-bsd-2-clause - */ - -#ifndef BX_FLOAT4_NI_H_HEADER_GUARD -#define BX_FLOAT4_NI_H_HEADER_GUARD - -namespace bx -{ - template - BX_FLOAT4_INLINE Ty float4_shuf_xAzC_ni(Ty _a, Ty _b) - { - const Ty xAyB = float4_shuf_xAyB(_a, _b); - const Ty zCwD = float4_shuf_zCwD(_a, _b); - const Ty result = float4_shuf_xyAB(xAyB, zCwD); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_shuf_yBwD_ni(Ty _a, Ty _b) - { - const Ty xAyB = float4_shuf_xAyB(_a, _b); - const Ty zCwD = float4_shuf_zCwD(_a, _b); - const Ty result = float4_shuf_zwCD(xAyB, zCwD); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_madd_ni(Ty _a, Ty _b, Ty _c) - { - const Ty mul = float4_mul(_a, _b); - const Ty result = float4_add(mul, _c); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_nmsub_ni(Ty _a, Ty _b, Ty _c) - { - const Ty mul = float4_mul(_a, _b); - const Ty result = float4_sub(_c, mul); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_div_nr_ni(Ty _a, Ty _b) - { - const Ty oneish = float4_isplat(0x3f800001); - const Ty est = float4_rcp_est(_b); - const Ty iter0 = float4_mul(_a, est); - const Ty tmp1 = float4_nmsub(_b, est, oneish); - const Ty result = float4_madd(tmp1, iter0, iter0); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_rcp_ni(Ty _a) - { - const Ty one = float4_splat(1.0f); - const Ty result = float4_div(one, _a); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_orx_ni(Ty _a) - { - const Ty zwxy = float4_swiz_zwxy(_a); - const Ty tmp0 = float4_or(_a, zwxy); - const Ty tmp1 = float4_swiz_yyyy(_a); - const Ty tmp2 = float4_or(tmp0, tmp1); - const Ty mf000 = float4_ild(UINT32_MAX, 0, 0, 0); - const Ty result = float4_and(tmp2, mf000); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_orc_ni(Ty _a, Ty _b) - { - const Ty aorb = float4_or(_a, _b); - const Ty mffff = float4_isplat(UINT32_MAX); - const Ty result = float4_xor(aorb, mffff); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_neg_ni(Ty _a) - { - const Ty zero = float4_zero(); - const Ty result = float4_sub(zero, _a); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_selb_ni(Ty _mask, Ty _a, Ty _b) - { - const Ty sel_a = float4_and(_a, _mask); - const Ty sel_b = float4_andc(_b, _mask); - const Ty result = float4_or(sel_a, sel_b); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_sels_ni(Ty _test, Ty _a, Ty _b) - { - const Ty mask = float4_sra(_test, 31); - const Ty result = float4_selb(mask, _a, _b); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_not_ni(Ty _a) - { - const Ty mffff = float4_isplat(UINT32_MAX); - const Ty result = float4_xor(_a, mffff); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_min_ni(Ty _a, Ty _b) - { - const Ty mask = float4_cmplt(_a, _b); - const Ty result = float4_selb(mask, _a, _b); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_max_ni(Ty _a, Ty _b) - { - const Ty mask = float4_cmpgt(_a, _b); - const Ty result = float4_selb(mask, _a, _b); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_abs_ni(Ty _a) - { - const Ty a_neg = float4_neg(_a); - const Ty result = float4_max(a_neg, _a); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_imin_ni(Ty _a, Ty _b) - { - const Ty mask = float4_icmplt(_a, _b); - const Ty result = float4_selb(mask, _a, _b); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_imax_ni(Ty _a, Ty _b) - { - const Ty mask = 
float4_icmpgt(_a, _b); - const Ty result = float4_selb(mask, _a, _b); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_clamp_ni(Ty _a, Ty _min, Ty _max) - { - const Ty tmp = float4_min(_a, _max); - const Ty result = float4_max(tmp, _min); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_lerp_ni(Ty _a, Ty _b, Ty _s) - { - const Ty ba = float4_sub(_b, _a); - const Ty result = float4_madd(_s, ba, _a); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_sqrt_nr_ni(Ty _a) - { - const Ty half = float4_splat(0.5f); - const Ty one = float4_splat(1.0f); - const Ty tmp0 = float4_rsqrt_est(_a); - const Ty tmp1 = float4_mul(tmp0, _a); - const Ty tmp2 = float4_mul(tmp1, half); - const Ty tmp3 = float4_nmsub(tmp0, tmp1, one); - const Ty result = float4_madd(tmp3, tmp2, tmp1); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_sqrt_nr1_ni(Ty _a) - { - const Ty half = float4_splat(0.5f); - - Ty result = _a; - for (uint32_t ii = 0; ii < 11; ++ii) - { - const Ty tmp1 = float4_div(_a, result); - const Ty tmp2 = float4_add(tmp1, result); - result = float4_mul(tmp2, half); - } - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_rsqrt_ni(Ty _a) - { - const Ty one = float4_splat(1.0f); - const Ty sqrt = float4_sqrt(_a); - const Ty result = float4_div(one, sqrt); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_rsqrt_nr_ni(Ty _a) - { - const Ty rsqrt = float4_rsqrt_est(_a); - const Ty iter0 = float4_mul(_a, rsqrt); - const Ty iter1 = float4_mul(iter0, rsqrt); - const Ty half = float4_splat(0.5f); - const Ty half_rsqrt = float4_mul(half, rsqrt); - const Ty three = float4_splat(3.0f); - const Ty three_sub_iter1 = float4_sub(three, iter1); - const Ty result = float4_mul(half_rsqrt, three_sub_iter1); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_rsqrt_carmack_ni(Ty _a) - { - const Ty half = float4_splat(0.5f); - const Ty ah = float4_mul(half, _a); - const Ty ashift = float4_sra(_a, 1); - const Ty magic = float4_isplat(0x5f3759df); - const Ty msuba = float4_isub(magic, ashift); - const Ty msubasq = float4_mul(msuba, msuba); - const Ty tmp0 = float4_splat(1.5f); - const Ty tmp1 = float4_mul(ah, msubasq); - const Ty tmp2 = float4_sub(tmp0, tmp1); - const Ty result = float4_mul(msuba, tmp2); - - return result; - } - - namespace float4_logexp_detail - { - template - BX_FLOAT4_INLINE Ty float4_poly1(Ty _a, float _b, float _c) - { - const Ty bbbb = float4_splat(_b); - const Ty cccc = float4_splat(_c); - const Ty result = float4_madd(cccc, _a, bbbb); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_poly2(Ty _a, float _b, float _c, float _d) - { - const Ty bbbb = float4_splat(_b); - const Ty poly = float4_poly1(_a, _c, _d); - const Ty result = float4_madd(poly, _a, bbbb); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_poly3(Ty _a, float _b, float _c, float _d, float _e) - { - const Ty bbbb = float4_splat(_b); - const Ty poly = float4_poly2(_a, _c, _d, _e); - const Ty result = float4_madd(poly, _a, bbbb); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_poly4(Ty _a, float _b, float _c, float _d, float _e, float _f) - { - const Ty bbbb = float4_splat(_b); - const Ty poly = float4_poly3(_a, _c, _d, _e, _f); - const Ty result = float4_madd(poly, _a, bbbb); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_poly5(Ty _a, float _b, float _c, float _d, float _e, float _f, float _g) - { - const Ty bbbb = float4_splat(_b); - const Ty poly = float4_poly4(_a, _c, _d, 
_e, _f, _g); - const Ty result = float4_madd(poly, _a, bbbb); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_logpoly(Ty _a) - { -#if 1 - const Ty result = float4_poly5(_a - , 3.11578814719469302614f, -3.32419399085241980044f - , 2.59883907202499966007f, -1.23152682416275988241f - , 0.318212422185251071475f, -0.0344359067839062357313f - ); -#elif 0 - const Ty result = float4_poly4(_a - , 2.8882704548164776201f, -2.52074962577807006663f - , 1.48116647521213171641f, -0.465725644288844778798f - , 0.0596515482674574969533f - ); -#elif 0 - const Ty result = float4_poly3(_a - , 2.61761038894603480148f, -1.75647175389045657003f - , 0.688243882994381274313f, -0.107254423828329604454f - ); -#else - const Ty result = float4_poly2(_a - , 2.28330284476918490682f, -1.04913055217340124191f - , 0.204446009836232697516f - ); -#endif - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_exppoly(Ty _a) - { -#if 1 - const Ty result = float4_poly5(_a - , 9.9999994e-1f, 6.9315308e-1f - , 2.4015361e-1f, 5.5826318e-2f - , 8.9893397e-3f, 1.8775767e-3f - ); -#elif 0 - const Ty result = float4_poly4(_a - , 1.0000026f, 6.9300383e-1f - , 2.4144275e-1f, 5.2011464e-2f - , 1.3534167e-2f - ); -#elif 0 - const Ty result = float4_poly3(_a - , 9.9992520e-1f, 6.9583356e-1f - , 2.2606716e-1f, 7.8024521e-2f - ); -#else - const Ty result = float4_poly2(_a - , 1.0017247f, 6.5763628e-1f - , 3.3718944e-1f - ); -#endif // 0 - - return result; - } - } // namespace float4_internal - - template - BX_FLOAT4_INLINE Ty float4_log2_ni(Ty _a) - { - const Ty expmask = float4_isplat(0x7f800000); - const Ty mantmask = float4_isplat(0x007fffff); - const Ty one = float4_splat(1.0f); - - const Ty c127 = float4_isplat(127); - const Ty aexp = float4_and(_a, expmask); - const Ty aexpsr = float4_srl(aexp, 23); - const Ty tmp0 = float4_isub(aexpsr, c127); - const Ty exp = float4_itof(tmp0); - - const Ty amask = float4_and(_a, mantmask); - const Ty mant = float4_or(amask, one); - - const Ty poly = float4_logexp_detail::float4_logpoly(mant); - - const Ty mandiff = float4_sub(mant, one); - const Ty result = float4_madd(poly, mandiff, exp); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_exp2_ni(Ty _a) - { - const Ty min = float4_splat( 129.0f); - const Ty max = float4_splat(-126.99999f); - const Ty tmp0 = float4_min(_a, min); - const Ty aaaa = float4_max(tmp0, max); - - const Ty half = float4_splat(0.5f); - const Ty tmp2 = float4_sub(aaaa, half); - const Ty ipart = float4_ftoi(tmp2); - const Ty iround = float4_itof(ipart); - const Ty fpart = float4_sub(aaaa, iround); - - const Ty c127 = float4_isplat(127); - const Ty tmp5 = float4_iadd(ipart, c127); - const Ty expipart = float4_sll(tmp5, 23); - - const Ty expfpart = float4_logexp_detail::float4_exppoly(fpart); - - const Ty result = float4_mul(expipart, expfpart); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_pow_ni(Ty _a, Ty _b) - { - const Ty alog2 = float4_log2(_a); - const Ty alog2b = float4_mul(alog2, _b); - const Ty result = float4_exp2(alog2b); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_dot3_ni(Ty _a, Ty _b) - { - const Ty xyzw = float4_mul(_a, _b); - const Ty xxxx = float4_swiz_xxxx(xyzw); - const Ty yyyy = float4_swiz_yyyy(xyzw); - const Ty zzzz = float4_swiz_zzzz(xyzw); - const Ty tmp1 = float4_add(xxxx, yyyy); - const Ty result = float4_add(zzzz, tmp1); - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_cross3_ni(Ty _a, Ty _b) - { - // a.yzx * b.zxy - a.zxy * b.yzx == (a * b.yzx - a.yzx * b).yzx 
-#if 0 - const Ty a_yzxw = float4_swiz_yzxw(_a); - const Ty a_zxyw = float4_swiz_zxyw(_a); - const Ty b_zxyw = float4_swiz_zxyw(_b); - const Ty b_yzxw = float4_swiz_yzxw(_b); - const Ty tmp = float4_mul(a_yzxw, b_zxyw); - const Ty result = float4_nmsub(a_zxyw, b_yzxw, tmp); -#else - const Ty a_yzxw = float4_swiz_yzxw(_a); - const Ty b_yzxw = float4_swiz_yzxw(_b); - const Ty tmp0 = float4_mul(_a, b_yzxw); - const Ty tmp1 = float4_nmsub(a_yzxw, _b, tmp0); - const Ty result = float4_swiz_yzxw(tmp1); -#endif - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_normalize3_ni(Ty _a) - { - const Ty dot3 = float4_dot3(_a, _a); - const Ty invSqrt = float4_rsqrt(dot3); - const Ty result = float4_mul(_a, invSqrt); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_dot_ni(Ty _a, Ty _b) - { - const Ty xyzw = float4_mul(_a, _b); - const Ty yzwx = float4_swiz_yzwx(xyzw); - const Ty tmp0 = float4_add(xyzw, yzwx); - const Ty zwxy = float4_swiz_zwxy(tmp0); - const Ty result = float4_add(tmp0, zwxy); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_ceil_ni(Ty _a) - { - const Ty tmp0 = float4_ftoi(_a); - const Ty tmp1 = float4_itof(tmp0); - const Ty mask = float4_cmplt(tmp1, _a); - const Ty one = float4_splat(1.0f); - const Ty tmp2 = float4_and(one, mask); - const Ty result = float4_add(tmp1, tmp2); - - return result; - } - - template - BX_FLOAT4_INLINE Ty float4_floor_ni(Ty _a) - { - const Ty tmp0 = float4_ftoi(_a); - const Ty tmp1 = float4_itof(tmp0); - const Ty mask = float4_cmpgt(tmp1, _a); - const Ty one = float4_splat(1.0f); - const Ty tmp2 = float4_and(one, mask); - const Ty result = float4_sub(tmp1, tmp2); - - return result; - } - - template - BX_FLOAT4_FORCE_INLINE Ty float4_round_ni(Ty _a) - { - const Ty tmp = float4_ftoi(_a); - const Ty result = float4_itof(tmp); - - return result; - } - - template - BX_FLOAT4_INLINE bool float4_test_any_ni(Ty _a) - { - const Ty mask = float4_sra(_a, 31); - const Ty zwxy = float4_swiz_zwxy(mask); - const Ty tmp0 = float4_or(mask, zwxy); - const Ty tmp1 = float4_swiz_yyyy(tmp0); - const Ty tmp2 = float4_or(tmp0, tmp1); - int res; - float4_stx(&res, tmp2); - return 0 != res; - } - - template - BX_FLOAT4_INLINE bool float4_test_all_ni(Ty _a) - { - const Ty bits = float4_sra(_a, 31); - const Ty m1248 = float4_ild(1, 2, 4, 8); - const Ty mask = float4_and(bits, m1248); - const Ty zwxy = float4_swiz_zwxy(mask); - const Ty tmp0 = float4_or(mask, zwxy); - const Ty tmp1 = float4_swiz_yyyy(tmp0); - const Ty tmp2 = float4_or(tmp0, tmp1); - int res; - float4_stx(&res, tmp2); - return 0xf == res; - } - -} // namespace bx - -#endif // BX_FLOAT4_NI_H_HEADER_GUARD diff --git a/include/bx/float4_sse.h b/include/bx/float4_sse.h deleted file mode 100644 index f5e91fe..0000000 --- a/include/bx/float4_sse.h +++ /dev/null @@ -1,647 +0,0 @@ -/* - * Copyright 2010-2016 Branimir Karadzic. All rights reserved. 
- * License: https://github.com/bkaradzic/bx#license-bsd-2-clause - */ - -#ifndef BX_FLOAT4_SSE_H_HEADER_GUARD -#define BX_FLOAT4_SSE_H_HEADER_GUARD - -#include "float4_ni.h" - -namespace bx -{ -#define ELEMx 0 -#define ELEMy 1 -#define ELEMz 2 -#define ELEMw 3 -#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ - template<> \ - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_swiz_##_x##_y##_z##_w(float4_sse_t _a) \ - { \ - return _mm_shuffle_ps( _a, _a, _MM_SHUFFLE(ELEM##_w, ELEM##_z, ELEM##_y, ELEM##_x ) ); \ - } - -#include "float4_swizzle.inl" - -#undef IMPLEMENT_SWIZZLE -#undef ELEMw -#undef ELEMz -#undef ELEMy -#undef ELEMx - -#define IMPLEMENT_TEST(_xyzw, _mask) \ - template<> \ - BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_sse_t _test) \ - { \ - return 0x0 != (_mm_movemask_ps(_test)&(_mask) ); \ - } \ - \ - template<> \ - BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_sse_t _test) \ - { \ - return (_mask) == (_mm_movemask_ps(_test)&(_mask) ); \ - } - -IMPLEMENT_TEST(x , 0x1); -IMPLEMENT_TEST(y , 0x2); -IMPLEMENT_TEST(xy , 0x3); -IMPLEMENT_TEST(z , 0x4); -IMPLEMENT_TEST(xz , 0x5); -IMPLEMENT_TEST(yz , 0x6); -IMPLEMENT_TEST(xyz , 0x7); -IMPLEMENT_TEST(w , 0x8); -IMPLEMENT_TEST(xw , 0x9); -IMPLEMENT_TEST(yw , 0xa); -IMPLEMENT_TEST(xyw , 0xb); -IMPLEMENT_TEST(zw , 0xc); -IMPLEMENT_TEST(xzw , 0xd); -IMPLEMENT_TEST(yzw , 0xe); -IMPLEMENT_TEST(xyzw , 0xf); - -#undef IMPLEMENT_TEST - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_xyAB(float4_sse_t _a, float4_sse_t _b) - { - return _mm_movelh_ps(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_ABxy(float4_sse_t _a, float4_sse_t _b) - { - return _mm_movelh_ps(_b, _a); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_CDzw(float4_sse_t _a, float4_sse_t _b) - { - return _mm_movehl_ps(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_zwCD(float4_sse_t _a, float4_sse_t _b) - { - return _mm_movehl_ps(_b, _a); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_xAyB(float4_sse_t _a, float4_sse_t _b) - { - return _mm_unpacklo_ps(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_yBxA(float4_sse_t _a, float4_sse_t _b) - { - return _mm_unpacklo_ps(_b, _a); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_zCwD(float4_sse_t _a, float4_sse_t _b) - { - return _mm_unpackhi_ps(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_shuf_CzDw(float4_sse_t _a, float4_sse_t _b) - { - return _mm_unpackhi_ps(_b, _a); - } - - template<> - BX_FLOAT4_FORCE_INLINE float float4_x(float4_sse_t _a) - { - return _mm_cvtss_f32(_a); - } - - template<> - BX_FLOAT4_FORCE_INLINE float float4_y(float4_sse_t _a) - { - const float4_sse_t yyyy = float4_swiz_yyyy(_a); - const float result = _mm_cvtss_f32(yyyy); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float float4_z(float4_sse_t _a) - { - const float4_sse_t zzzz = float4_swiz_zzzz(_a); - const float result = _mm_cvtss_f32(zzzz); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float float4_w(float4_sse_t _a) - { - const float4_sse_t wwww = float4_swiz_wwww(_a); - const float result = _mm_cvtss_f32(wwww); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_ld(const void* _ptr) - { - return _mm_load_ps(reinterpret_cast(_ptr) ); - } - - template<> - BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_sse_t _a) - { - _mm_store_ps(reinterpret_cast(_ptr), _a); - } - - template<> 
- BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_sse_t _a) - { - _mm_store_ss(reinterpret_cast(_ptr), _a); - } - - template<> - BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_sse_t _a) - { - _mm_stream_ps(reinterpret_cast(_ptr), _a); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_ld(float _x, float _y, float _z, float _w) - { - return _mm_set_ps(_w, _z, _y, _x); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) - { - const __m128i set = _mm_set_epi32(_w, _z, _y, _x); - const float4_sse_t result = _mm_castsi128_ps(set); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_splat(const void* _ptr) - { - const float4_sse_t x___ = _mm_load_ss(reinterpret_cast(_ptr) ); - const float4_sse_t result = float4_swiz_xxxx(x___); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_splat(float _a) - { - return _mm_set1_ps(_a); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_isplat(uint32_t _a) - { - const __m128i splat = _mm_set1_epi32(_a); - const float4_sse_t result = _mm_castsi128_ps(splat); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_zero() - { - return _mm_setzero_ps(); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_itof(float4_sse_t _a) - { - const __m128i itof = _mm_castps_si128(_a); - const float4_sse_t result = _mm_cvtepi32_ps(itof); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_ftoi(float4_sse_t _a) - { - const __m128i ftoi = _mm_cvtps_epi32(_a); - const float4_sse_t result = _mm_castsi128_ps(ftoi); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_round(float4_sse_t _a) - { -#if defined(__SSE4_1__) - return _mm_round_ps(_a, _MM_FROUND_NINT); -#else - const __m128i round = _mm_cvtps_epi32(_a); - const float4_sse_t result = _mm_cvtepi32_ps(round); - - return result; -#endif // defined(__SSE4_1__) - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_add(float4_sse_t _a, float4_sse_t _b) - { - return _mm_add_ps(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_sub(float4_sse_t _a, float4_sse_t _b) - { - return _mm_sub_ps(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_mul(float4_sse_t _a, float4_sse_t _b) - { - return _mm_mul_ps(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_div(float4_sse_t _a, float4_sse_t _b) - { - return _mm_div_ps(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_rcp_est(float4_sse_t _a) - { - return _mm_rcp_ps(_a); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_sqrt(float4_sse_t _a) - { - return _mm_sqrt_ps(_a); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_rsqrt_est(float4_sse_t _a) - { - return _mm_rsqrt_ps(_a); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_dot3(float4_sse_t _a, float4_sse_t _b) - { -#if defined(__SSE4_1__) - return _mm_dp_ps(_a, _b, 0x77); -#else - return float4_dot3_ni(_a, _b); -#endif // defined(__SSE4__) - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_dot(float4_sse_t _a, float4_sse_t _b) - { -#if defined(__SSE4_1__) - return _mm_dp_ps(_a, _b, 0xFF); -#else - return float4_dot_ni(_a, _b); -#endif // defined(__SSE4__) - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_cmpeq(float4_sse_t _a, float4_sse_t _b) - { - return _mm_cmpeq_ps(_a, _b); - } - - 
template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_cmplt(float4_sse_t _a, float4_sse_t _b) - { - return _mm_cmplt_ps(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_cmple(float4_sse_t _a, float4_sse_t _b) - { - return _mm_cmple_ps(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_cmpgt(float4_sse_t _a, float4_sse_t _b) - { - return _mm_cmpgt_ps(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_cmpge(float4_sse_t _a, float4_sse_t _b) - { - return _mm_cmpge_ps(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_min(float4_sse_t _a, float4_sse_t _b) - { - return _mm_min_ps(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_max(float4_sse_t _a, float4_sse_t _b) - { - return _mm_max_ps(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_and(float4_sse_t _a, float4_sse_t _b) - { - return _mm_and_ps(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_andc(float4_sse_t _a, float4_sse_t _b) - { - return _mm_andnot_ps(_b, _a); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_or(float4_sse_t _a, float4_sse_t _b) - { - return _mm_or_ps(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_xor(float4_sse_t _a, float4_sse_t _b) - { - return _mm_xor_ps(_a, _b); - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_sll(float4_sse_t _a, int _count) - { - const __m128i a = _mm_castps_si128(_a); - const __m128i shift = _mm_slli_epi32(a, _count); - const float4_sse_t result = _mm_castsi128_ps(shift); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_srl(float4_sse_t _a, int _count) - { - const __m128i a = _mm_castps_si128(_a); - const __m128i shift = _mm_srli_epi32(a, _count); - const float4_sse_t result = _mm_castsi128_ps(shift); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_sra(float4_sse_t _a, int _count) - { - const __m128i a = _mm_castps_si128(_a); - const __m128i shift = _mm_srai_epi32(a, _count); - const float4_sse_t result = _mm_castsi128_ps(shift); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_icmpeq(float4_sse_t _a, float4_sse_t _b) - { - const __m128i tmp0 = _mm_castps_si128(_a); - const __m128i tmp1 = _mm_castps_si128(_b); - const __m128i tmp2 = _mm_cmpeq_epi32(tmp0, tmp1); - const float4_sse_t result = _mm_castsi128_ps(tmp2); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_icmplt(float4_sse_t _a, float4_sse_t _b) - { - const __m128i tmp0 = _mm_castps_si128(_a); - const __m128i tmp1 = _mm_castps_si128(_b); - const __m128i tmp2 = _mm_cmplt_epi32(tmp0, tmp1); - const float4_sse_t result = _mm_castsi128_ps(tmp2); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_icmpgt(float4_sse_t _a, float4_sse_t _b) - { - const __m128i tmp0 = _mm_castps_si128(_a); - const __m128i tmp1 = _mm_castps_si128(_b); - const __m128i tmp2 = _mm_cmpgt_epi32(tmp0, tmp1); - const float4_sse_t result = _mm_castsi128_ps(tmp2); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_imin(float4_sse_t _a, float4_sse_t _b) - { -#if defined(__SSE4_1__) - const __m128i tmp0 = _mm_castps_si128(_a); - const __m128i tmp1 = _mm_castps_si128(_b); - const __m128i tmp2 = _mm_min_epi32(tmp0, tmp1); - const float4_sse_t result = _mm_castsi128_ps(tmp2); - - return result; -#else - return float4_imin_ni(_a, _b); -#endif // defined(__SSE4_1__) - 
} - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_imax(float4_sse_t _a, float4_sse_t _b) - { -#if defined(__SSE4_1__) - const __m128i tmp0 = _mm_castps_si128(_a); - const __m128i tmp1 = _mm_castps_si128(_b); - const __m128i tmp2 = _mm_max_epi32(tmp0, tmp1); - const float4_sse_t result = _mm_castsi128_ps(tmp2); - - return result; -#else - return float4_imax_ni(_a, _b); -#endif // defined(__SSE4_1__) - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_iadd(float4_sse_t _a, float4_sse_t _b) - { - const __m128i a = _mm_castps_si128(_a); - const __m128i b = _mm_castps_si128(_b); - const __m128i add = _mm_add_epi32(a, b); - const float4_sse_t result = _mm_castsi128_ps(add); - - return result; - } - - template<> - BX_FLOAT4_FORCE_INLINE float4_sse_t float4_isub(float4_sse_t _a, float4_sse_t _b) - { - const __m128i a = _mm_castps_si128(_a); - const __m128i b = _mm_castps_si128(_b); - const __m128i sub = _mm_sub_epi32(a, b); - const float4_sse_t result = _mm_castsi128_ps(sub); - - return result; - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_shuf_xAzC(float4_sse_t _a, float4_sse_t _b) - { - return float4_shuf_xAzC_ni(_a, _b); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_shuf_yBwD(float4_sse_t _a, float4_sse_t _b) - { - return float4_shuf_yBwD_ni(_a, _b); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_rcp(float4_sse_t _a) - { - return float4_rcp_ni(_a); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_orx(float4_sse_t _a) - { - return float4_orx_ni(_a); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_orc(float4_sse_t _a, float4_sse_t _b) - { - return float4_orc_ni(_a, _b); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_neg(float4_sse_t _a) - { - return float4_neg_ni(_a); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_madd(float4_sse_t _a, float4_sse_t _b, float4_sse_t _c) - { - return float4_madd_ni(_a, _b, _c); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_nmsub(float4_sse_t _a, float4_sse_t _b, float4_sse_t _c) - { - return float4_nmsub_ni(_a, _b, _c); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_div_nr(float4_sse_t _a, float4_sse_t _b) - { - return float4_div_nr_ni(_a, _b); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_selb(float4_sse_t _mask, float4_sse_t _a, float4_sse_t _b) - { - return float4_selb_ni(_mask, _a, _b); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_sels(float4_sse_t _test, float4_sse_t _a, float4_sse_t _b) - { - return float4_sels_ni(_test, _a, _b); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_not(float4_sse_t _a) - { - return float4_not_ni(_a); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_abs(float4_sse_t _a) - { - return float4_abs_ni(_a); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_clamp(float4_sse_t _a, float4_sse_t _min, float4_sse_t _max) - { - return float4_clamp_ni(_a, _min, _max); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_lerp(float4_sse_t _a, float4_sse_t _b, float4_sse_t _s) - { - return float4_lerp_ni(_a, _b, _s); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_rsqrt(float4_sse_t _a) - { - return float4_rsqrt_ni(_a); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_rsqrt_nr(float4_sse_t _a) - { - return float4_rsqrt_nr_ni(_a); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_rsqrt_carmack(float4_sse_t _a) - { - return float4_rsqrt_carmack_ni(_a); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_sqrt_nr(float4_sse_t _a) - { - 
return float4_sqrt_nr_ni(_a); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_log2(float4_sse_t _a) - { - return float4_log2_ni(_a); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_exp2(float4_sse_t _a) - { - return float4_exp2_ni(_a); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_pow(float4_sse_t _a, float4_sse_t _b) - { - return float4_pow_ni(_a, _b); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_cross3(float4_sse_t _a, float4_sse_t _b) - { - return float4_cross3_ni(_a, _b); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_normalize3(float4_sse_t _a) - { - return float4_normalize3_ni(_a); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_ceil(float4_sse_t _a) - { - return float4_ceil_ni(_a); - } - - template<> - BX_FLOAT4_INLINE float4_sse_t float4_floor(float4_sse_t _a) - { - return float4_floor_ni(_a); - } - - typedef float4_sse_t float4_t; - -} // namespace bx - -#endif // BX_FLOAT4_SSE_H_HEADER_GUARD diff --git a/include/bx/float4_t.h b/include/bx/float4_t.h deleted file mode 100644 index a449546..0000000 --- a/include/bx/float4_t.h +++ /dev/null @@ -1,436 +0,0 @@ -/* - * Copyright 2010-2016 Branimir Karadzic. All rights reserved. - * License: https://github.com/bkaradzic/bx#license-bsd-2-clause - */ - -#ifndef BX_FLOAT4_T_H_HEADER_GUARD -#define BX_FLOAT4_T_H_HEADER_GUARD - -#include "bx.h" - -#define BX_FLOAT4_FORCE_INLINE BX_FORCE_INLINE -#define BX_FLOAT4_INLINE inline - -#define BX_FLOAT4_SSE 0 -#define BX_FLOAT4_AVX 0 -#define BX_FLOAT4_NEON 0 -#define BX_FLOAT4_LANGEXT 0 - -#if defined(__SSE2__) || (BX_COMPILER_MSVC && (BX_ARCH_64BIT || _M_IX86_FP >= 2) ) -# include // __m128i -# if defined(__SSE4_1__) -# include -# endif // defined(__SSE4_1__) -# include // __m128 -# undef BX_FLOAT4_SSE -# define BX_FLOAT4_SSE 1 - -namespace bx -{ - typedef __m128 float4_sse_t; - -} // namespace bx - -#elif defined(__ARM_NEON__) && !BX_COMPILER_CLANG -# include -# undef BX_FLOAT4_NEON -# define BX_FLOAT4_NEON 1 - -namespace bx -{ - typedef float32x4_t float4_neon_t; - -} // namespace bx - -#elif BX_COMPILER_CLANG \ - && !BX_PLATFORM_EMSCRIPTEN \ - && !BX_PLATFORM_IOS \ - && BX_CLANG_HAS_EXTENSION(attribute_ext_vector_type) -# include -# undef BX_FLOAT4_LANGEXT -# define BX_FLOAT4_LANGEXT 1 - -namespace bx -{ - union float4_langext_t - { - float __attribute__((vector_size(16))) vf; - int32_t __attribute__((vector_size(16))) vi; - uint32_t __attribute__((vector_size(16))) vu; - float fxyzw[4]; - int32_t ixyzw[4]; - uint32_t uxyzw[4]; - - }; -} // namespace bx -#endif // - -namespace bx -{ - union float4_ref_t - { - float fxyzw[4]; - int32_t ixyzw[4]; - uint32_t uxyzw[4]; - - }; -} // namespace bx - -namespace bx -{ -#define ELEMx 0 -#define ELEMy 1 -#define ELEMz 2 -#define ELEMw 3 -#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ - template \ - BX_FLOAT4_FORCE_INLINE Ty float4_swiz_##_x##_y##_z##_w(Ty _a); -#include "float4_swizzle.inl" - -#undef IMPLEMENT_SWIZZLE -#undef ELEMw -#undef ELEMz -#undef ELEMy -#undef ELEMx - -#define IMPLEMENT_TEST(_xyzw) \ - template \ - BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(Ty _test); \ - \ - template \ - BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(Ty _test) - -IMPLEMENT_TEST(x ); -IMPLEMENT_TEST(y ); -IMPLEMENT_TEST(xy ); -IMPLEMENT_TEST(z ); -IMPLEMENT_TEST(xz ); -IMPLEMENT_TEST(yz ); -IMPLEMENT_TEST(xyz ); -IMPLEMENT_TEST(w ); -IMPLEMENT_TEST(xw ); -IMPLEMENT_TEST(yw ); -IMPLEMENT_TEST(xyw ); -IMPLEMENT_TEST(zw ); -IMPLEMENT_TEST(xzw ); -IMPLEMENT_TEST(yzw ); -IMPLEMENT_TEST(xyzw); 
-#undef IMPLEMENT_TEST - - template - BX_FLOAT4_FORCE_INLINE Ty float4_shuf_xyAB(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_shuf_ABxy(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_shuf_CDzw(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_shuf_zwCD(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_shuf_xAyB(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_shuf_yBxA(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_shuf_zCwD(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_shuf_CzDw(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE float float4_x(Ty _a); - - template - BX_FLOAT4_FORCE_INLINE float float4_y(Ty _a); - - template - BX_FLOAT4_FORCE_INLINE float float4_z(Ty _a); - - template - BX_FLOAT4_FORCE_INLINE float float4_w(Ty _a); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_ld(const void* _ptr); - - template - BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, Ty _a); - - template - BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, Ty _a); - - template - BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, Ty _a); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_ld(float _x, float _y, float _z, float _w); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_splat(const void* _ptr); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_splat(float _a); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_isplat(uint32_t _a); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_zero(); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_itof(Ty _a); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_ftoi(Ty _a); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_round(Ty _a); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_add(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_sub(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_mul(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_div(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_rcp_est(Ty _a); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_sqrt(Ty _a); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_rsqrt_est(Ty _a); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_dot3(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_dot(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_cmpeq(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_cmplt(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_cmple(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_cmpgt(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_cmpge(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_min(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_max(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_and(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_andc(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_or(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_xor(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_sll(Ty _a, int _count); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_srl(Ty _a, int _count); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_sra(Ty _a, int _count); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_icmpeq(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_icmplt(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_icmpgt(Ty _a, Ty 
_b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_imin(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_imax(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_iadd(Ty _a, Ty _b); - - template - BX_FLOAT4_FORCE_INLINE Ty float4_isub(Ty _a, Ty _b); - - template - BX_FLOAT4_INLINE Ty float4_shuf_xAzC(Ty _a, Ty _b); - - template - BX_FLOAT4_INLINE Ty float4_shuf_yBwD(Ty _a, Ty _b); - - template - BX_FLOAT4_INLINE Ty float4_rcp(Ty _a); - - template - BX_FLOAT4_INLINE Ty float4_orx(Ty _a); - - template - BX_FLOAT4_INLINE Ty float4_orc(Ty _a, Ty _b); - - template - BX_FLOAT4_INLINE Ty float4_neg(Ty _a); - - template - BX_FLOAT4_INLINE Ty float4_madd(Ty _a, Ty _b, Ty _c); - - template - BX_FLOAT4_INLINE Ty float4_nmsub(Ty _a, Ty _b, Ty _c); - - template - BX_FLOAT4_INLINE Ty float4_div_nr(Ty _a, Ty _b); - - template - BX_FLOAT4_INLINE Ty float4_selb(Ty _mask, Ty _a, Ty _b); - - template - BX_FLOAT4_INLINE Ty float4_sels(Ty _test, Ty _a, Ty _b); - - template - BX_FLOAT4_INLINE Ty float4_not(Ty _a); - - template - BX_FLOAT4_INLINE Ty float4_abs(Ty _a); - - template - BX_FLOAT4_INLINE Ty float4_clamp(Ty _a, Ty _min, Ty _max); - - template - BX_FLOAT4_INLINE Ty float4_lerp(Ty _a, Ty _b, Ty _s); - - template - BX_FLOAT4_INLINE Ty float4_rsqrt(Ty _a); - - template - BX_FLOAT4_INLINE Ty float4_rsqrt_nr(Ty _a); - - template - BX_FLOAT4_INLINE Ty float4_rsqrt_carmack(Ty _a); - - template - BX_FLOAT4_INLINE Ty float4_sqrt_nr(Ty _a); - - template - BX_FLOAT4_INLINE Ty float4_log2(Ty _a); - - template - BX_FLOAT4_INLINE Ty float4_exp2(Ty _a); - - template - BX_FLOAT4_INLINE Ty float4_pow(Ty _a, Ty _b); - - template - BX_FLOAT4_INLINE Ty float4_cross3(Ty _a, Ty _b); - - template - BX_FLOAT4_INLINE Ty float4_normalize3(Ty _a); - - template - BX_FLOAT4_INLINE Ty float4_ceil(Ty _a); - - template - BX_FLOAT4_INLINE Ty float4_floor(Ty _a); - -} // namespace bx - -#if BX_FLOAT4_SSE -# include "float4_sse.h" -#endif // BX_FLOAT4_SSE - -#if BX_FLOAT4_NEON -# include "float4_neon.h" -#endif // BX_FLOAT4_NEON - -#if BX_FLOAT4_LANGEXT -# include "float4_langext.h" -#endif // BX_FLOAT4_LANGEXT - -#if !( BX_FLOAT4_SSE \ - || BX_FLOAT4_AVX \ - || BX_FLOAT4_NEON \ - || BX_FLOAT4_LANGEXT \ - ) -# ifndef BX_FLOAT4_WARN_REFERENCE_IMPL -# define BX_FLOAT4_WARN_REFERENCE_IMPL 0 -# endif // BX_FLOAT4_WARN_REFERENCE_IMPL - -# if BX_FLOAT4_WARN_REFERENCE_IMPL -# pragma message("************************************\nUsing SIMD reference implementation!\n************************************") -# endif // BX_FLOAT4_WARN_REFERENCE_IMPL - -namespace bx -{ - typedef float4_ref_t float4_t; -} -#endif // - -#include "float4_ref.h" - -namespace bx -{ - BX_FLOAT4_FORCE_INLINE float4_t float4_zero() - { - return float4_zero(); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_ld(const void* _ptr) - { - return float4_ld(_ptr); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w) - { - return float4_ld(_x, _y, _z, _w); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) - { - return float4_ild(_x, _y, _z, _w); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_splat(const void* _ptr) - { - return float4_splat(_ptr); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_splat(float _a) - { - return float4_splat(_a); - } - - BX_FLOAT4_FORCE_INLINE float4_t float4_isplat(uint32_t _a) - { - return float4_isplat(_a); - } -} - -#endif // BX_FLOAT4_T_H_HEADER_GUARD diff --git a/include/bx/float4x4_t.h b/include/bx/float4x4_t.h index e1bc4e1..3339e0a 
100644 --- a/include/bx/float4x4_t.h +++ b/include/bx/float4x4_t.h @@ -3,156 +3,156 @@ * License: https://github.com/bkaradzic/bx#license-bsd-2-clause */ -#ifndef BX_FLOAT4X4_H_HEADER_GUARD -#define BX_FLOAT4X4_H_HEADER_GUARD +#ifndef BX_SIMDX4_H_HEADER_GUARD +#define BX_SIMDX4_H_HEADER_GUARD -#include "float4_t.h" +#include "simd_t.h" namespace bx { BX_ALIGN_DECL_16(struct) float4x4_t { - float4_t col[4]; + simd128_t col[4]; }; - BX_FLOAT4_FORCE_INLINE float4_t float4_mul_xyz1(float4_t _a, const float4x4_t* _b) + BX_SIMD_FORCE_INLINE simd128_t simd_mul_xyz1(simd128_t _a, const float4x4_t* _b) { - const float4_t xxxx = float4_swiz_xxxx(_a); - const float4_t yyyy = float4_swiz_yyyy(_a); - const float4_t zzzz = float4_swiz_zzzz(_a); - const float4_t col0 = float4_mul(_b->col[0], xxxx); - const float4_t col1 = float4_mul(_b->col[1], yyyy); - const float4_t col2 = float4_madd(_b->col[2], zzzz, col0); - const float4_t col3 = float4_add(_b->col[3], col1); - const float4_t result = float4_add(col2, col3); + const simd128_t xxxx = simd_swiz_xxxx(_a); + const simd128_t yyyy = simd_swiz_yyyy(_a); + const simd128_t zzzz = simd_swiz_zzzz(_a); + const simd128_t col0 = simd_mul(_b->col[0], xxxx); + const simd128_t col1 = simd_mul(_b->col[1], yyyy); + const simd128_t col2 = simd_madd(_b->col[2], zzzz, col0); + const simd128_t col3 = simd_add(_b->col[3], col1); + const simd128_t result = simd_add(col2, col3); return result; } - BX_FLOAT4_FORCE_INLINE float4_t float4_mul(float4_t _a, const float4x4_t* _b) + BX_SIMD_FORCE_INLINE simd128_t simd_mul(simd128_t _a, const float4x4_t* _b) { - const float4_t xxxx = float4_swiz_xxxx(_a); - const float4_t yyyy = float4_swiz_yyyy(_a); - const float4_t zzzz = float4_swiz_zzzz(_a); - const float4_t wwww = float4_swiz_wwww(_a); - const float4_t col0 = float4_mul(_b->col[0], xxxx); - const float4_t col1 = float4_mul(_b->col[1], yyyy); - const float4_t col2 = float4_madd(_b->col[2], zzzz, col0); - const float4_t col3 = float4_madd(_b->col[3], wwww, col1); - const float4_t result = float4_add(col2, col3); + const simd128_t xxxx = simd_swiz_xxxx(_a); + const simd128_t yyyy = simd_swiz_yyyy(_a); + const simd128_t zzzz = simd_swiz_zzzz(_a); + const simd128_t wwww = simd_swiz_wwww(_a); + const simd128_t col0 = simd_mul(_b->col[0], xxxx); + const simd128_t col1 = simd_mul(_b->col[1], yyyy); + const simd128_t col2 = simd_madd(_b->col[2], zzzz, col0); + const simd128_t col3 = simd_madd(_b->col[3], wwww, col1); + const simd128_t result = simd_add(col2, col3); return result; } - BX_FLOAT4_INLINE void float4x4_mul(float4x4_t* __restrict _result, const float4x4_t* __restrict _a, const float4x4_t* __restrict _b) + BX_SIMD_INLINE void float4x4_mul(float4x4_t* __restrict _result, const float4x4_t* __restrict _a, const float4x4_t* __restrict _b) { - _result->col[0] = float4_mul(_a->col[0], _b); - _result->col[1] = float4_mul(_a->col[1], _b); - _result->col[2] = float4_mul(_a->col[2], _b); - _result->col[3] = float4_mul(_a->col[3], _b); + _result->col[0] = simd_mul(_a->col[0], _b); + _result->col[1] = simd_mul(_a->col[1], _b); + _result->col[2] = simd_mul(_a->col[2], _b); + _result->col[3] = simd_mul(_a->col[3], _b); } - BX_FLOAT4_FORCE_INLINE void float4x4_transpose(float4x4_t* __restrict _result, const float4x4_t* __restrict _mtx) + BX_SIMD_FORCE_INLINE void float4x4_transpose(float4x4_t* __restrict _result, const float4x4_t* __restrict _mtx) { - const float4_t aibj = float4_shuf_xAyB(_mtx->col[0], _mtx->col[2]); // aibj - const float4_t emfn = float4_shuf_xAyB(_mtx->col[1], 
_mtx->col[3]); // emfn - const float4_t ckdl = float4_shuf_zCwD(_mtx->col[0], _mtx->col[2]); // ckdl - const float4_t gohp = float4_shuf_zCwD(_mtx->col[1], _mtx->col[3]); // gohp - _result->col[0] = float4_shuf_xAyB(aibj, emfn); // aeim - _result->col[1] = float4_shuf_zCwD(aibj, emfn); // bfjn - _result->col[2] = float4_shuf_xAyB(ckdl, gohp); // cgko - _result->col[3] = float4_shuf_zCwD(ckdl, gohp); // dhlp + const simd128_t aibj = simd_shuf_xAyB(_mtx->col[0], _mtx->col[2]); // aibj + const simd128_t emfn = simd_shuf_xAyB(_mtx->col[1], _mtx->col[3]); // emfn + const simd128_t ckdl = simd_shuf_zCwD(_mtx->col[0], _mtx->col[2]); // ckdl + const simd128_t gohp = simd_shuf_zCwD(_mtx->col[1], _mtx->col[3]); // gohp + _result->col[0] = simd_shuf_xAyB(aibj, emfn); // aeim + _result->col[1] = simd_shuf_zCwD(aibj, emfn); // bfjn + _result->col[2] = simd_shuf_xAyB(ckdl, gohp); // cgko + _result->col[3] = simd_shuf_zCwD(ckdl, gohp); // dhlp } - BX_FLOAT4_INLINE void float4x4_inverse(float4x4_t* __restrict _result, const float4x4_t* __restrict _a) + BX_SIMD_INLINE void float4x4_inverse(float4x4_t* __restrict _result, const float4x4_t* __restrict _a) { - const float4_t tmp0 = float4_shuf_xAzC(_a->col[0], _a->col[1]); - const float4_t tmp1 = float4_shuf_xAzC(_a->col[2], _a->col[3]); - const float4_t tmp2 = float4_shuf_yBwD(_a->col[0], _a->col[1]); - const float4_t tmp3 = float4_shuf_yBwD(_a->col[2], _a->col[3]); - const float4_t t0 = float4_shuf_xyAB(tmp0, tmp1); - const float4_t t1 = float4_shuf_xyAB(tmp3, tmp2); - const float4_t t2 = float4_shuf_zwCD(tmp0, tmp1); - const float4_t t3 = float4_shuf_zwCD(tmp3, tmp2); + const simd128_t tmp0 = simd_shuf_xAzC(_a->col[0], _a->col[1]); + const simd128_t tmp1 = simd_shuf_xAzC(_a->col[2], _a->col[3]); + const simd128_t tmp2 = simd_shuf_yBwD(_a->col[0], _a->col[1]); + const simd128_t tmp3 = simd_shuf_yBwD(_a->col[2], _a->col[3]); + const simd128_t t0 = simd_shuf_xyAB(tmp0, tmp1); + const simd128_t t1 = simd_shuf_xyAB(tmp3, tmp2); + const simd128_t t2 = simd_shuf_zwCD(tmp0, tmp1); + const simd128_t t3 = simd_shuf_zwCD(tmp3, tmp2); - const float4_t t23 = float4_mul(t2, t3); - const float4_t t23_yxwz = float4_swiz_yxwz(t23); - const float4_t t23_wzyx = float4_swiz_wzyx(t23); + const simd128_t t23 = simd_mul(t2, t3); + const simd128_t t23_yxwz = simd_swiz_yxwz(t23); + const simd128_t t23_wzyx = simd_swiz_wzyx(t23); - float4_t cof0, cof1, cof2, cof3; + simd128_t cof0, cof1, cof2, cof3; - const float4_t zero = float4_zero(); - cof0 = float4_nmsub(t1, t23_yxwz, zero); - cof0 = float4_madd(t1, t23_wzyx, cof0); + const simd128_t zero = simd_zero(); + cof0 = simd_nmsub(t1, t23_yxwz, zero); + cof0 = simd_madd(t1, t23_wzyx, cof0); - cof1 = float4_nmsub(t0, t23_yxwz, zero); - cof1 = float4_madd(t0, t23_wzyx, cof1); - cof1 = float4_swiz_zwxy(cof1); - - const float4_t t12 = float4_mul(t1, t2); - const float4_t t12_yxwz = float4_swiz_yxwz(t12); - const float4_t t12_wzyx = float4_swiz_wzyx(t12); - - cof0 = float4_madd(t3, t12_yxwz, cof0); - cof0 = float4_nmsub(t3, t12_wzyx, cof0); + cof1 = simd_nmsub(t0, t23_yxwz, zero); + cof1 = simd_madd(t0, t23_wzyx, cof1); + cof1 = simd_swiz_zwxy(cof1); - cof3 = float4_mul(t0, t12_yxwz); - cof3 = float4_nmsub(t0, t12_wzyx, cof3); - cof3 = float4_swiz_zwxy(cof3); + const simd128_t t12 = simd_mul(t1, t2); + const simd128_t t12_yxwz = simd_swiz_yxwz(t12); + const simd128_t t12_wzyx = simd_swiz_wzyx(t12); - const float4_t t1_zwxy = float4_swiz_zwxy(t1); - const float4_t t2_zwxy = float4_swiz_zwxy(t2); + cof0 = simd_madd(t3, t12_yxwz, cof0); + 
cof0 = simd_nmsub(t3, t12_wzyx, cof0); - const float4_t t13 = float4_mul(t1_zwxy, t3); - const float4_t t13_yxwz = float4_swiz_yxwz(t13); - const float4_t t13_wzyx = float4_swiz_wzyx(t13); + cof3 = simd_mul(t0, t12_yxwz); + cof3 = simd_nmsub(t0, t12_wzyx, cof3); + cof3 = simd_swiz_zwxy(cof3); - cof0 = float4_madd(t2_zwxy, t13_yxwz, cof0); - cof0 = float4_nmsub(t2_zwxy, t13_wzyx, cof0); + const simd128_t t1_zwxy = simd_swiz_zwxy(t1); + const simd128_t t2_zwxy = simd_swiz_zwxy(t2); - cof2 = float4_mul(t0, t13_yxwz); - cof2 = float4_nmsub(t0, t13_wzyx, cof2); - cof2 = float4_swiz_zwxy(cof2); + const simd128_t t13 = simd_mul(t1_zwxy, t3); + const simd128_t t13_yxwz = simd_swiz_yxwz(t13); + const simd128_t t13_wzyx = simd_swiz_wzyx(t13); - const float4_t t01 = float4_mul(t0, t1); - const float4_t t01_yxwz = float4_swiz_yxwz(t01); - const float4_t t01_wzyx = float4_swiz_wzyx(t01); + cof0 = simd_madd(t2_zwxy, t13_yxwz, cof0); + cof0 = simd_nmsub(t2_zwxy, t13_wzyx, cof0); - cof2 = float4_nmsub(t3, t01_yxwz, cof2); - cof2 = float4_madd(t3, t01_wzyx, cof2); + cof2 = simd_mul(t0, t13_yxwz); + cof2 = simd_nmsub(t0, t13_wzyx, cof2); + cof2 = simd_swiz_zwxy(cof2); - cof3 = float4_madd(t2_zwxy, t01_yxwz, cof3); - cof3 = float4_nmsub(t2_zwxy, t01_wzyx, cof3); + const simd128_t t01 = simd_mul(t0, t1); + const simd128_t t01_yxwz = simd_swiz_yxwz(t01); + const simd128_t t01_wzyx = simd_swiz_wzyx(t01); - const float4_t t03 = float4_mul(t0, t3); - const float4_t t03_yxwz = float4_swiz_yxwz(t03); - const float4_t t03_wzyx = float4_swiz_wzyx(t03); + cof2 = simd_nmsub(t3, t01_yxwz, cof2); + cof2 = simd_madd(t3, t01_wzyx, cof2); - cof1 = float4_nmsub(t2_zwxy, t03_yxwz, cof1); - cof1 = float4_madd(t2_zwxy, t03_wzyx, cof1); + cof3 = simd_madd(t2_zwxy, t01_yxwz, cof3); + cof3 = simd_nmsub(t2_zwxy, t01_wzyx, cof3); - cof2 = float4_madd(t1, t03_yxwz, cof2); - cof2 = float4_nmsub(t1, t03_wzyx, cof2); + const simd128_t t03 = simd_mul(t0, t3); + const simd128_t t03_yxwz = simd_swiz_yxwz(t03); + const simd128_t t03_wzyx = simd_swiz_wzyx(t03); - const float4_t t02 = float4_mul(t0, t2_zwxy); - const float4_t t02_yxwz = float4_swiz_yxwz(t02); - const float4_t t02_wzyx = float4_swiz_wzyx(t02); + cof1 = simd_nmsub(t2_zwxy, t03_yxwz, cof1); + cof1 = simd_madd(t2_zwxy, t03_wzyx, cof1); - cof1 = float4_madd(t3, t02_yxwz, cof1); - cof1 = float4_nmsub(t3, t02_wzyx, cof1); + cof2 = simd_madd(t1, t03_yxwz, cof2); + cof2 = simd_nmsub(t1, t03_wzyx, cof2); - cof3 = float4_nmsub(t1, t02_yxwz, cof3); - cof3 = float4_madd(t1, t02_wzyx, cof3); + const simd128_t t02 = simd_mul(t0, t2_zwxy); + const simd128_t t02_yxwz = simd_swiz_yxwz(t02); + const simd128_t t02_wzyx = simd_swiz_wzyx(t02); - const float4_t det = float4_dot(t0, cof0); - const float4_t invdet = float4_rcp(det); + cof1 = simd_madd(t3, t02_yxwz, cof1); + cof1 = simd_nmsub(t3, t02_wzyx, cof1); - _result->col[0] = float4_mul(cof0, invdet); - _result->col[1] = float4_mul(cof1, invdet); - _result->col[2] = float4_mul(cof2, invdet); - _result->col[3] = float4_mul(cof3, invdet); + cof3 = simd_nmsub(t1, t02_yxwz, cof3); + cof3 = simd_madd(t1, t02_wzyx, cof3); + + const simd128_t det = simd_dot(t0, cof0); + const simd128_t invdet = simd_rcp(det); + + _result->col[0] = simd_mul(cof0, invdet); + _result->col[1] = simd_mul(cof1, invdet); + _result->col[2] = simd_mul(cof2, invdet); + _result->col[3] = simd_mul(cof3, invdet); } } // namespace bx -#endif // BX_FLOAT4X4_H_HEADER_GUARD +#endif // BX_SIMDX4_H_HEADER_GUARD diff --git a/include/bx/float4_langext.h b/include/bx/simd128_langext.inl 
similarity index 53% rename from include/bx/float4_langext.h rename to include/bx/simd128_langext.inl index 1e1c2ba..4557a56 100644 --- a/include/bx/float4_langext.h +++ b/include/bx/simd128_langext.inl @@ -3,8 +3,8 @@ * License: https://github.com/bkaradzic/bx#license-bsd-2-clause */ -#ifndef BX_FLOAT4_LANGEXT_H_HEADER_GUARD -#define BX_FLOAT4_LANGEXT_H_HEADER_GUARD +#ifndef BX_SIMD_LANGEXT_H_HEADER_GUARD +#define BX_SIMD_LANGEXT_H_HEADER_GUARD #define float4_rcp float4_rcp_ni #define float4_orx float4_orx_ni @@ -37,7 +37,8 @@ #define float4_max float4_max_ni #define float4_imin float4_imin_ni #define float4_imax float4_imax_ni -#include "float4_ni.h" + +#include "simd_ni.inl" namespace bx { @@ -47,9 +48,9 @@ namespace bx #define ELEMw 3 #define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ template<> \ - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_swiz_##_x##_y##_z##_w(float4_langext_t _a) \ + BX_SIMD_FORCE_INLINE simd_langext_t float4_swiz_##_x##_y##_z##_w(simd_langext_t _a) \ { \ - float4_langext_t result; \ + simd_langext_t result; \ result.vf = __builtin_shufflevector(_a.vf, _a.vf, ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w); \ return result; \ } @@ -64,7 +65,7 @@ namespace bx #define IMPLEMENT_TEST(_xyzw, _mask) \ template<> \ - BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_langext_t _test) \ + BX_SIMD_FORCE_INLINE bool simd_test_any_##_xyzw(simd_langext_t _test) \ { \ uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \ | ( (_test.uxyzw[2]>>31)<<2) \ @@ -75,7 +76,7 @@ namespace bx } \ \ template<> \ - BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_langext_t _test) \ + BX_SIMD_FORCE_INLINE bool simd_test_all_##_xyzw(simd_langext_t _test) \ { \ uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \ | ( (_test.uxyzw[2]>>31)<<2) \ @@ -104,114 +105,114 @@ IMPLEMENT_TEST(xyzw , 0xf); #undef IMPLEMENT_TEST template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_shuf_xyAB(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_shuf_xyAB(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 1, 4, 5); return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_shuf_ABxy(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_shuf_ABxy(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vf = __builtin_shufflevector(_a.vf, _b.vf, 4, 5, 0, 1); return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_shuf_CDzw(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_shuf_CDzw(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vf = __builtin_shufflevector(_a.vf, _b.vf, 6, 7, 2, 3); return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_shuf_zwCD(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_shuf_zwCD(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vf = __builtin_shufflevector(_a.vf, _b.vf, 2, 3, 6, 7); return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_shuf_xAyB(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_shuf_xAyB(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 4, 1, 5); return result; } template<> - 
BX_FLOAT4_FORCE_INLINE float4_langext_t float4_shuf_yBxA(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_shuf_yBxA(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vf = __builtin_shufflevector(_a.vf, _b.vf, 1, 5, 0, 4); return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_shuf_zCwD(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_shuf_zCwD(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vf = __builtin_shufflevector(_a.vf, _b.vf, 2, 6, 3, 7); return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_shuf_CzDw(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_shuf_CzDw(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vf = __builtin_shufflevector(_a.vf, _b.vf, 6, 2, 7, 3); return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_shuf_xAzC(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_shuf_xAzC(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 4, 2, 6); return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_shuf_yBwD(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_shuf_yBwD(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vf = __builtin_shufflevector(_a.vf, _b.vf, 1, 5, 3, 7); return result; } template<> - BX_FLOAT4_FORCE_INLINE float float4_x(float4_langext_t _a) + BX_SIMD_FORCE_INLINE float float4_x(simd_langext_t _a) { return _a.fxyzw[0]; } template<> - BX_FLOAT4_FORCE_INLINE float float4_y(float4_langext_t _a) + BX_SIMD_FORCE_INLINE float float4_y(simd_langext_t _a) { return _a.fxyzw[1]; } template<> - BX_FLOAT4_FORCE_INLINE float float4_z(float4_langext_t _a) + BX_SIMD_FORCE_INLINE float float4_z(simd_langext_t _a) { return _a.fxyzw[2]; } template<> - BX_FLOAT4_FORCE_INLINE float float4_w(float4_langext_t _a) + BX_SIMD_FORCE_INLINE float float4_w(simd_langext_t _a) { return _a.fxyzw[3]; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_ld(const void* _ptr) + BX_SIMD_FORCE_INLINE simd_langext_t float4_ld(const void* _ptr) { const uint32_t* input = reinterpret_cast(_ptr); - float4_langext_t result; + simd_langext_t result; result.uxyzw[0] = input[0]; result.uxyzw[1] = input[1]; result.uxyzw[2] = input[2]; @@ -220,7 +221,7 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_langext_t _a) + BX_SIMD_FORCE_INLINE void float4_st(void* _ptr, simd_langext_t _a) { uint32_t* result = reinterpret_cast(_ptr); result[0] = _a.uxyzw[0]; @@ -230,14 +231,14 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_langext_t _a) + BX_SIMD_FORCE_INLINE void float4_stx(void* _ptr, simd_langext_t _a) { uint32_t* result = reinterpret_cast(_ptr); result[0] = _a.uxyzw[0]; } template<> - BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_langext_t _a) + BX_SIMD_FORCE_INLINE void float4_stream(void* _ptr, simd_langext_t _a) { uint32_t* result = reinterpret_cast(_ptr); result[0] = _a.uxyzw[0]; @@ -247,109 +248,109 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_ld(float _x, float _y, float 
_z, float _w) + BX_SIMD_FORCE_INLINE simd_langext_t float4_ld(float _x, float _y, float _z, float _w) { - float4_langext_t result; + simd_langext_t result; result.vf = (float __attribute__((vector_size(16)))){ _x, _y, _z, _w }; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) + BX_SIMD_FORCE_INLINE simd_langext_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) { - float4_langext_t result; + simd_langext_t result; result.vu = (uint32_t __attribute__((vector_size(16)))){ _x, _y, _z, _w }; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_splat(const void* _ptr) + BX_SIMD_FORCE_INLINE simd_langext_t float4_splat(const void* _ptr) { const uint32_t val = *reinterpret_cast(_ptr); - float4_langext_t result; + simd_langext_t result; result.vu = (uint32_t __attribute__((vector_size(16)))){ val, val, val, val }; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_splat(float _a) + BX_SIMD_FORCE_INLINE simd_langext_t float4_splat(float _a) { - return float4_ld(_a, _a, _a, _a); + return float4_ld(_a, _a, _a, _a); } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_isplat(uint32_t _a) + BX_SIMD_FORCE_INLINE simd_langext_t float4_isplat(uint32_t _a) { - return float4_ild(_a, _a, _a, _a); + return float4_ild(_a, _a, _a, _a); } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_zero() + BX_SIMD_FORCE_INLINE simd_langext_t float4_zero() { - return float4_ild(0, 0, 0, 0); + return float4_ild(0, 0, 0, 0); } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_itof(float4_langext_t _a) + BX_SIMD_FORCE_INLINE simd_langext_t float4_itof(simd_langext_t _a) { - float4_langext_t result; + simd_langext_t result; result.vf = __builtin_convertvector(_a.vi, float __attribute__((vector_size(16))) ); return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_ftoi(float4_langext_t _a) + BX_SIMD_FORCE_INLINE simd_langext_t float4_ftoi(simd_langext_t _a) { - float4_langext_t result; + simd_langext_t result; result.vi = __builtin_convertvector(_a.vf, int32_t __attribute__((vector_size(16))) ); return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_round(float4_langext_t _a) + BX_SIMD_FORCE_INLINE simd_langext_t float4_round(simd_langext_t _a) { - const float4_langext_t tmp = float4_ftoi(_a); - const float4_langext_t result = float4_itof(tmp); + const simd_langext_t tmp = float4_ftoi(_a); + const simd_langext_t result = float4_itof(tmp); return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_add(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_add(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vf = _a.vf + _b.vf; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_sub(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_sub(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vf = _a.vf - _b.vf; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_mul(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_mul(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vf = _a.vf * _b.vf; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_div(float4_langext_t _a, float4_langext_t 
_b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_div(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vf = _a.vf / _b.vf; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_sqrt(float4_langext_t _a) + BX_SIMD_FORCE_INLINE simd_langext_t float4_sqrt(simd_langext_t _a) { - float4_langext_t result; + simd_langext_t result; result.vf[0] = sqrtf(_a.vf[0]); result.vf[1] = sqrtf(_a.vf[1]); result.vf[2] = sqrtf(_a.vf[2]); @@ -358,9 +359,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_rsqrt_est(float4_langext_t _a) + BX_SIMD_FORCE_INLINE simd_langext_t float4_rsqrt_est(simd_langext_t _a) { - float4_langext_t result; + simd_langext_t result; result.vf[0] = 1.0f / sqrtf(_a.vf[0]); result.vf[1] = 1.0f / sqrtf(_a.vf[1]); result.vf[2] = 1.0f / sqrtf(_a.vf[2]); @@ -369,146 +370,146 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_cmpeq(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_cmpeq(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vi = _a.vf == _b.vf; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_cmplt(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_cmplt(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vi = _a.vf < _b.vf; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_cmple(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_cmple(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vi = _a.vf <= _b.vf; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_cmpgt(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_cmpgt(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vi = _a.vf > _b.vf; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_cmpge(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_cmpge(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vi = _a.vf >= _b.vf; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_and(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_and(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vu = _a.vu & _b.vu; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_andc(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_andc(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vu = _a.vu & ~_b.vu; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_or(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_or(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vu = _a.vu | _b.vu; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_xor(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_xor(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vu = _a.vu ^ 
_b.vu; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_sll(float4_langext_t _a, int _count) + BX_SIMD_FORCE_INLINE simd_langext_t float4_sll(simd_langext_t _a, int _count) { - float4_langext_t result; - const float4_langext_t count = float4_isplat(_count); + simd_langext_t result; + const simd_langext_t count = float4_isplat(_count); result.vu = _a.vu << count.vi; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_srl(float4_langext_t _a, int _count) + BX_SIMD_FORCE_INLINE simd_langext_t float4_srl(simd_langext_t _a, int _count) { - float4_langext_t result; - const float4_langext_t count = float4_isplat(_count); + simd_langext_t result; + const simd_langext_t count = float4_isplat(_count); result.vu = _a.vu >> count.vi; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_sra(float4_langext_t _a, int _count) + BX_SIMD_FORCE_INLINE simd_langext_t float4_sra(simd_langext_t _a, int _count) { - float4_langext_t result; - const float4_langext_t count = float4_isplat(_count); + simd_langext_t result; + const simd_langext_t count = float4_isplat(_count); result.vi = _a.vi >> count.vi; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_icmpeq(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_icmpeq(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vi = _a.vi == _b.vi; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_icmplt(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_icmplt(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vi = _a.vi < _b.vi; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_icmpgt(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_icmpgt(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vi = _a.vi > _b.vi; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_iadd(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_iadd(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vi = _a.vi + _b.vi; return result; } template<> - BX_FLOAT4_FORCE_INLINE float4_langext_t float4_isub(float4_langext_t _a, float4_langext_t _b) + BX_SIMD_FORCE_INLINE simd_langext_t float4_isub(simd_langext_t _a, simd_langext_t _b) { - float4_langext_t result; + simd_langext_t result; result.vi = _a.vi - _b.vi; return result; } - typedef float4_langext_t float4_t; + typedef simd_langext_t simd128_t; } // namespace bx -#endif // BX_FLOAT4_LANGEXT_H_HEADER_GUARD +#endif // BX_SIMD_LANGEXT_H_HEADER_GUARD diff --git a/include/bx/simd128_neon.inl b/include/bx/simd128_neon.inl new file mode 100644 index 0000000..4f29bc3 --- /dev/null +++ b/include/bx/simd128_neon.inl @@ -0,0 +1,562 @@ +/* + * Copyright 2010-2016 Branimir Karadzic. All rights reserved. 
+ * License: https://github.com/bkaradzic/bx#license-bsd-2-clause + */ + +#ifndef BX_SIMD_NEON_H_HEADER_GUARD +#define BX_SIMD_NEON_H_HEADER_GUARD + +#define simd_rcp simd_rcp_ni +#define simd_orx simd_orx_ni +#define simd_orc simd_orc_ni +#define simd_neg simd_neg_ni +#define simd_madd simd_madd_ni +#define simd_nmsub simd_nmsub_ni +#define simd_div_nr simd_div_nr_ni +#define simd_div simd_div_nr_ni +#define simd_selb simd_selb_ni +#define simd_sels simd_sels_ni +#define simd_not simd_not_ni +#define simd_abs simd_abs_ni +#define simd_clamp simd_clamp_ni +#define simd_lerp simd_lerp_ni +#define simd_rsqrt simd_rsqrt_ni +#define simd_rsqrt_nr simd_rsqrt_nr_ni +#define simd_rsqrt_carmack simd_rsqrt_carmack_ni +#define simd_sqrt_nr simd_sqrt_nr_ni +#define simd_sqrt simd_sqrt_nr_ni +#define simd_log2 simd_log2_ni +#define simd_exp2 simd_exp2_ni +#define simd_pow simd_pow_ni +#define simd_cross3 simd_cross3_ni +#define simd_normalize3 simd_normalize3_ni +#define simd_dot3 simd_dot3_ni +#define simd_dot simd_dot_ni +#define simd_ceil simd_ceil_ni +#define simd_floor simd_floor_ni + +#include "simd_ni.inl" + +namespace bx +{ +#define ELEMx 0 +#define ELEMy 1 +#define ELEMz 2 +#define ELEMw 3 +#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ + template<> \ + BX_SIMD_FORCE_INLINE simd128_neon_t simd_swiz_##_x##_y##_z##_w(simd128_neon_t _a) \ + { \ + return __builtin_shuffle(_a, (uint32x4_t){ ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w }); \ + } + +#include "simd_swizzle.inl" + +#undef IMPLEMENT_SWIZZLE +#undef ELEMw +#undef ELEMz +#undef ELEMy +#undef ELEMx + +#define IMPLEMENT_TEST(_xyzw, _swizzle) \ + template<> \ + BX_SIMD_FORCE_INLINE bool simd_test_any_##_xyzw(simd128_neon_t _test) \ + { \ + const simd128_neon_t tmp0 = simd_swiz_##_swizzle(_test); \ + return simd_test_any_ni(tmp0); \ + } \ + \ + template<> \ + BX_SIMD_FORCE_INLINE bool simd_test_all_##_xyzw(simd128_neon_t _test) \ + { \ + const simd128_neon_t tmp0 = simd_swiz_##_swizzle(_test); \ + return simd_test_all_ni(tmp0); \ + } + +IMPLEMENT_TEST(x, xxxx); +IMPLEMENT_TEST(y, yyyy); +IMPLEMENT_TEST(xy, xyyy); +IMPLEMENT_TEST(z, zzzz); +IMPLEMENT_TEST(xz, xzzz); +IMPLEMENT_TEST(yz, yzzz); +IMPLEMENT_TEST(xyz, xyzz); +IMPLEMENT_TEST(w, wwww); +IMPLEMENT_TEST(xw, xwww); +IMPLEMENT_TEST(yw, ywww); +IMPLEMENT_TEST(xyw, xyww); +IMPLEMENT_TEST(zw, zwww); +IMPLEMENT_TEST(xzw, xzww); +IMPLEMENT_TEST(yzw, yzww); +#undef IMPLEMENT_TEST + + template<> + BX_SIMD_FORCE_INLINE bool simd_test_any_xyzw(simd128_neon_t _test) + { + return simd_test_any_ni(_test); + } + + template<> + BX_SIMD_FORCE_INLINE bool simd_test_all_xyzw(simd128_neon_t _test) + { + return simd_test_all_ni(_test); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_xyAB(simd128_neon_t _a, simd128_neon_t _b) + { + return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 1, 4, 5 }); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_ABxy(simd128_neon_t _a, simd128_neon_t _b) + { + return __builtin_shuffle(_a, _b, (uint32x4_t){ 4, 5, 0, 1 }); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_CDzw(simd128_neon_t _a, simd128_neon_t _b) + { + return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 7, 2, 3 }); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_zwCD(simd128_neon_t _a, simd128_neon_t _b) + { + return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 3, 6, 7 }); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_xAyB(simd128_neon_t _a, simd128_neon_t _b) + { + return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 4, 1, 5 }); + } + 
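+	// Naming note for the shuffle specializations in this file: lowercase x/y/z/w
+	// select lanes from the first operand and uppercase A/B/C/D select the matching
+	// lanes from the second operand, so the index sets above read as, e.g.,
+	// simd_shuf_xAyB(a, b) == { a.x, b.x, a.y, b.y }. A minimal sketch with
+	// hypothetical values (illustrative only, not part of the library):
+	//
+	//   const simd128_neon_t a = simd_ld<simd128_neon_t>(1.0f, 2.0f, 3.0f, 4.0f);
+	//   const simd128_neon_t b = simd_ld<simd128_neon_t>(5.0f, 6.0f, 7.0f, 8.0f);
+	//   const simd128_neon_t i = simd_shuf_xAyB(a, b); // 1.0, 5.0, 2.0, 6.0
+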
+ template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_yBxA(simd128_neon_t _a, simd128_neon_t _b) + { + return __builtin_shuffle(_a, _b, (uint32x4_t){ 1, 5, 0, 4 }); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_zCwD(simd128_neon_t _a, simd128_neon_t _b) + { + return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 6, 3, 7 }); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_CzDw(simd128_neon_t _a, simd128_neon_t _b) + { + return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 2, 7, 3 }); + } + + template<> + BX_SIMD_FORCE_INLINE float simd_x(simd128_neon_t _a) + { + return vgetq_lane_f32(_a, 0); + } + + template<> + BX_SIMD_FORCE_INLINE float simd_y(simd128_neon_t _a) + { + return vgetq_lane_f32(_a, 1); + } + + template<> + BX_SIMD_FORCE_INLINE float simd_z(simd128_neon_t _a) + { + return vgetq_lane_f32(_a, 2); + } + + template<> + BX_SIMD_FORCE_INLINE float simd_w(simd128_neon_t _a) + { + return vgetq_lane_f32(_a, 3); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_ld(const void* _ptr) + { + return vld1q_f32( (const float32_t*)_ptr); + } + + template<> + BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, simd128_neon_t _a) + { + vst1q_f32( (float32_t*)_ptr, _a); + } + + template<> + BX_SIMD_FORCE_INLINE void simd_stx(void* _ptr, simd128_neon_t _a) + { + vst1q_lane_f32( (float32_t*)_ptr, _a, 0); + } + + template<> + BX_SIMD_FORCE_INLINE void simd_stream(void* _ptr, simd128_neon_t _a) + { + vst1q_f32( (float32_t*)_ptr, _a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_ld(float _x, float _y, float _z, float _w) + { + const float32_t val[4] = {_x, _y, _z, _w}; + return simd_ld(val); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) + { + const uint32_t val[4] = {_x, _y, _z, _w}; + const uint32x4_t tmp = vld1q_u32(val); + const simd128_neon_t result = vreinterpretq_f32_u32(tmp); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_splat(const void* _ptr) + { + const simd128_neon_t tmp0 = vld1q_f32( (const float32_t*)_ptr); + const float32x2_t tmp1 = vget_low_f32(tmp0); + const simd128_neon_t result = vdupq_lane_f32(tmp1, 0); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_splat(float _a) + { + return vdupq_n_f32(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_isplat(uint32_t _a) + { + const int32x4_t tmp = vdupq_n_s32(_a); + const simd128_neon_t result = vreinterpretq_f32_s32(tmp); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_zero() + { + return simd_isplat(0); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_itof(simd128_neon_t _a) + { + const int32x4_t itof = vreinterpretq_s32_f32(_a); + const simd128_neon_t result = vcvtq_f32_s32(itof); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_ftoi(simd128_neon_t _a) + { + const int32x4_t ftoi = vcvtq_s32_f32(_a); + const simd128_neon_t result = vreinterpretq_f32_s32(ftoi); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_add(simd128_neon_t _a, simd128_neon_t _b) + { + return vaddq_f32(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_sub(simd128_neon_t _a, simd128_neon_t _b) + { + return vsubq_f32(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_mul(simd128_neon_t _a, simd128_neon_t _b) + { + return vmulq_f32(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t 
simd_rcp_est(simd128_neon_t _a) + { + return vrecpeq_f32(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_rsqrt_est(simd128_neon_t _a) + { + return vrsqrteq_f32(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmpeq(simd128_neon_t _a, simd128_neon_t _b) + { + const uint32x4_t tmp = vceqq_f32(_a, _b); + const simd128_neon_t result = vreinterpretq_f32_u32(tmp); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmplt(simd128_neon_t _a, simd128_neon_t _b) + { + const uint32x4_t tmp = vcltq_f32(_a, _b); + const simd128_neon_t result = vreinterpretq_f32_u32(tmp); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmple(simd128_neon_t _a, simd128_neon_t _b) + { + const uint32x4_t tmp = vcleq_f32(_a, _b); + const simd128_neon_t result = vreinterpretq_f32_u32(tmp); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmpgt(simd128_neon_t _a, simd128_neon_t _b) + { + const uint32x4_t tmp = vcgtq_f32(_a, _b); + const simd128_neon_t result = vreinterpretq_f32_u32(tmp); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmpge(simd128_neon_t _a, simd128_neon_t _b) + { + const uint32x4_t tmp = vcgeq_f32(_a, _b); + const simd128_neon_t result = vreinterpretq_f32_u32(tmp); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_min(simd128_neon_t _a, simd128_neon_t _b) + { + return vminq_f32(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_max(simd128_neon_t _a, simd128_neon_t _b) + { + return vmaxq_f32(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_and(simd128_neon_t _a, simd128_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const int32x4_t tmp2 = vandq_s32(tmp0, tmp1); + const simd128_neon_t result = vreinterpretq_f32_s32(tmp2); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_andc(simd128_neon_t _a, simd128_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const int32x4_t tmp2 = vbicq_s32(tmp0, tmp1); + const simd128_neon_t result = vreinterpretq_f32_s32(tmp2); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_or(simd128_neon_t _a, simd128_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const int32x4_t tmp2 = vorrq_s32(tmp0, tmp1); + const simd128_neon_t result = vreinterpretq_f32_s32(tmp2); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_xor(simd128_neon_t _a, simd128_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const int32x4_t tmp2 = veorq_s32(tmp0, tmp1); + const simd128_neon_t result = vreinterpretq_f32_s32(tmp2); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_sll(simd128_neon_t _a, int _count) + { + if (__builtin_constant_p(_count) ) + { + const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); + const uint32x4_t tmp1 = vshlq_n_u32(tmp0, _count); + const simd128_neon_t result = vreinterpretq_f32_u32(tmp1); + + return result; + } + + const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); + const int32x4_t shift = vdupq_n_s32(_count); + const uint32x4_t tmp1 = vshlq_u32(tmp0, shift); + const simd128_neon_t result = vreinterpretq_f32_u32(tmp1); + + return result; + } + + 
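+	// simd_srl and simd_sra below follow the same pattern as simd_sll above: the
+	// immediate intrinsics (vshlq_n_u32/vshrq_n_u32/vshrq_n_s32) require the count
+	// to be a compile-time constant, which the __builtin_constant_p branch covers;
+	// for runtime counts the count is splatted and the per-lane variable shift
+	// vshlq_u32/vshlq_s32 is used with a negated count, since NEON expresses a
+	// right shift as a left shift by a negative amount. Hedged usage sketch
+	// (hypothetical variable names):
+	//
+	//   const simd128_neon_t msb = simd_srl(mask, 31); // 1 in each lane whose sign bit was set
+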
template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_srl(simd128_neon_t _a, int _count) + { + if (__builtin_constant_p(_count) ) + { + const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); + const uint32x4_t tmp1 = vshrq_n_u32(tmp0, _count); + const simd128_neon_t result = vreinterpretq_f32_u32(tmp1); + + return result; + } + + const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); + const int32x4_t shift = vdupq_n_s32(-_count); + const uint32x4_t tmp1 = vshlq_u32(tmp0, shift); + const simd128_neon_t result = vreinterpretq_f32_u32(tmp1); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_sra(simd128_neon_t _a, int _count) + { + if (__builtin_constant_p(_count) ) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vshrq_n_s32(tmp0, _count); + const simd128_neon_t result = vreinterpretq_f32_s32(tmp1); + + return result; + } + + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t shift = vdupq_n_s32(-_count); + const int32x4_t tmp1 = vshlq_s32(tmp0, shift); + const simd128_neon_t result = vreinterpretq_f32_s32(tmp1); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_madd(simd128_neon_t _a, simd128_neon_t _b, simd128_neon_t _c) + { + return vmlaq_f32(_c, _a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_nmsub(simd128_neon_t _a, simd128_neon_t _b, simd128_neon_t _c) + { + return vmlsq_f32(_c, _a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_icmpeq(simd128_neon_t _a, simd128_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const uint32x4_t tmp2 = vceqq_s32(tmp0, tmp1); + const simd128_neon_t result = vreinterpretq_f32_u32(tmp2); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_icmplt(simd128_neon_t _a, simd128_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const uint32x4_t tmp2 = vcltq_s32(tmp0, tmp1); + const simd128_neon_t result = vreinterpretq_f32_u32(tmp2); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_icmpgt(simd128_neon_t _a, simd128_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const uint32x4_t tmp2 = vcgtq_s32(tmp0, tmp1); + const simd128_neon_t result = vreinterpretq_f32_u32(tmp2); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_imin(simd128_neon_t _a, simd128_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const int32x4_t tmp2 = vminq_s32(tmp0, tmp1); + const simd128_neon_t result = vreinterpretq_f32_s32(tmp2); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_imax(simd128_neon_t _a, simd128_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const int32x4_t tmp2 = vmaxq_s32(tmp0, tmp1); + const simd128_neon_t result = vreinterpretq_f32_s32(tmp2); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd_iadd(simd128_neon_t _a, simd128_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const int32x4_t tmp2 = vaddq_s32(tmp0, tmp1); + const simd128_neon_t result = vreinterpretq_f32_s32(tmp2); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t 
simd_isub(simd128_neon_t _a, simd128_neon_t _b) + { + const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); + const int32x4_t tmp2 = vsubq_s32(tmp0, tmp1); + const simd128_neon_t result = vreinterpretq_f32_s32(tmp2); + + return result; + } + + template<> + BX_SIMD_INLINE simd128_neon_t simd_shuf_xAzC(simd128_neon_t _a, simd128_neon_t _b) + { + return simd_shuf_xAzC_ni(_a, _b); + } + + template<> + BX_SIMD_INLINE simd128_neon_t simd_shuf_yBwD(simd128_neon_t _a, simd128_neon_t _b) + { + return simd_shuf_yBwD_ni(_a, _b); + } + + typedef simd128_neon_t simd128_t; + +} // namespace bx + +#endif // BX_SIMD_NEON_H_HEADER_GUARD diff --git a/include/bx/float4_ref.h b/include/bx/simd128_ref.inl similarity index 64% rename from include/bx/float4_ref.h rename to include/bx/simd128_ref.inl index 0253d65..85f54e8 100644 --- a/include/bx/float4_ref.h +++ b/include/bx/simd128_ref.inl @@ -3,41 +3,41 @@ * License: https://github.com/bkaradzic/bx#license-bsd-2-clause */ -#ifndef BX_FLOAT4_REF_H_HEADER_GUARD -#define BX_FLOAT4_REF_H_HEADER_GUARD +#ifndef BX_SIMD_REF_H_HEADER_GUARD +#define BX_SIMD_REF_H_HEADER_GUARD #include // sqrtf -#define float4_shuf_xAzC float4_shuf_xAzC_ni -#define float4_shuf_yBwD float4_shuf_yBwD_ni -#define float4_rcp float4_rcp_ni -#define float4_orx float4_orx_ni -#define float4_orc float4_orc_ni -#define float4_neg float4_neg_ni -#define float4_madd float4_madd_ni -#define float4_nmsub float4_nmsub_ni -#define float4_div_nr float4_div_nr_ni -#define float4_selb float4_selb_ni -#define float4_sels float4_sels_ni -#define float4_not float4_not_ni -#define float4_abs float4_abs_ni -#define float4_clamp float4_clamp_ni -#define float4_lerp float4_lerp_ni -#define float4_rsqrt float4_rsqrt_ni -#define float4_rsqrt_nr float4_rsqrt_nr_ni -#define float4_rsqrt_carmack float4_rsqrt_carmack_ni -#define float4_sqrt_nr float4_sqrt_nr_ni -#define float4_log2 float4_log2_ni -#define float4_exp2 float4_exp2_ni -#define float4_pow float4_pow_ni -#define float4_cross3 float4_cross3_ni -#define float4_normalize3 float4_normalize3_ni -#define float4_dot3 float4_dot3_ni -#define float4_dot float4_dot_ni -#define float4_ceil float4_ceil_ni -#define float4_floor float4_floor_ni +#define simd_shuf_xAzC simd_shuf_xAzC_ni +#define simd_shuf_yBwD simd_shuf_yBwD_ni +#define simd_rcp simd_rcp_ni +#define simd_orx simd_orx_ni +#define simd_orc simd_orc_ni +#define simd_neg simd_neg_ni +#define simd_madd simd_madd_ni +#define simd_nmsub simd_nmsub_ni +#define simd_div_nr simd_div_nr_ni +#define simd_selb simd_selb_ni +#define simd_sels simd_sels_ni +#define simd_not simd_not_ni +#define simd_abs simd_abs_ni +#define simd_clamp simd_clamp_ni +#define simd_lerp simd_lerp_ni +#define simd_rsqrt simd_rsqrt_ni +#define simd_rsqrt_nr simd_rsqrt_nr_ni +#define simd_rsqrt_carmack simd_rsqrt_carmack_ni +#define simd_sqrt_nr simd_sqrt_nr_ni +#define simd_log2 simd_log2_ni +#define simd_exp2 simd_exp2_ni +#define simd_pow simd_pow_ni +#define simd_cross3 simd_cross3_ni +#define simd_normalize3 simd_normalize3_ni +#define simd_dot3 simd_dot3_ni +#define simd_dot simd_dot_ni +#define simd_ceil simd_ceil_ni +#define simd_floor simd_floor_ni -#include "float4_ni.h" +#include "simd_ni.inl" namespace bx { @@ -47,9 +47,9 @@ namespace bx #define ELEMw 3 #define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ template<> \ - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_swiz_##_x##_y##_z##_w(float4_ref_t _a) \ + BX_SIMD_FORCE_INLINE simd128_ref_t simd_swiz_##_x##_y##_z##_w(simd128_ref_t _a) \ { \ - 
float4_ref_t result; \ + simd128_ref_t result; \ result.ixyzw[0] = _a.ixyzw[ELEM##_x]; \ result.ixyzw[1] = _a.ixyzw[ELEM##_y]; \ result.ixyzw[2] = _a.ixyzw[ELEM##_z]; \ @@ -57,7 +57,7 @@ namespace bx return result; \ } -#include "float4_swizzle.inl" +#include "simd_swizzle.inl" #undef IMPLEMENT_SWIZZLE #undef ELEMw @@ -67,7 +67,7 @@ namespace bx #define IMPLEMENT_TEST(_xyzw, _mask) \ template<> \ - BX_FLOAT4_FORCE_INLINE bool float4_test_any_##_xyzw(float4_ref_t _test) \ + BX_SIMD_FORCE_INLINE bool simd_test_any_##_xyzw(simd128_ref_t _test) \ { \ uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \ | ( (_test.uxyzw[2]>>31)<<2) \ @@ -78,7 +78,7 @@ namespace bx } \ \ template<> \ - BX_FLOAT4_FORCE_INLINE bool float4_test_all_##_xyzw(float4_ref_t _test) \ + BX_SIMD_FORCE_INLINE bool simd_test_all_##_xyzw(simd128_ref_t _test) \ { \ uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \ | ( (_test.uxyzw[2]>>31)<<2) \ @@ -107,9 +107,9 @@ IMPLEMENT_TEST(xyzw , 0xf); #undef IMPLEMENT_TEST template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_xyAB(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_xyAB(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.uxyzw[0] = _a.uxyzw[0]; result.uxyzw[1] = _a.uxyzw[1]; result.uxyzw[2] = _b.uxyzw[0]; @@ -118,9 +118,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_ABxy(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_ABxy(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.uxyzw[0] = _b.uxyzw[0]; result.uxyzw[1] = _b.uxyzw[1]; result.uxyzw[2] = _a.uxyzw[0]; @@ -129,9 +129,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_CDzw(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_CDzw(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.uxyzw[0] = _b.uxyzw[2]; result.uxyzw[1] = _b.uxyzw[3]; result.uxyzw[2] = _a.uxyzw[2]; @@ -140,9 +140,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_zwCD(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_zwCD(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.uxyzw[0] = _a.uxyzw[2]; result.uxyzw[1] = _a.uxyzw[3]; result.uxyzw[2] = _b.uxyzw[2]; @@ -151,9 +151,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_xAyB(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_xAyB(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.uxyzw[0] = _a.uxyzw[0]; result.uxyzw[1] = _b.uxyzw[0]; result.uxyzw[2] = _a.uxyzw[1]; @@ -162,9 +162,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_yBxA(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_yBxA(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.uxyzw[0] = _a.uxyzw[1]; result.uxyzw[1] = _b.uxyzw[1]; result.uxyzw[2] = _a.uxyzw[0]; @@ -173,9 +173,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_zCwD(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_zCwD(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.uxyzw[0] = _a.uxyzw[2]; result.uxyzw[1] = _b.uxyzw[2]; 
result.uxyzw[2] = _a.uxyzw[3]; @@ -184,9 +184,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_shuf_CzDw(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_CzDw(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.uxyzw[0] = _b.uxyzw[2]; result.uxyzw[1] = _a.uxyzw[2]; result.uxyzw[2] = _b.uxyzw[3]; @@ -195,34 +195,34 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float float4_x(float4_ref_t _a) + BX_SIMD_FORCE_INLINE float simd_x(simd128_ref_t _a) { return _a.fxyzw[0]; } template<> - BX_FLOAT4_FORCE_INLINE float float4_y(float4_ref_t _a) + BX_SIMD_FORCE_INLINE float simd_y(simd128_ref_t _a) { return _a.fxyzw[1]; } template<> - BX_FLOAT4_FORCE_INLINE float float4_z(float4_ref_t _a) + BX_SIMD_FORCE_INLINE float simd_z(simd128_ref_t _a) { return _a.fxyzw[2]; } template<> - BX_FLOAT4_FORCE_INLINE float float4_w(float4_ref_t _a) + BX_SIMD_FORCE_INLINE float simd_w(simd128_ref_t _a) { return _a.fxyzw[3]; } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_ld(const void* _ptr) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_ld(const void* _ptr) { const uint32_t* input = reinterpret_cast(_ptr); - float4_ref_t result; + simd128_ref_t result; result.uxyzw[0] = input[0]; result.uxyzw[1] = input[1]; result.uxyzw[2] = input[2]; @@ -231,7 +231,7 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE void float4_st(void* _ptr, float4_ref_t _a) + BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, simd128_ref_t _a) { uint32_t* result = reinterpret_cast(_ptr); result[0] = _a.uxyzw[0]; @@ -241,14 +241,14 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE void float4_stx(void* _ptr, float4_ref_t _a) + BX_SIMD_FORCE_INLINE void simd_stx(void* _ptr, simd128_ref_t _a) { uint32_t* result = reinterpret_cast(_ptr); result[0] = _a.uxyzw[0]; } template<> - BX_FLOAT4_FORCE_INLINE void float4_stream(void* _ptr, float4_ref_t _a) + BX_SIMD_FORCE_INLINE void simd_stream(void* _ptr, simd128_ref_t _a) { uint32_t* result = reinterpret_cast(_ptr); result[0] = _a.uxyzw[0]; @@ -258,9 +258,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_ld(float _x, float _y, float _z, float _w) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_ld(float _x, float _y, float _z, float _w) { - float4_ref_t result; + simd128_ref_t result; result.fxyzw[0] = _x; result.fxyzw[1] = _y; result.fxyzw[2] = _z; @@ -269,9 +269,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) { - float4_ref_t result; + simd128_ref_t result; result.uxyzw[0] = _x; result.uxyzw[1] = _y; result.uxyzw[2] = _z; @@ -280,10 +280,10 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_splat(const void* _ptr) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_splat(const void* _ptr) { const uint32_t val = *reinterpret_cast(_ptr); - float4_ref_t result; + simd128_ref_t result; result.uxyzw[0] = val; result.uxyzw[1] = val; result.uxyzw[2] = val; @@ -292,27 +292,27 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_splat(float _a) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_splat(float _a) { - return float4_ld(_a, _a, _a, _a); + return simd_ld(_a, _a, _a, _a); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_isplat(uint32_t _a) + 
BX_SIMD_FORCE_INLINE simd128_ref_t simd_isplat(uint32_t _a) { - return float4_ild(_a, _a, _a, _a); + return simd_ild(_a, _a, _a, _a); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_zero() + BX_SIMD_FORCE_INLINE simd128_ref_t simd_zero() { - return float4_ild(0, 0, 0, 0); + return simd_ild(0, 0, 0, 0); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_itof(float4_ref_t _a) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_itof(simd128_ref_t _a) { - float4_ref_t result; + simd128_ref_t result; result.fxyzw[0] = (float)_a.ixyzw[0]; result.fxyzw[1] = (float)_a.ixyzw[1]; result.fxyzw[2] = (float)_a.ixyzw[2]; @@ -321,9 +321,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_ftoi(float4_ref_t _a) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_ftoi(simd128_ref_t _a) { - float4_ref_t result; + simd128_ref_t result; result.ixyzw[0] = (int)_a.fxyzw[0]; result.ixyzw[1] = (int)_a.fxyzw[1]; result.ixyzw[2] = (int)_a.fxyzw[2]; @@ -332,15 +332,15 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_round(float4_ref_t _a) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_round(simd128_ref_t _a) { - return float4_round_ni(_a); + return simd_round_ni(_a); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_add(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_add(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.fxyzw[0] = _a.fxyzw[0] + _b.fxyzw[0]; result.fxyzw[1] = _a.fxyzw[1] + _b.fxyzw[1]; result.fxyzw[2] = _a.fxyzw[2] + _b.fxyzw[2]; @@ -349,9 +349,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_sub(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_sub(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.fxyzw[0] = _a.fxyzw[0] - _b.fxyzw[0]; result.fxyzw[1] = _a.fxyzw[1] - _b.fxyzw[1]; result.fxyzw[2] = _a.fxyzw[2] - _b.fxyzw[2]; @@ -360,9 +360,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_mul(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_mul(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.fxyzw[0] = _a.fxyzw[0] * _b.fxyzw[0]; result.fxyzw[1] = _a.fxyzw[1] * _b.fxyzw[1]; result.fxyzw[2] = _a.fxyzw[2] * _b.fxyzw[2]; @@ -371,9 +371,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_div(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_div(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.fxyzw[0] = _a.fxyzw[0] / _b.fxyzw[0]; result.fxyzw[1] = _a.fxyzw[1] / _b.fxyzw[1]; result.fxyzw[2] = _a.fxyzw[2] / _b.fxyzw[2]; @@ -382,9 +382,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_rcp_est(float4_ref_t _a) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_rcp_est(simd128_ref_t _a) { - float4_ref_t result; + simd128_ref_t result; result.fxyzw[0] = 1.0f / _a.fxyzw[0]; result.fxyzw[1] = 1.0f / _a.fxyzw[1]; result.fxyzw[2] = 1.0f / _a.fxyzw[2]; @@ -393,9 +393,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_sqrt(float4_ref_t _a) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_sqrt(simd128_ref_t _a) { - float4_ref_t result; + simd128_ref_t result; result.fxyzw[0] = sqrtf(_a.fxyzw[0]); result.fxyzw[1] = sqrtf(_a.fxyzw[1]); result.fxyzw[2] = sqrtf(_a.fxyzw[2]); @@ -404,9 
+404,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_rsqrt_est(float4_ref_t _a) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_rsqrt_est(simd128_ref_t _a) { - float4_ref_t result; + simd128_ref_t result; result.fxyzw[0] = 1.0f / sqrtf(_a.fxyzw[0]); result.fxyzw[1] = 1.0f / sqrtf(_a.fxyzw[1]); result.fxyzw[2] = 1.0f / sqrtf(_a.fxyzw[2]); @@ -415,9 +415,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_cmpeq(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmpeq(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.ixyzw[0] = _a.fxyzw[0] == _b.fxyzw[0] ? 0xffffffff : 0x0; result.ixyzw[1] = _a.fxyzw[1] == _b.fxyzw[1] ? 0xffffffff : 0x0; result.ixyzw[2] = _a.fxyzw[2] == _b.fxyzw[2] ? 0xffffffff : 0x0; @@ -426,9 +426,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_cmplt(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmplt(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.ixyzw[0] = _a.fxyzw[0] < _b.fxyzw[0] ? 0xffffffff : 0x0; result.ixyzw[1] = _a.fxyzw[1] < _b.fxyzw[1] ? 0xffffffff : 0x0; result.ixyzw[2] = _a.fxyzw[2] < _b.fxyzw[2] ? 0xffffffff : 0x0; @@ -437,9 +437,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_cmple(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmple(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.ixyzw[0] = _a.fxyzw[0] <= _b.fxyzw[0] ? 0xffffffff : 0x0; result.ixyzw[1] = _a.fxyzw[1] <= _b.fxyzw[1] ? 0xffffffff : 0x0; result.ixyzw[2] = _a.fxyzw[2] <= _b.fxyzw[2] ? 0xffffffff : 0x0; @@ -448,9 +448,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_cmpgt(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmpgt(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.ixyzw[0] = _a.fxyzw[0] > _b.fxyzw[0] ? 0xffffffff : 0x0; result.ixyzw[1] = _a.fxyzw[1] > _b.fxyzw[1] ? 0xffffffff : 0x0; result.ixyzw[2] = _a.fxyzw[2] > _b.fxyzw[2] ? 0xffffffff : 0x0; @@ -459,9 +459,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_cmpge(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmpge(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.ixyzw[0] = _a.fxyzw[0] >= _b.fxyzw[0] ? 0xffffffff : 0x0; result.ixyzw[1] = _a.fxyzw[1] >= _b.fxyzw[1] ? 0xffffffff : 0x0; result.ixyzw[2] = _a.fxyzw[2] >= _b.fxyzw[2] ? 0xffffffff : 0x0; @@ -470,9 +470,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_min(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_min(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.fxyzw[0] = _a.fxyzw[0] < _b.fxyzw[0] ? _a.fxyzw[0] : _b.fxyzw[0]; result.fxyzw[1] = _a.fxyzw[1] < _b.fxyzw[1] ? _a.fxyzw[1] : _b.fxyzw[1]; result.fxyzw[2] = _a.fxyzw[2] < _b.fxyzw[2] ? 
_a.fxyzw[2] : _b.fxyzw[2]; @@ -481,9 +481,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_max(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_max(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.fxyzw[0] = _a.fxyzw[0] > _b.fxyzw[0] ? _a.fxyzw[0] : _b.fxyzw[0]; result.fxyzw[1] = _a.fxyzw[1] > _b.fxyzw[1] ? _a.fxyzw[1] : _b.fxyzw[1]; result.fxyzw[2] = _a.fxyzw[2] > _b.fxyzw[2] ? _a.fxyzw[2] : _b.fxyzw[2]; @@ -492,9 +492,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_and(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_and(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.uxyzw[0] = _a.uxyzw[0] & _b.uxyzw[0]; result.uxyzw[1] = _a.uxyzw[1] & _b.uxyzw[1]; result.uxyzw[2] = _a.uxyzw[2] & _b.uxyzw[2]; @@ -503,9 +503,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_andc(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_andc(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.uxyzw[0] = _a.uxyzw[0] & ~_b.uxyzw[0]; result.uxyzw[1] = _a.uxyzw[1] & ~_b.uxyzw[1]; result.uxyzw[2] = _a.uxyzw[2] & ~_b.uxyzw[2]; @@ -514,9 +514,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_or(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_or(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.uxyzw[0] = _a.uxyzw[0] | _b.uxyzw[0]; result.uxyzw[1] = _a.uxyzw[1] | _b.uxyzw[1]; result.uxyzw[2] = _a.uxyzw[2] | _b.uxyzw[2]; @@ -525,9 +525,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_xor(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_xor(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.uxyzw[0] = _a.uxyzw[0] ^ _b.uxyzw[0]; result.uxyzw[1] = _a.uxyzw[1] ^ _b.uxyzw[1]; result.uxyzw[2] = _a.uxyzw[2] ^ _b.uxyzw[2]; @@ -536,9 +536,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_sll(float4_ref_t _a, int _count) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_sll(simd128_ref_t _a, int _count) { - float4_ref_t result; + simd128_ref_t result; result.uxyzw[0] = _a.uxyzw[0] << _count; result.uxyzw[1] = _a.uxyzw[1] << _count; result.uxyzw[2] = _a.uxyzw[2] << _count; @@ -547,9 +547,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_srl(float4_ref_t _a, int _count) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_srl(simd128_ref_t _a, int _count) { - float4_ref_t result; + simd128_ref_t result; result.uxyzw[0] = _a.uxyzw[0] >> _count; result.uxyzw[1] = _a.uxyzw[1] >> _count; result.uxyzw[2] = _a.uxyzw[2] >> _count; @@ -558,9 +558,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_sra(float4_ref_t _a, int _count) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_sra(simd128_ref_t _a, int _count) { - float4_ref_t result; + simd128_ref_t result; result.ixyzw[0] = _a.ixyzw[0] >> _count; result.ixyzw[1] = _a.ixyzw[1] >> _count; result.ixyzw[2] = _a.ixyzw[2] >> _count; @@ -569,9 +569,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_icmpeq(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_icmpeq(simd128_ref_t _a, simd128_ref_t 
_b) { - float4_ref_t result; + simd128_ref_t result; result.ixyzw[0] = _a.ixyzw[0] == _b.ixyzw[0] ? 0xffffffff : 0x0; result.ixyzw[1] = _a.ixyzw[1] == _b.ixyzw[1] ? 0xffffffff : 0x0; result.ixyzw[2] = _a.ixyzw[2] == _b.ixyzw[2] ? 0xffffffff : 0x0; @@ -580,9 +580,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_icmplt(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_icmplt(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.ixyzw[0] = _a.ixyzw[0] < _b.ixyzw[0] ? 0xffffffff : 0x0; result.ixyzw[1] = _a.ixyzw[1] < _b.ixyzw[1] ? 0xffffffff : 0x0; result.ixyzw[2] = _a.ixyzw[2] < _b.ixyzw[2] ? 0xffffffff : 0x0; @@ -591,9 +591,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_icmpgt(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_icmpgt(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.ixyzw[0] = _a.ixyzw[0] > _b.ixyzw[0] ? 0xffffffff : 0x0; result.ixyzw[1] = _a.ixyzw[1] > _b.ixyzw[1] ? 0xffffffff : 0x0; result.ixyzw[2] = _a.ixyzw[2] > _b.ixyzw[2] ? 0xffffffff : 0x0; @@ -602,9 +602,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_imin(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_imin(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.ixyzw[0] = _a.ixyzw[0] < _b.ixyzw[0] ? _a.ixyzw[0] : _b.ixyzw[0]; result.ixyzw[1] = _a.ixyzw[1] < _b.ixyzw[1] ? _a.ixyzw[1] : _b.ixyzw[1]; result.ixyzw[2] = _a.ixyzw[2] < _b.ixyzw[2] ? _a.ixyzw[2] : _b.ixyzw[2]; @@ -613,9 +613,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_imax(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_imax(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.ixyzw[0] = _a.ixyzw[0] > _b.ixyzw[0] ? _a.ixyzw[0] : _b.ixyzw[0]; result.ixyzw[1] = _a.ixyzw[1] > _b.ixyzw[1] ? _a.ixyzw[1] : _b.ixyzw[1]; result.ixyzw[2] = _a.ixyzw[2] > _b.ixyzw[2] ? _a.ixyzw[2] : _b.ixyzw[2]; @@ -624,9 +624,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_iadd(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_iadd(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.ixyzw[0] = _a.ixyzw[0] + _b.ixyzw[0]; result.ixyzw[1] = _a.ixyzw[1] + _b.ixyzw[1]; result.ixyzw[2] = _a.ixyzw[2] + _b.ixyzw[2]; @@ -635,9 +635,9 @@ IMPLEMENT_TEST(xyzw , 0xf); } template<> - BX_FLOAT4_FORCE_INLINE float4_ref_t float4_isub(float4_ref_t _a, float4_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd_isub(simd128_ref_t _a, simd128_ref_t _b) { - float4_ref_t result; + simd128_ref_t result; result.ixyzw[0] = _a.ixyzw[0] - _b.ixyzw[0]; result.ixyzw[1] = _a.ixyzw[1] - _b.ixyzw[1]; result.ixyzw[2] = _a.ixyzw[2] - _b.ixyzw[2]; @@ -647,4 +647,4 @@ IMPLEMENT_TEST(xyzw , 0xf); } // namespace bx -#endif // BX_FLOAT4_REF_H_HEADER_GUARD +#endif // BX_SIMD_REF_H_HEADER_GUARD diff --git a/include/bx/simd128_sse.inl b/include/bx/simd128_sse.inl new file mode 100644 index 0000000..f68ea26 --- /dev/null +++ b/include/bx/simd128_sse.inl @@ -0,0 +1,647 @@ +/* + * Copyright 2010-2016 Branimir Karadzic. All rights reserved. 
+ * License: https://github.com/bkaradzic/bx#license-bsd-2-clause + */ + +#ifndef BX_SIMD_SSE_H_HEADER_GUARD +#define BX_SIMD_SSE_H_HEADER_GUARD + +#include "simd_ni.inl" + +namespace bx +{ +#define ELEMx 0 +#define ELEMy 1 +#define ELEMz 2 +#define ELEMw 3 +#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ + template<> \ + BX_SIMD_FORCE_INLINE simd128_sse_t simd_swiz_##_x##_y##_z##_w(simd128_sse_t _a) \ + { \ + return _mm_shuffle_ps( _a, _a, _MM_SHUFFLE(ELEM##_w, ELEM##_z, ELEM##_y, ELEM##_x ) ); \ + } + +#include "simd_swizzle.inl" + +#undef IMPLEMENT_SWIZZLE +#undef ELEMw +#undef ELEMz +#undef ELEMy +#undef ELEMx + +#define IMPLEMENT_TEST(_xyzw, _mask) \ + template<> \ + BX_SIMD_FORCE_INLINE bool simd_test_any_##_xyzw(simd128_sse_t _test) \ + { \ + return 0x0 != (_mm_movemask_ps(_test)&(_mask) ); \ + } \ + \ + template<> \ + BX_SIMD_FORCE_INLINE bool simd_test_all_##_xyzw(simd128_sse_t _test) \ + { \ + return (_mask) == (_mm_movemask_ps(_test)&(_mask) ); \ + } + +IMPLEMENT_TEST(x , 0x1); +IMPLEMENT_TEST(y , 0x2); +IMPLEMENT_TEST(xy , 0x3); +IMPLEMENT_TEST(z , 0x4); +IMPLEMENT_TEST(xz , 0x5); +IMPLEMENT_TEST(yz , 0x6); +IMPLEMENT_TEST(xyz , 0x7); +IMPLEMENT_TEST(w , 0x8); +IMPLEMENT_TEST(xw , 0x9); +IMPLEMENT_TEST(yw , 0xa); +IMPLEMENT_TEST(xyw , 0xb); +IMPLEMENT_TEST(zw , 0xc); +IMPLEMENT_TEST(xzw , 0xd); +IMPLEMENT_TEST(yzw , 0xe); +IMPLEMENT_TEST(xyzw , 0xf); + +#undef IMPLEMENT_TEST + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_xyAB(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_movelh_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_ABxy(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_movelh_ps(_b, _a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_CDzw(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_movehl_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_zwCD(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_movehl_ps(_b, _a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_xAyB(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_unpacklo_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_yBxA(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_unpacklo_ps(_b, _a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_zCwD(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_unpackhi_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_CzDw(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_unpackhi_ps(_b, _a); + } + + template<> + BX_SIMD_FORCE_INLINE float simd_x(simd128_sse_t _a) + { + return _mm_cvtss_f32(_a); + } + + template<> + BX_SIMD_FORCE_INLINE float simd_y(simd128_sse_t _a) + { + const simd128_sse_t yyyy = simd_swiz_yyyy(_a); + const float result = _mm_cvtss_f32(yyyy); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE float simd_z(simd128_sse_t _a) + { + const simd128_sse_t zzzz = simd_swiz_zzzz(_a); + const float result = _mm_cvtss_f32(zzzz); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE float simd_w(simd128_sse_t _a) + { + const simd128_sse_t wwww = simd_swiz_wwww(_a); + const float result = _mm_cvtss_f32(wwww); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_ld(const void* _ptr) + { + return _mm_load_ps(reinterpret_cast(_ptr) ); + } + + template<> + BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, simd128_sse_t _a) + { + _mm_store_ps(reinterpret_cast(_ptr), _a); + } + + template<> + BX_SIMD_FORCE_INLINE void simd_stx(void* 
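// The two xmacro blocks above generate all swizzles from _mm_shuffle_ps and all
// lane tests from _mm_movemask_ps. A hand-expanded illustration of what one of
// each looks like (standalone sketch, not code from the patch):
#include <xmmintrin.h>

static inline __m128 swiz_yzxw_example(__m128 _a)
{
	// result = { a.y, a.z, a.x, a.w }; _MM_SHUFFLE lists the indices w,z,y,x.
	return _mm_shuffle_ps(_a, _a, _MM_SHUFFLE(3, 0, 2, 1) );
}

static inline bool test_any_xyz_example(__m128 _test)
{
	// _mm_movemask_ps packs the four sign bits into bits 0..3 of an int;
	// masking with 0x7 keeps only the x, y and z lanes.
	return 0 != (_mm_movemask_ps(_test) & 0x7);
}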
_ptr, simd128_sse_t _a) + { + _mm_store_ss(reinterpret_cast(_ptr), _a); + } + + template<> + BX_SIMD_FORCE_INLINE void simd_stream(void* _ptr, simd128_sse_t _a) + { + _mm_stream_ps(reinterpret_cast(_ptr), _a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_ld(float _x, float _y, float _z, float _w) + { + return _mm_set_ps(_w, _z, _y, _x); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) + { + const __m128i set = _mm_set_epi32(_w, _z, _y, _x); + const simd128_sse_t result = _mm_castsi128_ps(set); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_splat(const void* _ptr) + { + const simd128_sse_t x___ = _mm_load_ss(reinterpret_cast(_ptr) ); + const simd128_sse_t result = simd_swiz_xxxx(x___); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_splat(float _a) + { + return _mm_set1_ps(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_isplat(uint32_t _a) + { + const __m128i splat = _mm_set1_epi32(_a); + const simd128_sse_t result = _mm_castsi128_ps(splat); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_zero() + { + return _mm_setzero_ps(); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_itof(simd128_sse_t _a) + { + const __m128i itof = _mm_castps_si128(_a); + const simd128_sse_t result = _mm_cvtepi32_ps(itof); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_ftoi(simd128_sse_t _a) + { + const __m128i ftoi = _mm_cvtps_epi32(_a); + const simd128_sse_t result = _mm_castsi128_ps(ftoi); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_round(simd128_sse_t _a) + { +#if defined(__SSE4_1__) + return _mm_round_ps(_a, _MM_FROUND_NINT); +#else + const __m128i round = _mm_cvtps_epi32(_a); + const simd128_sse_t result = _mm_cvtepi32_ps(round); + + return result; +#endif // defined(__SSE4_1__) + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_add(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_add_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_sub(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_sub_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_mul(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_mul_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_div(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_div_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_rcp_est(simd128_sse_t _a) + { + return _mm_rcp_ps(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_sqrt(simd128_sse_t _a) + { + return _mm_sqrt_ps(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_rsqrt_est(simd128_sse_t _a) + { + return _mm_rsqrt_ps(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_dot3(simd128_sse_t _a, simd128_sse_t _b) + { +#if defined(__SSE4_1__) + return _mm_dp_ps(_a, _b, 0x77); +#else + return simd_dot3_ni(_a, _b); +#endif // defined(__SSE4__) + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_dot(simd128_sse_t _a, simd128_sse_t _b) + { +#if defined(__SSE4_1__) + return _mm_dp_ps(_a, _b, 0xFF); +#else + return simd_dot_ni(_a, _b); +#endif // defined(__SSE4__) + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmpeq(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_cmpeq_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmplt(simd128_sse_t _a, simd128_sse_t 
_b) + { + return _mm_cmplt_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmple(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_cmple_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmpgt(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_cmpgt_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmpge(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_cmpge_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_min(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_min_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_max(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_max_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_and(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_and_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_andc(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_andnot_ps(_b, _a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_or(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_or_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_xor(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_xor_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_sll(simd128_sse_t _a, int _count) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i shift = _mm_slli_epi32(a, _count); + const simd128_sse_t result = _mm_castsi128_ps(shift); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_srl(simd128_sse_t _a, int _count) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i shift = _mm_srli_epi32(a, _count); + const simd128_sse_t result = _mm_castsi128_ps(shift); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_sra(simd128_sse_t _a, int _count) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i shift = _mm_srai_epi32(a, _count); + const simd128_sse_t result = _mm_castsi128_ps(shift); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_icmpeq(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i tmp0 = _mm_castps_si128(_a); + const __m128i tmp1 = _mm_castps_si128(_b); + const __m128i tmp2 = _mm_cmpeq_epi32(tmp0, tmp1); + const simd128_sse_t result = _mm_castsi128_ps(tmp2); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_icmplt(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i tmp0 = _mm_castps_si128(_a); + const __m128i tmp1 = _mm_castps_si128(_b); + const __m128i tmp2 = _mm_cmplt_epi32(tmp0, tmp1); + const simd128_sse_t result = _mm_castsi128_ps(tmp2); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_icmpgt(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i tmp0 = _mm_castps_si128(_a); + const __m128i tmp1 = _mm_castps_si128(_b); + const __m128i tmp2 = _mm_cmpgt_epi32(tmp0, tmp1); + const simd128_sse_t result = _mm_castsi128_ps(tmp2); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_imin(simd128_sse_t _a, simd128_sse_t _b) + { +#if defined(__SSE4_1__) + const __m128i tmp0 = _mm_castps_si128(_a); + const __m128i tmp1 = _mm_castps_si128(_b); + const __m128i tmp2 = _mm_min_epi32(tmp0, tmp1); + const simd128_sse_t result = _mm_castsi128_ps(tmp2); + + return result; +#else + return simd_imin_ni(_a, _b); +#endif // defined(__SSE4_1__) + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_imax(simd128_sse_t _a, simd128_sse_t _b) + { 
+#if defined(__SSE4_1__) + const __m128i tmp0 = _mm_castps_si128(_a); + const __m128i tmp1 = _mm_castps_si128(_b); + const __m128i tmp2 = _mm_max_epi32(tmp0, tmp1); + const simd128_sse_t result = _mm_castsi128_ps(tmp2); + + return result; +#else + return simd_imax_ni(_a, _b); +#endif // defined(__SSE4_1__) + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_iadd(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i add = _mm_add_epi32(a, b); + const simd128_sse_t result = _mm_castsi128_ps(add); + + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd_isub(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i sub = _mm_sub_epi32(a, b); + const simd128_sse_t result = _mm_castsi128_ps(sub); + + return result; + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_shuf_xAzC(simd128_sse_t _a, simd128_sse_t _b) + { + return simd_shuf_xAzC_ni(_a, _b); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_shuf_yBwD(simd128_sse_t _a, simd128_sse_t _b) + { + return simd_shuf_yBwD_ni(_a, _b); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_rcp(simd128_sse_t _a) + { + return simd_rcp_ni(_a); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_orx(simd128_sse_t _a) + { + return simd_orx_ni(_a); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_orc(simd128_sse_t _a, simd128_sse_t _b) + { + return simd_orc_ni(_a, _b); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_neg(simd128_sse_t _a) + { + return simd_neg_ni(_a); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_madd(simd128_sse_t _a, simd128_sse_t _b, simd128_sse_t _c) + { + return simd_madd_ni(_a, _b, _c); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_nmsub(simd128_sse_t _a, simd128_sse_t _b, simd128_sse_t _c) + { + return simd_nmsub_ni(_a, _b, _c); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_div_nr(simd128_sse_t _a, simd128_sse_t _b) + { + return simd_div_nr_ni(_a, _b); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_selb(simd128_sse_t _mask, simd128_sse_t _a, simd128_sse_t _b) + { + return simd_selb_ni(_mask, _a, _b); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_sels(simd128_sse_t _test, simd128_sse_t _a, simd128_sse_t _b) + { + return simd_sels_ni(_test, _a, _b); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_not(simd128_sse_t _a) + { + return simd_not_ni(_a); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_abs(simd128_sse_t _a) + { + return simd_abs_ni(_a); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_clamp(simd128_sse_t _a, simd128_sse_t _min, simd128_sse_t _max) + { + return simd_clamp_ni(_a, _min, _max); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_lerp(simd128_sse_t _a, simd128_sse_t _b, simd128_sse_t _s) + { + return simd_lerp_ni(_a, _b, _s); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_rsqrt(simd128_sse_t _a) + { + return simd_rsqrt_ni(_a); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_rsqrt_nr(simd128_sse_t _a) + { + return simd_rsqrt_nr_ni(_a); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_rsqrt_carmack(simd128_sse_t _a) + { + return simd_rsqrt_carmack_ni(_a); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_sqrt_nr(simd128_sse_t _a) + { + return simd_sqrt_nr_ni(_a); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_log2(simd128_sse_t _a) + { + return simd_log2_ni(_a); + } + + template<> + 
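// Operations plain SSE has no single instruction for (madd, lerp, clamp,
// rsqrt_nr, log2, ...) are forwarded above to the generic *_ni helpers, which
// rebuild them from the primitives specialized earlier in this file. For
// example simd_madd ends up as a multiply followed by an add, since SSE
// (without the later FMA extensions) has no fused multiply-add. A sketch:
#include <xmmintrin.h>

static inline __m128 madd_example(__m128 _a, __m128 _b, __m128 _c)
{
	return _mm_add_ps(_mm_mul_ps(_a, _b), _c); // a*b + c, rounded twice
}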
BX_SIMD_INLINE simd128_sse_t simd_exp2(simd128_sse_t _a) + { + return simd_exp2_ni(_a); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_pow(simd128_sse_t _a, simd128_sse_t _b) + { + return simd_pow_ni(_a, _b); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_cross3(simd128_sse_t _a, simd128_sse_t _b) + { + return simd_cross3_ni(_a, _b); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_normalize3(simd128_sse_t _a) + { + return simd_normalize3_ni(_a); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_ceil(simd128_sse_t _a) + { + return simd_ceil_ni(_a); + } + + template<> + BX_SIMD_INLINE simd128_sse_t simd_floor(simd128_sse_t _a) + { + return simd_floor_ni(_a); + } + + typedef simd128_sse_t simd128_t; + +} // namespace bx + +#endif // BX_SIMD_SSE_H_HEADER_GUARD diff --git a/include/bx/simd_ni.inl b/include/bx/simd_ni.inl new file mode 100644 index 0000000..cab1086 --- /dev/null +++ b/include/bx/simd_ni.inl @@ -0,0 +1,558 @@ +/* + * Copyright 2010-2016 Branimir Karadzic. All rights reserved. + * License: https://github.com/bkaradzic/bx#license-bsd-2-clause + */ + +#ifndef BX_SIMD_NI_H_HEADER_GUARD +#define BX_SIMD_NI_H_HEADER_GUARD + +namespace bx +{ + template + BX_SIMD_INLINE Ty simd_shuf_xAzC_ni(Ty _a, Ty _b) + { + const Ty xAyB = simd_shuf_xAyB(_a, _b); + const Ty zCwD = simd_shuf_zCwD(_a, _b); + const Ty result = simd_shuf_xyAB(xAyB, zCwD); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_shuf_yBwD_ni(Ty _a, Ty _b) + { + const Ty xAyB = simd_shuf_xAyB(_a, _b); + const Ty zCwD = simd_shuf_zCwD(_a, _b); + const Ty result = simd_shuf_zwCD(xAyB, zCwD); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_madd_ni(Ty _a, Ty _b, Ty _c) + { + const Ty mul = simd_mul(_a, _b); + const Ty result = simd_add(mul, _c); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_nmsub_ni(Ty _a, Ty _b, Ty _c) + { + const Ty mul = simd_mul(_a, _b); + const Ty result = simd_sub(_c, mul); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_div_nr_ni(Ty _a, Ty _b) + { + const Ty oneish = simd_isplat(0x3f800001); + const Ty est = simd_rcp_est(_b); + const Ty iter0 = simd_mul(_a, est); + const Ty tmp1 = simd_nmsub(_b, est, oneish); + const Ty result = simd_madd(tmp1, iter0, iter0); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_rcp_ni(Ty _a) + { + const Ty one = simd_splat(1.0f); + const Ty result = simd_div(one, _a); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_orx_ni(Ty _a) + { + const Ty zwxy = simd_swiz_zwxy(_a); + const Ty tmp0 = simd_or(_a, zwxy); + const Ty tmp1 = simd_swiz_yyyy(_a); + const Ty tmp2 = simd_or(tmp0, tmp1); + const Ty mf000 = simd_ild(UINT32_MAX, 0, 0, 0); + const Ty result = simd_and(tmp2, mf000); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_orc_ni(Ty _a, Ty _b) + { + const Ty aorb = simd_or(_a, _b); + const Ty mffff = simd_isplat(UINT32_MAX); + const Ty result = simd_xor(aorb, mffff); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_neg_ni(Ty _a) + { + const Ty zero = simd_zero(); + const Ty result = simd_sub(zero, _a); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_selb_ni(Ty _mask, Ty _a, Ty _b) + { + const Ty sel_a = simd_and(_a, _mask); + const Ty sel_b = simd_andc(_b, _mask); + const Ty result = simd_or(sel_a, sel_b); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_sels_ni(Ty _test, Ty _a, Ty _b) + { + const Ty mask = simd_sra(_test, 31); + const Ty result = simd_selb(mask, _a, _b); + + return result; + } + + template + BX_SIMD_INLINE Ty 
simd_not_ni(Ty _a) + { + const Ty mffff = simd_isplat(UINT32_MAX); + const Ty result = simd_xor(_a, mffff); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_min_ni(Ty _a, Ty _b) + { + const Ty mask = simd_cmplt(_a, _b); + const Ty result = simd_selb(mask, _a, _b); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_max_ni(Ty _a, Ty _b) + { + const Ty mask = simd_cmpgt(_a, _b); + const Ty result = simd_selb(mask, _a, _b); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_abs_ni(Ty _a) + { + const Ty a_neg = simd_neg(_a); + const Ty result = simd_max(a_neg, _a); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_imin_ni(Ty _a, Ty _b) + { + const Ty mask = simd_icmplt(_a, _b); + const Ty result = simd_selb(mask, _a, _b); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_imax_ni(Ty _a, Ty _b) + { + const Ty mask = simd_icmpgt(_a, _b); + const Ty result = simd_selb(mask, _a, _b); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_clamp_ni(Ty _a, Ty _min, Ty _max) + { + const Ty tmp = simd_min(_a, _max); + const Ty result = simd_max(tmp, _min); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_lerp_ni(Ty _a, Ty _b, Ty _s) + { + const Ty ba = simd_sub(_b, _a); + const Ty result = simd_madd(_s, ba, _a); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_sqrt_nr_ni(Ty _a) + { + const Ty half = simd_splat(0.5f); + const Ty one = simd_splat(1.0f); + const Ty tmp0 = simd_rsqrt_est(_a); + const Ty tmp1 = simd_mul(tmp0, _a); + const Ty tmp2 = simd_mul(tmp1, half); + const Ty tmp3 = simd_nmsub(tmp0, tmp1, one); + const Ty result = simd_madd(tmp3, tmp2, tmp1); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_sqrt_nr1_ni(Ty _a) + { + const Ty half = simd_splat(0.5f); + + Ty result = _a; + for (uint32_t ii = 0; ii < 11; ++ii) + { + const Ty tmp1 = simd_div(_a, result); + const Ty tmp2 = simd_add(tmp1, result); + result = simd_mul(tmp2, half); + } + + return result; + } + + template + BX_SIMD_INLINE Ty simd_rsqrt_ni(Ty _a) + { + const Ty one = simd_splat(1.0f); + const Ty sqrt = simd_sqrt(_a); + const Ty result = simd_div(one, sqrt); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_rsqrt_nr_ni(Ty _a) + { + const Ty rsqrt = simd_rsqrt_est(_a); + const Ty iter0 = simd_mul(_a, rsqrt); + const Ty iter1 = simd_mul(iter0, rsqrt); + const Ty half = simd_splat(0.5f); + const Ty half_rsqrt = simd_mul(half, rsqrt); + const Ty three = simd_splat(3.0f); + const Ty three_sub_iter1 = simd_sub(three, iter1); + const Ty result = simd_mul(half_rsqrt, three_sub_iter1); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_rsqrt_carmack_ni(Ty _a) + { + const Ty half = simd_splat(0.5f); + const Ty ah = simd_mul(half, _a); + const Ty ashift = simd_sra(_a, 1); + const Ty magic = simd_isplat(0x5f3759df); + const Ty msuba = simd_isub(magic, ashift); + const Ty msubasq = simd_mul(msuba, msuba); + const Ty tmp0 = simd_splat(1.5f); + const Ty tmp1 = simd_mul(ah, msubasq); + const Ty tmp2 = simd_sub(tmp0, tmp1); + const Ty result = simd_mul(msuba, tmp2); + + return result; + } + + namespace simd_logexp_detail + { + template + BX_SIMD_INLINE Ty simd_poly1(Ty _a, float _b, float _c) + { + const Ty bbbb = simd_splat(_b); + const Ty cccc = simd_splat(_c); + const Ty result = simd_madd(cccc, _a, bbbb); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_poly2(Ty _a, float _b, float _c, float _d) + { + const Ty bbbb = simd_splat(_b); + const Ty poly = simd_poly1(_a, _c, _d); + const Ty result = simd_madd(poly, _a, 
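// div_nr, sqrt_nr and rsqrt_nr above all follow the same recipe: start from a
// cheap estimate (simd_rcp_est / simd_rsqrt_est) and apply one Newton-Raphson
// step to recover precision. For 1/sqrt(a) that step is r' = 0.5*r*(3 - a*r*r),
// which is exactly what simd_rsqrt_nr_ni composes from splat, mul and sub.
// A scalar sketch (sqrtf stands in for the hardware estimate):
#include <math.h>

static inline float rsqrt_nr_example(float _a)
{
	float r = 1.0f / sqrtf(_a);         // estimate; ~12 accurate bits on real hardware
	r = 0.5f * r * (3.0f - _a * r * r); // one NR step roughly doubles the accurate bits
	return r;
}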
bbbb); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_poly3(Ty _a, float _b, float _c, float _d, float _e) + { + const Ty bbbb = simd_splat(_b); + const Ty poly = simd_poly2(_a, _c, _d, _e); + const Ty result = simd_madd(poly, _a, bbbb); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_poly4(Ty _a, float _b, float _c, float _d, float _e, float _f) + { + const Ty bbbb = simd_splat(_b); + const Ty poly = simd_poly3(_a, _c, _d, _e, _f); + const Ty result = simd_madd(poly, _a, bbbb); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_poly5(Ty _a, float _b, float _c, float _d, float _e, float _f, float _g) + { + const Ty bbbb = simd_splat(_b); + const Ty poly = simd_poly4(_a, _c, _d, _e, _f, _g); + const Ty result = simd_madd(poly, _a, bbbb); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_logpoly(Ty _a) + { +#if 1 + const Ty result = simd_poly5(_a + , 3.11578814719469302614f, -3.32419399085241980044f + , 2.59883907202499966007f, -1.23152682416275988241f + , 0.318212422185251071475f, -0.0344359067839062357313f + ); +#elif 0 + const Ty result = simd_poly4(_a + , 2.8882704548164776201f, -2.52074962577807006663f + , 1.48116647521213171641f, -0.465725644288844778798f + , 0.0596515482674574969533f + ); +#elif 0 + const Ty result = simd_poly3(_a + , 2.61761038894603480148f, -1.75647175389045657003f + , 0.688243882994381274313f, -0.107254423828329604454f + ); +#else + const Ty result = simd_poly2(_a + , 2.28330284476918490682f, -1.04913055217340124191f + , 0.204446009836232697516f + ); +#endif + + return result; + } + + template + BX_SIMD_INLINE Ty simd_exppoly(Ty _a) + { +#if 1 + const Ty result = simd_poly5(_a + , 9.9999994e-1f, 6.9315308e-1f + , 2.4015361e-1f, 5.5826318e-2f + , 8.9893397e-3f, 1.8775767e-3f + ); +#elif 0 + const Ty result = simd_poly4(_a + , 1.0000026f, 6.9300383e-1f + , 2.4144275e-1f, 5.2011464e-2f + , 1.3534167e-2f + ); +#elif 0 + const Ty result = simd_poly3(_a + , 9.9992520e-1f, 6.9583356e-1f + , 2.2606716e-1f, 7.8024521e-2f + ); +#else + const Ty result = simd_poly2(_a + , 1.0017247f, 6.5763628e-1f + , 3.3718944e-1f + ); +#endif // 0 + + return result; + } + } // namespace simd_internal + + template + BX_SIMD_INLINE Ty simd_log2_ni(Ty _a) + { + const Ty expmask = simd_isplat(0x7f800000); + const Ty mantmask = simd_isplat(0x007fffff); + const Ty one = simd_splat(1.0f); + + const Ty c127 = simd_isplat(127); + const Ty aexp = simd_and(_a, expmask); + const Ty aexpsr = simd_srl(aexp, 23); + const Ty tmp0 = simd_isub(aexpsr, c127); + const Ty exp = simd_itof(tmp0); + + const Ty amask = simd_and(_a, mantmask); + const Ty mant = simd_or(amask, one); + + const Ty poly = simd_logexp_detail::simd_logpoly(mant); + + const Ty mandiff = simd_sub(mant, one); + const Ty result = simd_madd(poly, mandiff, exp); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_exp2_ni(Ty _a) + { + const Ty min = simd_splat( 129.0f); + const Ty max = simd_splat(-126.99999f); + const Ty tmp0 = simd_min(_a, min); + const Ty aaaa = simd_max(tmp0, max); + + const Ty half = simd_splat(0.5f); + const Ty tmp2 = simd_sub(aaaa, half); + const Ty ipart = simd_ftoi(tmp2); + const Ty iround = simd_itof(ipart); + const Ty fpart = simd_sub(aaaa, iround); + + const Ty c127 = simd_isplat(127); + const Ty tmp5 = simd_iadd(ipart, c127); + const Ty expipart = simd_sll(tmp5, 23); + + const Ty expfpart = simd_logexp_detail::simd_exppoly(fpart); + + const Ty result = simd_mul(expipart, expfpart); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_pow_ni(Ty _a, Ty 
_b) + { + const Ty alog2 = simd_log2(_a); + const Ty alog2b = simd_mul(alog2, _b); + const Ty result = simd_exp2(alog2b); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_dot3_ni(Ty _a, Ty _b) + { + const Ty xyzw = simd_mul(_a, _b); + const Ty xxxx = simd_swiz_xxxx(xyzw); + const Ty yyyy = simd_swiz_yyyy(xyzw); + const Ty zzzz = simd_swiz_zzzz(xyzw); + const Ty tmp1 = simd_add(xxxx, yyyy); + const Ty result = simd_add(zzzz, tmp1); + return result; + } + + template + BX_SIMD_INLINE Ty simd_cross3_ni(Ty _a, Ty _b) + { + // a.yzx * b.zxy - a.zxy * b.yzx == (a * b.yzx - a.yzx * b).yzx +#if 0 + const Ty a_yzxw = simd_swiz_yzxw(_a); + const Ty a_zxyw = simd_swiz_zxyw(_a); + const Ty b_zxyw = simd_swiz_zxyw(_b); + const Ty b_yzxw = simd_swiz_yzxw(_b); + const Ty tmp = simd_mul(a_yzxw, b_zxyw); + const Ty result = simd_nmsub(a_zxyw, b_yzxw, tmp); +#else + const Ty a_yzxw = simd_swiz_yzxw(_a); + const Ty b_yzxw = simd_swiz_yzxw(_b); + const Ty tmp0 = simd_mul(_a, b_yzxw); + const Ty tmp1 = simd_nmsub(a_yzxw, _b, tmp0); + const Ty result = simd_swiz_yzxw(tmp1); +#endif + + return result; + } + + template + BX_SIMD_INLINE Ty simd_normalize3_ni(Ty _a) + { + const Ty dot3 = simd_dot3(_a, _a); + const Ty invSqrt = simd_rsqrt(dot3); + const Ty result = simd_mul(_a, invSqrt); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_dot_ni(Ty _a, Ty _b) + { + const Ty xyzw = simd_mul(_a, _b); + const Ty yzwx = simd_swiz_yzwx(xyzw); + const Ty tmp0 = simd_add(xyzw, yzwx); + const Ty zwxy = simd_swiz_zwxy(tmp0); + const Ty result = simd_add(tmp0, zwxy); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_ceil_ni(Ty _a) + { + const Ty tmp0 = simd_ftoi(_a); + const Ty tmp1 = simd_itof(tmp0); + const Ty mask = simd_cmplt(tmp1, _a); + const Ty one = simd_splat(1.0f); + const Ty tmp2 = simd_and(one, mask); + const Ty result = simd_add(tmp1, tmp2); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_floor_ni(Ty _a) + { + const Ty tmp0 = simd_ftoi(_a); + const Ty tmp1 = simd_itof(tmp0); + const Ty mask = simd_cmpgt(tmp1, _a); + const Ty one = simd_splat(1.0f); + const Ty tmp2 = simd_and(one, mask); + const Ty result = simd_sub(tmp1, tmp2); + + return result; + } + + template + BX_SIMD_FORCE_INLINE Ty simd_round_ni(Ty _a) + { + const Ty tmp = simd_ftoi(_a); + const Ty result = simd_itof(tmp); + + return result; + } + + template + BX_SIMD_INLINE bool simd_test_any_ni(Ty _a) + { + const Ty mask = simd_sra(_a, 31); + const Ty zwxy = simd_swiz_zwxy(mask); + const Ty tmp0 = simd_or(mask, zwxy); + const Ty tmp1 = simd_swiz_yyyy(tmp0); + const Ty tmp2 = simd_or(tmp0, tmp1); + int res; + simd_stx(&res, tmp2); + return 0 != res; + } + + template + BX_SIMD_INLINE bool simd_test_all_ni(Ty _a) + { + const Ty bits = simd_sra(_a, 31); + const Ty m1248 = simd_ild(1, 2, 4, 8); + const Ty mask = simd_and(bits, m1248); + const Ty zwxy = simd_swiz_zwxy(mask); + const Ty tmp0 = simd_or(mask, zwxy); + const Ty tmp1 = simd_swiz_yyyy(tmp0); + const Ty tmp2 = simd_or(tmp0, tmp1); + int res; + simd_stx(&res, tmp2); + return 0xf == res; + } + +} // namespace bx + +#endif // BX_SIMD_NI_H_HEADER_GUARD diff --git a/include/bx/float4_swizzle.inl b/include/bx/simd_swizzle.inl similarity index 95% rename from include/bx/float4_swizzle.inl rename to include/bx/simd_swizzle.inl index f9429d6..d0c750c 100644 --- a/include/bx/float4_swizzle.inl +++ b/include/bx/simd_swizzle.inl @@ -1,266 +1,266 @@ -/* - * Copyright 2010-2015 Branimir Karadzic. All rights reserved. 
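// simd_log2_ni above splits the float into its exponent and mantissa with
// integer masks, converts the unbiased exponent to a float, and evaluates a
// small polynomial on the mantissa, scaled by (mant - 1) so the result is
// exact at mant == 1; simd_exp2_ni is the mirror image and simd_pow_ni is just
// exp2(log2(a) * b). A scalar sketch of the log2 decomposition, using the
// smallest coefficient set shipped above and assuming a normal, positive input:
#include <stdint.h>
#include <string.h>

static inline float log2_example(float _a)
{
	uint32_t bits;
	memcpy(&bits, &_a, sizeof(bits) );

	const float exponent = (float)(int32_t)( (bits & 0x7f800000u) >> 23) - 127.0f;

	uint32_t mantBits = (bits & 0x007fffffu) | 0x3f800000u; // mantissa remapped into [1, 2)
	float mant;
	memcpy(&mant, &mantBits, sizeof(mant) );

	const float poly = 2.2833028f + mant*(-1.0491306f + mant*0.2044460f);
	return poly*(mant - 1.0f) + exponent;
}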
- * License: http://www.opensource.org/licenses/BSD-2-Clause - */ - -#ifndef BX_FLOAT4_T_H_HEADER_GUARD -# error "xmacro file, must be included from float4_*.h" -#endif // BX_FLOAT4_T_H_HEADER_GUARD - -// included from float4_t.h -IMPLEMENT_SWIZZLE(x, x, x, x) -IMPLEMENT_SWIZZLE(x, x, x, y) -IMPLEMENT_SWIZZLE(x, x, x, z) -IMPLEMENT_SWIZZLE(x, x, x, w) -IMPLEMENT_SWIZZLE(x, x, y, x) -IMPLEMENT_SWIZZLE(x, x, y, y) -IMPLEMENT_SWIZZLE(x, x, y, z) -IMPLEMENT_SWIZZLE(x, x, y, w) -IMPLEMENT_SWIZZLE(x, x, z, x) -IMPLEMENT_SWIZZLE(x, x, z, y) -IMPLEMENT_SWIZZLE(x, x, z, z) -IMPLEMENT_SWIZZLE(x, x, z, w) -IMPLEMENT_SWIZZLE(x, x, w, x) -IMPLEMENT_SWIZZLE(x, x, w, y) -IMPLEMENT_SWIZZLE(x, x, w, z) -IMPLEMENT_SWIZZLE(x, x, w, w) -IMPLEMENT_SWIZZLE(x, y, x, x) -IMPLEMENT_SWIZZLE(x, y, x, y) -IMPLEMENT_SWIZZLE(x, y, x, z) -IMPLEMENT_SWIZZLE(x, y, x, w) -IMPLEMENT_SWIZZLE(x, y, y, x) -IMPLEMENT_SWIZZLE(x, y, y, y) -IMPLEMENT_SWIZZLE(x, y, y, z) -IMPLEMENT_SWIZZLE(x, y, y, w) -IMPLEMENT_SWIZZLE(x, y, z, x) -IMPLEMENT_SWIZZLE(x, y, z, y) -IMPLEMENT_SWIZZLE(x, y, z, z) -// IMPLEMENT_SWIZZLE(x, y, z, w) -IMPLEMENT_SWIZZLE(x, y, w, x) -IMPLEMENT_SWIZZLE(x, y, w, y) -IMPLEMENT_SWIZZLE(x, y, w, z) -IMPLEMENT_SWIZZLE(x, y, w, w) -IMPLEMENT_SWIZZLE(x, z, x, x) -IMPLEMENT_SWIZZLE(x, z, x, y) -IMPLEMENT_SWIZZLE(x, z, x, z) -IMPLEMENT_SWIZZLE(x, z, x, w) -IMPLEMENT_SWIZZLE(x, z, y, x) -IMPLEMENT_SWIZZLE(x, z, y, y) -IMPLEMENT_SWIZZLE(x, z, y, z) -IMPLEMENT_SWIZZLE(x, z, y, w) -IMPLEMENT_SWIZZLE(x, z, z, x) -IMPLEMENT_SWIZZLE(x, z, z, y) -IMPLEMENT_SWIZZLE(x, z, z, z) -IMPLEMENT_SWIZZLE(x, z, z, w) -IMPLEMENT_SWIZZLE(x, z, w, x) -IMPLEMENT_SWIZZLE(x, z, w, y) -IMPLEMENT_SWIZZLE(x, z, w, z) -IMPLEMENT_SWIZZLE(x, z, w, w) -IMPLEMENT_SWIZZLE(x, w, x, x) -IMPLEMENT_SWIZZLE(x, w, x, y) -IMPLEMENT_SWIZZLE(x, w, x, z) -IMPLEMENT_SWIZZLE(x, w, x, w) -IMPLEMENT_SWIZZLE(x, w, y, x) -IMPLEMENT_SWIZZLE(x, w, y, y) -IMPLEMENT_SWIZZLE(x, w, y, z) -IMPLEMENT_SWIZZLE(x, w, y, w) -IMPLEMENT_SWIZZLE(x, w, z, x) -IMPLEMENT_SWIZZLE(x, w, z, y) -IMPLEMENT_SWIZZLE(x, w, z, z) -IMPLEMENT_SWIZZLE(x, w, z, w) -IMPLEMENT_SWIZZLE(x, w, w, x) -IMPLEMENT_SWIZZLE(x, w, w, y) -IMPLEMENT_SWIZZLE(x, w, w, z) -IMPLEMENT_SWIZZLE(x, w, w, w) -IMPLEMENT_SWIZZLE(y, x, x, x) -IMPLEMENT_SWIZZLE(y, x, x, y) -IMPLEMENT_SWIZZLE(y, x, x, z) -IMPLEMENT_SWIZZLE(y, x, x, w) -IMPLEMENT_SWIZZLE(y, x, y, x) -IMPLEMENT_SWIZZLE(y, x, y, y) -IMPLEMENT_SWIZZLE(y, x, y, z) -IMPLEMENT_SWIZZLE(y, x, y, w) -IMPLEMENT_SWIZZLE(y, x, z, x) -IMPLEMENT_SWIZZLE(y, x, z, y) -IMPLEMENT_SWIZZLE(y, x, z, z) -IMPLEMENT_SWIZZLE(y, x, z, w) -IMPLEMENT_SWIZZLE(y, x, w, x) -IMPLEMENT_SWIZZLE(y, x, w, y) -IMPLEMENT_SWIZZLE(y, x, w, z) -IMPLEMENT_SWIZZLE(y, x, w, w) -IMPLEMENT_SWIZZLE(y, y, x, x) -IMPLEMENT_SWIZZLE(y, y, x, y) -IMPLEMENT_SWIZZLE(y, y, x, z) -IMPLEMENT_SWIZZLE(y, y, x, w) -IMPLEMENT_SWIZZLE(y, y, y, x) -IMPLEMENT_SWIZZLE(y, y, y, y) -IMPLEMENT_SWIZZLE(y, y, y, z) -IMPLEMENT_SWIZZLE(y, y, y, w) -IMPLEMENT_SWIZZLE(y, y, z, x) -IMPLEMENT_SWIZZLE(y, y, z, y) -IMPLEMENT_SWIZZLE(y, y, z, z) -IMPLEMENT_SWIZZLE(y, y, z, w) -IMPLEMENT_SWIZZLE(y, y, w, x) -IMPLEMENT_SWIZZLE(y, y, w, y) -IMPLEMENT_SWIZZLE(y, y, w, z) -IMPLEMENT_SWIZZLE(y, y, w, w) -IMPLEMENT_SWIZZLE(y, z, x, x) -IMPLEMENT_SWIZZLE(y, z, x, y) -IMPLEMENT_SWIZZLE(y, z, x, z) -IMPLEMENT_SWIZZLE(y, z, x, w) -IMPLEMENT_SWIZZLE(y, z, y, x) -IMPLEMENT_SWIZZLE(y, z, y, y) -IMPLEMENT_SWIZZLE(y, z, y, z) -IMPLEMENT_SWIZZLE(y, z, y, w) -IMPLEMENT_SWIZZLE(y, z, z, x) -IMPLEMENT_SWIZZLE(y, z, z, y) -IMPLEMENT_SWIZZLE(y, z, z, z) 
-IMPLEMENT_SWIZZLE(y, z, z, w) -IMPLEMENT_SWIZZLE(y, z, w, x) -IMPLEMENT_SWIZZLE(y, z, w, y) -IMPLEMENT_SWIZZLE(y, z, w, z) -IMPLEMENT_SWIZZLE(y, z, w, w) -IMPLEMENT_SWIZZLE(y, w, x, x) -IMPLEMENT_SWIZZLE(y, w, x, y) -IMPLEMENT_SWIZZLE(y, w, x, z) -IMPLEMENT_SWIZZLE(y, w, x, w) -IMPLEMENT_SWIZZLE(y, w, y, x) -IMPLEMENT_SWIZZLE(y, w, y, y) -IMPLEMENT_SWIZZLE(y, w, y, z) -IMPLEMENT_SWIZZLE(y, w, y, w) -IMPLEMENT_SWIZZLE(y, w, z, x) -IMPLEMENT_SWIZZLE(y, w, z, y) -IMPLEMENT_SWIZZLE(y, w, z, z) -IMPLEMENT_SWIZZLE(y, w, z, w) -IMPLEMENT_SWIZZLE(y, w, w, x) -IMPLEMENT_SWIZZLE(y, w, w, y) -IMPLEMENT_SWIZZLE(y, w, w, z) -IMPLEMENT_SWIZZLE(y, w, w, w) -IMPLEMENT_SWIZZLE(z, x, x, x) -IMPLEMENT_SWIZZLE(z, x, x, y) -IMPLEMENT_SWIZZLE(z, x, x, z) -IMPLEMENT_SWIZZLE(z, x, x, w) -IMPLEMENT_SWIZZLE(z, x, y, x) -IMPLEMENT_SWIZZLE(z, x, y, y) -IMPLEMENT_SWIZZLE(z, x, y, z) -IMPLEMENT_SWIZZLE(z, x, y, w) -IMPLEMENT_SWIZZLE(z, x, z, x) -IMPLEMENT_SWIZZLE(z, x, z, y) -IMPLEMENT_SWIZZLE(z, x, z, z) -IMPLEMENT_SWIZZLE(z, x, z, w) -IMPLEMENT_SWIZZLE(z, x, w, x) -IMPLEMENT_SWIZZLE(z, x, w, y) -IMPLEMENT_SWIZZLE(z, x, w, z) -IMPLEMENT_SWIZZLE(z, x, w, w) -IMPLEMENT_SWIZZLE(z, y, x, x) -IMPLEMENT_SWIZZLE(z, y, x, y) -IMPLEMENT_SWIZZLE(z, y, x, z) -IMPLEMENT_SWIZZLE(z, y, x, w) -IMPLEMENT_SWIZZLE(z, y, y, x) -IMPLEMENT_SWIZZLE(z, y, y, y) -IMPLEMENT_SWIZZLE(z, y, y, z) -IMPLEMENT_SWIZZLE(z, y, y, w) -IMPLEMENT_SWIZZLE(z, y, z, x) -IMPLEMENT_SWIZZLE(z, y, z, y) -IMPLEMENT_SWIZZLE(z, y, z, z) -IMPLEMENT_SWIZZLE(z, y, z, w) -IMPLEMENT_SWIZZLE(z, y, w, x) -IMPLEMENT_SWIZZLE(z, y, w, y) -IMPLEMENT_SWIZZLE(z, y, w, z) -IMPLEMENT_SWIZZLE(z, y, w, w) -IMPLEMENT_SWIZZLE(z, z, x, x) -IMPLEMENT_SWIZZLE(z, z, x, y) -IMPLEMENT_SWIZZLE(z, z, x, z) -IMPLEMENT_SWIZZLE(z, z, x, w) -IMPLEMENT_SWIZZLE(z, z, y, x) -IMPLEMENT_SWIZZLE(z, z, y, y) -IMPLEMENT_SWIZZLE(z, z, y, z) -IMPLEMENT_SWIZZLE(z, z, y, w) -IMPLEMENT_SWIZZLE(z, z, z, x) -IMPLEMENT_SWIZZLE(z, z, z, y) -IMPLEMENT_SWIZZLE(z, z, z, z) -IMPLEMENT_SWIZZLE(z, z, z, w) -IMPLEMENT_SWIZZLE(z, z, w, x) -IMPLEMENT_SWIZZLE(z, z, w, y) -IMPLEMENT_SWIZZLE(z, z, w, z) -IMPLEMENT_SWIZZLE(z, z, w, w) -IMPLEMENT_SWIZZLE(z, w, x, x) -IMPLEMENT_SWIZZLE(z, w, x, y) -IMPLEMENT_SWIZZLE(z, w, x, z) -IMPLEMENT_SWIZZLE(z, w, x, w) -IMPLEMENT_SWIZZLE(z, w, y, x) -IMPLEMENT_SWIZZLE(z, w, y, y) -IMPLEMENT_SWIZZLE(z, w, y, z) -IMPLEMENT_SWIZZLE(z, w, y, w) -IMPLEMENT_SWIZZLE(z, w, z, x) -IMPLEMENT_SWIZZLE(z, w, z, y) -IMPLEMENT_SWIZZLE(z, w, z, z) -IMPLEMENT_SWIZZLE(z, w, z, w) -IMPLEMENT_SWIZZLE(z, w, w, x) -IMPLEMENT_SWIZZLE(z, w, w, y) -IMPLEMENT_SWIZZLE(z, w, w, z) -IMPLEMENT_SWIZZLE(z, w, w, w) -IMPLEMENT_SWIZZLE(w, x, x, x) -IMPLEMENT_SWIZZLE(w, x, x, y) -IMPLEMENT_SWIZZLE(w, x, x, z) -IMPLEMENT_SWIZZLE(w, x, x, w) -IMPLEMENT_SWIZZLE(w, x, y, x) -IMPLEMENT_SWIZZLE(w, x, y, y) -IMPLEMENT_SWIZZLE(w, x, y, z) -IMPLEMENT_SWIZZLE(w, x, y, w) -IMPLEMENT_SWIZZLE(w, x, z, x) -IMPLEMENT_SWIZZLE(w, x, z, y) -IMPLEMENT_SWIZZLE(w, x, z, z) -IMPLEMENT_SWIZZLE(w, x, z, w) -IMPLEMENT_SWIZZLE(w, x, w, x) -IMPLEMENT_SWIZZLE(w, x, w, y) -IMPLEMENT_SWIZZLE(w, x, w, z) -IMPLEMENT_SWIZZLE(w, x, w, w) -IMPLEMENT_SWIZZLE(w, y, x, x) -IMPLEMENT_SWIZZLE(w, y, x, y) -IMPLEMENT_SWIZZLE(w, y, x, z) -IMPLEMENT_SWIZZLE(w, y, x, w) -IMPLEMENT_SWIZZLE(w, y, y, x) -IMPLEMENT_SWIZZLE(w, y, y, y) -IMPLEMENT_SWIZZLE(w, y, y, z) -IMPLEMENT_SWIZZLE(w, y, y, w) -IMPLEMENT_SWIZZLE(w, y, z, x) -IMPLEMENT_SWIZZLE(w, y, z, y) -IMPLEMENT_SWIZZLE(w, y, z, z) -IMPLEMENT_SWIZZLE(w, y, z, w) -IMPLEMENT_SWIZZLE(w, y, w, x) 
-IMPLEMENT_SWIZZLE(w, y, w, y) -IMPLEMENT_SWIZZLE(w, y, w, z) -IMPLEMENT_SWIZZLE(w, y, w, w) -IMPLEMENT_SWIZZLE(w, z, x, x) -IMPLEMENT_SWIZZLE(w, z, x, y) -IMPLEMENT_SWIZZLE(w, z, x, z) -IMPLEMENT_SWIZZLE(w, z, x, w) -IMPLEMENT_SWIZZLE(w, z, y, x) -IMPLEMENT_SWIZZLE(w, z, y, y) -IMPLEMENT_SWIZZLE(w, z, y, z) -IMPLEMENT_SWIZZLE(w, z, y, w) -IMPLEMENT_SWIZZLE(w, z, z, x) -IMPLEMENT_SWIZZLE(w, z, z, y) -IMPLEMENT_SWIZZLE(w, z, z, z) -IMPLEMENT_SWIZZLE(w, z, z, w) -IMPLEMENT_SWIZZLE(w, z, w, x) -IMPLEMENT_SWIZZLE(w, z, w, y) -IMPLEMENT_SWIZZLE(w, z, w, z) -IMPLEMENT_SWIZZLE(w, z, w, w) -IMPLEMENT_SWIZZLE(w, w, x, x) -IMPLEMENT_SWIZZLE(w, w, x, y) -IMPLEMENT_SWIZZLE(w, w, x, z) -IMPLEMENT_SWIZZLE(w, w, x, w) -IMPLEMENT_SWIZZLE(w, w, y, x) -IMPLEMENT_SWIZZLE(w, w, y, y) -IMPLEMENT_SWIZZLE(w, w, y, z) -IMPLEMENT_SWIZZLE(w, w, y, w) -IMPLEMENT_SWIZZLE(w, w, z, x) -IMPLEMENT_SWIZZLE(w, w, z, y) -IMPLEMENT_SWIZZLE(w, w, z, z) -IMPLEMENT_SWIZZLE(w, w, z, w) -IMPLEMENT_SWIZZLE(w, w, w, x) -IMPLEMENT_SWIZZLE(w, w, w, y) -IMPLEMENT_SWIZZLE(w, w, w, z) -IMPLEMENT_SWIZZLE(w, w, w, w) +/* + * Copyright 2010-2015 Branimir Karadzic. All rights reserved. + * License: http://www.opensource.org/licenses/BSD-2-Clause + */ + +#ifndef BX_SIMD_T_H_HEADER_GUARD +# error "xmacro file, must be included from simd_*.h" +#endif // BX_FLOAT4_T_H_HEADER_GUARD + +// included from float4_t.h +IMPLEMENT_SWIZZLE(x, x, x, x) +IMPLEMENT_SWIZZLE(x, x, x, y) +IMPLEMENT_SWIZZLE(x, x, x, z) +IMPLEMENT_SWIZZLE(x, x, x, w) +IMPLEMENT_SWIZZLE(x, x, y, x) +IMPLEMENT_SWIZZLE(x, x, y, y) +IMPLEMENT_SWIZZLE(x, x, y, z) +IMPLEMENT_SWIZZLE(x, x, y, w) +IMPLEMENT_SWIZZLE(x, x, z, x) +IMPLEMENT_SWIZZLE(x, x, z, y) +IMPLEMENT_SWIZZLE(x, x, z, z) +IMPLEMENT_SWIZZLE(x, x, z, w) +IMPLEMENT_SWIZZLE(x, x, w, x) +IMPLEMENT_SWIZZLE(x, x, w, y) +IMPLEMENT_SWIZZLE(x, x, w, z) +IMPLEMENT_SWIZZLE(x, x, w, w) +IMPLEMENT_SWIZZLE(x, y, x, x) +IMPLEMENT_SWIZZLE(x, y, x, y) +IMPLEMENT_SWIZZLE(x, y, x, z) +IMPLEMENT_SWIZZLE(x, y, x, w) +IMPLEMENT_SWIZZLE(x, y, y, x) +IMPLEMENT_SWIZZLE(x, y, y, y) +IMPLEMENT_SWIZZLE(x, y, y, z) +IMPLEMENT_SWIZZLE(x, y, y, w) +IMPLEMENT_SWIZZLE(x, y, z, x) +IMPLEMENT_SWIZZLE(x, y, z, y) +IMPLEMENT_SWIZZLE(x, y, z, z) +// IMPLEMENT_SWIZZLE(x, y, z, w) +IMPLEMENT_SWIZZLE(x, y, w, x) +IMPLEMENT_SWIZZLE(x, y, w, y) +IMPLEMENT_SWIZZLE(x, y, w, z) +IMPLEMENT_SWIZZLE(x, y, w, w) +IMPLEMENT_SWIZZLE(x, z, x, x) +IMPLEMENT_SWIZZLE(x, z, x, y) +IMPLEMENT_SWIZZLE(x, z, x, z) +IMPLEMENT_SWIZZLE(x, z, x, w) +IMPLEMENT_SWIZZLE(x, z, y, x) +IMPLEMENT_SWIZZLE(x, z, y, y) +IMPLEMENT_SWIZZLE(x, z, y, z) +IMPLEMENT_SWIZZLE(x, z, y, w) +IMPLEMENT_SWIZZLE(x, z, z, x) +IMPLEMENT_SWIZZLE(x, z, z, y) +IMPLEMENT_SWIZZLE(x, z, z, z) +IMPLEMENT_SWIZZLE(x, z, z, w) +IMPLEMENT_SWIZZLE(x, z, w, x) +IMPLEMENT_SWIZZLE(x, z, w, y) +IMPLEMENT_SWIZZLE(x, z, w, z) +IMPLEMENT_SWIZZLE(x, z, w, w) +IMPLEMENT_SWIZZLE(x, w, x, x) +IMPLEMENT_SWIZZLE(x, w, x, y) +IMPLEMENT_SWIZZLE(x, w, x, z) +IMPLEMENT_SWIZZLE(x, w, x, w) +IMPLEMENT_SWIZZLE(x, w, y, x) +IMPLEMENT_SWIZZLE(x, w, y, y) +IMPLEMENT_SWIZZLE(x, w, y, z) +IMPLEMENT_SWIZZLE(x, w, y, w) +IMPLEMENT_SWIZZLE(x, w, z, x) +IMPLEMENT_SWIZZLE(x, w, z, y) +IMPLEMENT_SWIZZLE(x, w, z, z) +IMPLEMENT_SWIZZLE(x, w, z, w) +IMPLEMENT_SWIZZLE(x, w, w, x) +IMPLEMENT_SWIZZLE(x, w, w, y) +IMPLEMENT_SWIZZLE(x, w, w, z) +IMPLEMENT_SWIZZLE(x, w, w, w) +IMPLEMENT_SWIZZLE(y, x, x, x) +IMPLEMENT_SWIZZLE(y, x, x, y) +IMPLEMENT_SWIZZLE(y, x, x, z) +IMPLEMENT_SWIZZLE(y, x, x, w) +IMPLEMENT_SWIZZLE(y, x, y, x) +IMPLEMENT_SWIZZLE(y, x, y, y) 
+IMPLEMENT_SWIZZLE(y, x, y, z) +IMPLEMENT_SWIZZLE(y, x, y, w) +IMPLEMENT_SWIZZLE(y, x, z, x) +IMPLEMENT_SWIZZLE(y, x, z, y) +IMPLEMENT_SWIZZLE(y, x, z, z) +IMPLEMENT_SWIZZLE(y, x, z, w) +IMPLEMENT_SWIZZLE(y, x, w, x) +IMPLEMENT_SWIZZLE(y, x, w, y) +IMPLEMENT_SWIZZLE(y, x, w, z) +IMPLEMENT_SWIZZLE(y, x, w, w) +IMPLEMENT_SWIZZLE(y, y, x, x) +IMPLEMENT_SWIZZLE(y, y, x, y) +IMPLEMENT_SWIZZLE(y, y, x, z) +IMPLEMENT_SWIZZLE(y, y, x, w) +IMPLEMENT_SWIZZLE(y, y, y, x) +IMPLEMENT_SWIZZLE(y, y, y, y) +IMPLEMENT_SWIZZLE(y, y, y, z) +IMPLEMENT_SWIZZLE(y, y, y, w) +IMPLEMENT_SWIZZLE(y, y, z, x) +IMPLEMENT_SWIZZLE(y, y, z, y) +IMPLEMENT_SWIZZLE(y, y, z, z) +IMPLEMENT_SWIZZLE(y, y, z, w) +IMPLEMENT_SWIZZLE(y, y, w, x) +IMPLEMENT_SWIZZLE(y, y, w, y) +IMPLEMENT_SWIZZLE(y, y, w, z) +IMPLEMENT_SWIZZLE(y, y, w, w) +IMPLEMENT_SWIZZLE(y, z, x, x) +IMPLEMENT_SWIZZLE(y, z, x, y) +IMPLEMENT_SWIZZLE(y, z, x, z) +IMPLEMENT_SWIZZLE(y, z, x, w) +IMPLEMENT_SWIZZLE(y, z, y, x) +IMPLEMENT_SWIZZLE(y, z, y, y) +IMPLEMENT_SWIZZLE(y, z, y, z) +IMPLEMENT_SWIZZLE(y, z, y, w) +IMPLEMENT_SWIZZLE(y, z, z, x) +IMPLEMENT_SWIZZLE(y, z, z, y) +IMPLEMENT_SWIZZLE(y, z, z, z) +IMPLEMENT_SWIZZLE(y, z, z, w) +IMPLEMENT_SWIZZLE(y, z, w, x) +IMPLEMENT_SWIZZLE(y, z, w, y) +IMPLEMENT_SWIZZLE(y, z, w, z) +IMPLEMENT_SWIZZLE(y, z, w, w) +IMPLEMENT_SWIZZLE(y, w, x, x) +IMPLEMENT_SWIZZLE(y, w, x, y) +IMPLEMENT_SWIZZLE(y, w, x, z) +IMPLEMENT_SWIZZLE(y, w, x, w) +IMPLEMENT_SWIZZLE(y, w, y, x) +IMPLEMENT_SWIZZLE(y, w, y, y) +IMPLEMENT_SWIZZLE(y, w, y, z) +IMPLEMENT_SWIZZLE(y, w, y, w) +IMPLEMENT_SWIZZLE(y, w, z, x) +IMPLEMENT_SWIZZLE(y, w, z, y) +IMPLEMENT_SWIZZLE(y, w, z, z) +IMPLEMENT_SWIZZLE(y, w, z, w) +IMPLEMENT_SWIZZLE(y, w, w, x) +IMPLEMENT_SWIZZLE(y, w, w, y) +IMPLEMENT_SWIZZLE(y, w, w, z) +IMPLEMENT_SWIZZLE(y, w, w, w) +IMPLEMENT_SWIZZLE(z, x, x, x) +IMPLEMENT_SWIZZLE(z, x, x, y) +IMPLEMENT_SWIZZLE(z, x, x, z) +IMPLEMENT_SWIZZLE(z, x, x, w) +IMPLEMENT_SWIZZLE(z, x, y, x) +IMPLEMENT_SWIZZLE(z, x, y, y) +IMPLEMENT_SWIZZLE(z, x, y, z) +IMPLEMENT_SWIZZLE(z, x, y, w) +IMPLEMENT_SWIZZLE(z, x, z, x) +IMPLEMENT_SWIZZLE(z, x, z, y) +IMPLEMENT_SWIZZLE(z, x, z, z) +IMPLEMENT_SWIZZLE(z, x, z, w) +IMPLEMENT_SWIZZLE(z, x, w, x) +IMPLEMENT_SWIZZLE(z, x, w, y) +IMPLEMENT_SWIZZLE(z, x, w, z) +IMPLEMENT_SWIZZLE(z, x, w, w) +IMPLEMENT_SWIZZLE(z, y, x, x) +IMPLEMENT_SWIZZLE(z, y, x, y) +IMPLEMENT_SWIZZLE(z, y, x, z) +IMPLEMENT_SWIZZLE(z, y, x, w) +IMPLEMENT_SWIZZLE(z, y, y, x) +IMPLEMENT_SWIZZLE(z, y, y, y) +IMPLEMENT_SWIZZLE(z, y, y, z) +IMPLEMENT_SWIZZLE(z, y, y, w) +IMPLEMENT_SWIZZLE(z, y, z, x) +IMPLEMENT_SWIZZLE(z, y, z, y) +IMPLEMENT_SWIZZLE(z, y, z, z) +IMPLEMENT_SWIZZLE(z, y, z, w) +IMPLEMENT_SWIZZLE(z, y, w, x) +IMPLEMENT_SWIZZLE(z, y, w, y) +IMPLEMENT_SWIZZLE(z, y, w, z) +IMPLEMENT_SWIZZLE(z, y, w, w) +IMPLEMENT_SWIZZLE(z, z, x, x) +IMPLEMENT_SWIZZLE(z, z, x, y) +IMPLEMENT_SWIZZLE(z, z, x, z) +IMPLEMENT_SWIZZLE(z, z, x, w) +IMPLEMENT_SWIZZLE(z, z, y, x) +IMPLEMENT_SWIZZLE(z, z, y, y) +IMPLEMENT_SWIZZLE(z, z, y, z) +IMPLEMENT_SWIZZLE(z, z, y, w) +IMPLEMENT_SWIZZLE(z, z, z, x) +IMPLEMENT_SWIZZLE(z, z, z, y) +IMPLEMENT_SWIZZLE(z, z, z, z) +IMPLEMENT_SWIZZLE(z, z, z, w) +IMPLEMENT_SWIZZLE(z, z, w, x) +IMPLEMENT_SWIZZLE(z, z, w, y) +IMPLEMENT_SWIZZLE(z, z, w, z) +IMPLEMENT_SWIZZLE(z, z, w, w) +IMPLEMENT_SWIZZLE(z, w, x, x) +IMPLEMENT_SWIZZLE(z, w, x, y) +IMPLEMENT_SWIZZLE(z, w, x, z) +IMPLEMENT_SWIZZLE(z, w, x, w) +IMPLEMENT_SWIZZLE(z, w, y, x) +IMPLEMENT_SWIZZLE(z, w, y, y) +IMPLEMENT_SWIZZLE(z, w, y, z) +IMPLEMENT_SWIZZLE(z, w, y, w) 
+IMPLEMENT_SWIZZLE(z, w, z, x) +IMPLEMENT_SWIZZLE(z, w, z, y) +IMPLEMENT_SWIZZLE(z, w, z, z) +IMPLEMENT_SWIZZLE(z, w, z, w) +IMPLEMENT_SWIZZLE(z, w, w, x) +IMPLEMENT_SWIZZLE(z, w, w, y) +IMPLEMENT_SWIZZLE(z, w, w, z) +IMPLEMENT_SWIZZLE(z, w, w, w) +IMPLEMENT_SWIZZLE(w, x, x, x) +IMPLEMENT_SWIZZLE(w, x, x, y) +IMPLEMENT_SWIZZLE(w, x, x, z) +IMPLEMENT_SWIZZLE(w, x, x, w) +IMPLEMENT_SWIZZLE(w, x, y, x) +IMPLEMENT_SWIZZLE(w, x, y, y) +IMPLEMENT_SWIZZLE(w, x, y, z) +IMPLEMENT_SWIZZLE(w, x, y, w) +IMPLEMENT_SWIZZLE(w, x, z, x) +IMPLEMENT_SWIZZLE(w, x, z, y) +IMPLEMENT_SWIZZLE(w, x, z, z) +IMPLEMENT_SWIZZLE(w, x, z, w) +IMPLEMENT_SWIZZLE(w, x, w, x) +IMPLEMENT_SWIZZLE(w, x, w, y) +IMPLEMENT_SWIZZLE(w, x, w, z) +IMPLEMENT_SWIZZLE(w, x, w, w) +IMPLEMENT_SWIZZLE(w, y, x, x) +IMPLEMENT_SWIZZLE(w, y, x, y) +IMPLEMENT_SWIZZLE(w, y, x, z) +IMPLEMENT_SWIZZLE(w, y, x, w) +IMPLEMENT_SWIZZLE(w, y, y, x) +IMPLEMENT_SWIZZLE(w, y, y, y) +IMPLEMENT_SWIZZLE(w, y, y, z) +IMPLEMENT_SWIZZLE(w, y, y, w) +IMPLEMENT_SWIZZLE(w, y, z, x) +IMPLEMENT_SWIZZLE(w, y, z, y) +IMPLEMENT_SWIZZLE(w, y, z, z) +IMPLEMENT_SWIZZLE(w, y, z, w) +IMPLEMENT_SWIZZLE(w, y, w, x) +IMPLEMENT_SWIZZLE(w, y, w, y) +IMPLEMENT_SWIZZLE(w, y, w, z) +IMPLEMENT_SWIZZLE(w, y, w, w) +IMPLEMENT_SWIZZLE(w, z, x, x) +IMPLEMENT_SWIZZLE(w, z, x, y) +IMPLEMENT_SWIZZLE(w, z, x, z) +IMPLEMENT_SWIZZLE(w, z, x, w) +IMPLEMENT_SWIZZLE(w, z, y, x) +IMPLEMENT_SWIZZLE(w, z, y, y) +IMPLEMENT_SWIZZLE(w, z, y, z) +IMPLEMENT_SWIZZLE(w, z, y, w) +IMPLEMENT_SWIZZLE(w, z, z, x) +IMPLEMENT_SWIZZLE(w, z, z, y) +IMPLEMENT_SWIZZLE(w, z, z, z) +IMPLEMENT_SWIZZLE(w, z, z, w) +IMPLEMENT_SWIZZLE(w, z, w, x) +IMPLEMENT_SWIZZLE(w, z, w, y) +IMPLEMENT_SWIZZLE(w, z, w, z) +IMPLEMENT_SWIZZLE(w, z, w, w) +IMPLEMENT_SWIZZLE(w, w, x, x) +IMPLEMENT_SWIZZLE(w, w, x, y) +IMPLEMENT_SWIZZLE(w, w, x, z) +IMPLEMENT_SWIZZLE(w, w, x, w) +IMPLEMENT_SWIZZLE(w, w, y, x) +IMPLEMENT_SWIZZLE(w, w, y, y) +IMPLEMENT_SWIZZLE(w, w, y, z) +IMPLEMENT_SWIZZLE(w, w, y, w) +IMPLEMENT_SWIZZLE(w, w, z, x) +IMPLEMENT_SWIZZLE(w, w, z, y) +IMPLEMENT_SWIZZLE(w, w, z, z) +IMPLEMENT_SWIZZLE(w, w, z, w) +IMPLEMENT_SWIZZLE(w, w, w, x) +IMPLEMENT_SWIZZLE(w, w, w, y) +IMPLEMENT_SWIZZLE(w, w, w, z) +IMPLEMENT_SWIZZLE(w, w, w, w) diff --git a/include/bx/simd_t.h b/include/bx/simd_t.h new file mode 100644 index 0000000..e9fa95d --- /dev/null +++ b/include/bx/simd_t.h @@ -0,0 +1,436 @@ +/* + * Copyright 2010-2016 Branimir Karadzic. All rights reserved. 
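// The swizzle list above is an xmacro file: it never defines IMPLEMENT_SWIZZLE
// itself, it only expands it once per lane permutation. simd_t.h (below)
// defines the macro to emit template declarations, and each backend redefines
// it to emit its intrinsic-specific specialization before including the same
// list again. A reduced model of the pattern, with hypothetical names:

#define SCALAR_OP_LIST \
	IMPLEMENT_OP(add, +) \
	IMPLEMENT_OP(sub, -)

// First expansion: declarations only.
#define IMPLEMENT_OP(_name, _op) int scalar_##_name(int _a, int _b);
SCALAR_OP_LIST
#undef IMPLEMENT_OP

// Second expansion: definitions.
#define IMPLEMENT_OP(_name, _op) int scalar_##_name(int _a, int _b) { return _a _op _b; }
SCALAR_OP_LIST
#undef IMPLEMENT_OP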
+ * License: https://github.com/bkaradzic/bx#license-bsd-2-clause + */ + +#ifndef BX_SIMD_T_H_HEADER_GUARD +#define BX_SIMD_T_H_HEADER_GUARD + +#include "bx.h" + +#define BX_SIMD_FORCE_INLINE BX_FORCE_INLINE +#define BX_SIMD_INLINE inline + +#define BX_SIMD_SSE 0 +#define BX_SIMD_AVX 0 +#define BX_SIMD_NEON 0 +#define BX_SIMD_LANGEXT 0 + +#if defined(__SSE2__) || (BX_COMPILER_MSVC && (BX_ARCH_64BIT || _M_IX86_FP >= 2) ) +# include // __m128i +# if defined(__SSE4_1__) +# include +# endif // defined(__SSE4_1__) +# include // __m128 +# undef BX_SIMD_SSE +# define BX_SIMD_SSE 1 + +namespace bx +{ + typedef __m128 simd128_sse_t; + +} // namespace bx + +#elif defined(__ARM_NEON__) && !BX_COMPILER_CLANG +# include +# undef BX_SIMD_NEON +# define BX_SIMD_NEON 1 + +namespace bx +{ + typedef float32x4_t simd128_neon_t; + +} // namespace bx + +#elif BX_COMPILER_CLANG \ + && !BX_PLATFORM_EMSCRIPTEN \ + && !BX_PLATFORM_IOS \ + && BX_CLANG_HAS_EXTENSION(attribute_ext_vector_type) +# include +# undef BX_SIMD_LANGEXT +# define BX_SIMD_LANGEXT 1 + +namespace bx +{ + union simd128_langext_t + { + float __attribute__((vector_size(16))) vf; + int32_t __attribute__((vector_size(16))) vi; + uint32_t __attribute__((vector_size(16))) vu; + float fxyzw[4]; + int32_t ixyzw[4]; + uint32_t uxyzw[4]; + + }; +} // namespace bx +#endif // + +namespace bx +{ + union simd128_ref_t + { + float fxyzw[4]; + int32_t ixyzw[4]; + uint32_t uxyzw[4]; + + }; +} // namespace bx + +namespace bx +{ +#define ELEMx 0 +#define ELEMy 1 +#define ELEMz 2 +#define ELEMw 3 +#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ + template \ + BX_SIMD_FORCE_INLINE Ty simd_swiz_##_x##_y##_z##_w(Ty _a); +#include "simd_swizzle.inl" + +#undef IMPLEMENT_SWIZZLE +#undef ELEMw +#undef ELEMz +#undef ELEMy +#undef ELEMx + +#define IMPLEMENT_TEST(_xyzw) \ + template \ + BX_SIMD_FORCE_INLINE bool simd_test_any_##_xyzw(Ty _test); \ + \ + template \ + BX_SIMD_FORCE_INLINE bool simd_test_all_##_xyzw(Ty _test) + +IMPLEMENT_TEST(x ); +IMPLEMENT_TEST(y ); +IMPLEMENT_TEST(xy ); +IMPLEMENT_TEST(z ); +IMPLEMENT_TEST(xz ); +IMPLEMENT_TEST(yz ); +IMPLEMENT_TEST(xyz ); +IMPLEMENT_TEST(w ); +IMPLEMENT_TEST(xw ); +IMPLEMENT_TEST(yw ); +IMPLEMENT_TEST(xyw ); +IMPLEMENT_TEST(zw ); +IMPLEMENT_TEST(xzw ); +IMPLEMENT_TEST(yzw ); +IMPLEMENT_TEST(xyzw); +#undef IMPLEMENT_TEST + + template + BX_SIMD_FORCE_INLINE Ty simd_shuf_xyAB(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_shuf_ABxy(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_shuf_CDzw(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_shuf_zwCD(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_shuf_xAyB(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_shuf_yBxA(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_shuf_zCwD(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_shuf_CzDw(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE float simd_x(Ty _a); + + template + BX_SIMD_FORCE_INLINE float simd_y(Ty _a); + + template + BX_SIMD_FORCE_INLINE float simd_z(Ty _a); + + template + BX_SIMD_FORCE_INLINE float simd_w(Ty _a); + + template + BX_SIMD_FORCE_INLINE Ty simd_ld(const void* _ptr); + + template + BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, Ty _a); + + template + BX_SIMD_FORCE_INLINE void simd_stx(void* _ptr, Ty _a); + + template + BX_SIMD_FORCE_INLINE void simd_stream(void* _ptr, Ty _a); + + template + BX_SIMD_FORCE_INLINE Ty simd_ld(float _x, float _y, float _z, float _w); + + template + BX_SIMD_FORCE_INLINE Ty simd_ild(uint32_t _x, uint32_t _y, uint32_t 
_z, uint32_t _w); + + template + BX_SIMD_FORCE_INLINE Ty simd_splat(const void* _ptr); + + template + BX_SIMD_FORCE_INLINE Ty simd_splat(float _a); + + template + BX_SIMD_FORCE_INLINE Ty simd_isplat(uint32_t _a); + + template + BX_SIMD_FORCE_INLINE Ty simd_zero(); + + template + BX_SIMD_FORCE_INLINE Ty simd_itof(Ty _a); + + template + BX_SIMD_FORCE_INLINE Ty simd_ftoi(Ty _a); + + template + BX_SIMD_FORCE_INLINE Ty simd_round(Ty _a); + + template + BX_SIMD_FORCE_INLINE Ty simd_add(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_sub(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_mul(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_div(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_rcp_est(Ty _a); + + template + BX_SIMD_FORCE_INLINE Ty simd_sqrt(Ty _a); + + template + BX_SIMD_FORCE_INLINE Ty simd_rsqrt_est(Ty _a); + + template + BX_SIMD_FORCE_INLINE Ty simd_dot3(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_dot(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_cmpeq(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_cmplt(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_cmple(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_cmpgt(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_cmpge(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_min(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_max(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_and(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_andc(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_or(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_xor(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_sll(Ty _a, int _count); + + template + BX_SIMD_FORCE_INLINE Ty simd_srl(Ty _a, int _count); + + template + BX_SIMD_FORCE_INLINE Ty simd_sra(Ty _a, int _count); + + template + BX_SIMD_FORCE_INLINE Ty simd_icmpeq(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_icmplt(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_icmpgt(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_imin(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_imax(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_iadd(Ty _a, Ty _b); + + template + BX_SIMD_FORCE_INLINE Ty simd_isub(Ty _a, Ty _b); + + template + BX_SIMD_INLINE Ty simd_shuf_xAzC(Ty _a, Ty _b); + + template + BX_SIMD_INLINE Ty simd_shuf_yBwD(Ty _a, Ty _b); + + template + BX_SIMD_INLINE Ty simd_rcp(Ty _a); + + template + BX_SIMD_INLINE Ty simd_orx(Ty _a); + + template + BX_SIMD_INLINE Ty simd_orc(Ty _a, Ty _b); + + template + BX_SIMD_INLINE Ty simd_neg(Ty _a); + + template + BX_SIMD_INLINE Ty simd_madd(Ty _a, Ty _b, Ty _c); + + template + BX_SIMD_INLINE Ty simd_nmsub(Ty _a, Ty _b, Ty _c); + + template + BX_SIMD_INLINE Ty simd_div_nr(Ty _a, Ty _b); + + template + BX_SIMD_INLINE Ty simd_selb(Ty _mask, Ty _a, Ty _b); + + template + BX_SIMD_INLINE Ty simd_sels(Ty _test, Ty _a, Ty _b); + + template + BX_SIMD_INLINE Ty simd_not(Ty _a); + + template + BX_SIMD_INLINE Ty simd_abs(Ty _a); + + template + BX_SIMD_INLINE Ty simd_clamp(Ty _a, Ty _min, Ty _max); + + template + BX_SIMD_INLINE Ty simd_lerp(Ty _a, Ty _b, Ty _s); + + template + BX_SIMD_INLINE Ty simd_rsqrt(Ty _a); + + template + BX_SIMD_INLINE Ty simd_rsqrt_nr(Ty _a); + + template + BX_SIMD_INLINE Ty simd_rsqrt_carmack(Ty _a); + + template + BX_SIMD_INLINE Ty simd_sqrt_nr(Ty _a); + + template + BX_SIMD_INLINE Ty simd_log2(Ty _a); + + template + BX_SIMD_INLINE 
Ty simd_exp2(Ty _a); + + template + BX_SIMD_INLINE Ty simd_pow(Ty _a, Ty _b); + + template + BX_SIMD_INLINE Ty simd_cross3(Ty _a, Ty _b); + + template + BX_SIMD_INLINE Ty simd_normalize3(Ty _a); + + template + BX_SIMD_INLINE Ty simd_ceil(Ty _a); + + template + BX_SIMD_INLINE Ty simd_floor(Ty _a); + +} // namespace bx + +#if BX_SIMD_SSE +# include "simd128_sse.inl" +#endif // BX_SIMD_SSE + +#if BX_SIMD_NEON +# include "simd128_neon.inl" +#endif // BX_SIMD_NEON + +#if BX_SIMD_LANGEXT +# include "simd128_langext.inl" +#endif // BX_SIMD_LANGEXT + +#if !( BX_SIMD_SSE \ + || BX_SIMD_AVX \ + || BX_SIMD_NEON \ + || BX_SIMD_LANGEXT \ + ) +# ifndef BX_SIMD_WARN_REFERENCE_IMPL +# define BX_SIMD_WARN_REFERENCE_IMPL 0 +# endif // BX_SIMD_WARN_REFERENCE_IMPL + +# if BX_SIMD_WARN_REFERENCE_IMPL +# pragma message("************************************\nUsing SIMD reference implementation!\n************************************") +# endif // BX_SIMD_WARN_REFERENCE_IMPL + +namespace bx +{ + typedef simd128_ref_t simd128_t; +} +#endif // + +#include "simd128_ref.inl" + +namespace bx +{ + BX_SIMD_FORCE_INLINE simd128_t simd_zero() + { + return simd_zero(); + } + + BX_SIMD_FORCE_INLINE simd128_t simd_ld(const void* _ptr) + { + return simd_ld(_ptr); + } + + BX_SIMD_FORCE_INLINE simd128_t simd_ld(float _x, float _y, float _z, float _w) + { + return simd_ld(_x, _y, _z, _w); + } + + BX_SIMD_FORCE_INLINE simd128_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) + { + return simd_ild(_x, _y, _z, _w); + } + + BX_SIMD_FORCE_INLINE simd128_t simd_splat(const void* _ptr) + { + return simd_splat(_ptr); + } + + BX_SIMD_FORCE_INLINE simd128_t simd_splat(float _a) + { + return simd_splat(_a); + } + + BX_SIMD_FORCE_INLINE simd128_t simd_isplat(uint32_t _a) + { + return simd_isplat(_a); + } +} + +#endif // BX_SIMD_T_H_HEADER_GUARD diff --git a/tests/float4_t.cpp b/tests/float4_t.cpp deleted file mode 100644 index 3bdfb19..0000000 --- a/tests/float4_t.cpp +++ /dev/null @@ -1,309 +0,0 @@ -/* - * Copyright 2010-2016 Branimir Karadzic. All rights reserved. 
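// With the dispatch above, user code sees only bx::simd128_t and the simd_*
// functions; whether SSE, NEON, the clang vector extension or the reference
// implementation backs them is decided at compile time. A short usage sketch
// based on the declarations in this header (a 16-byte-aligned pointer is
// assumed, since simd_ld/simd_st map to aligned loads/stores on SSE):
#include <bx/simd_t.h>

void scale_and_bias(float* _inOut4 /* 4 floats, 16-byte aligned */, float _scale, float _bias)
{
	using namespace bx;
	const simd128_t val    = simd_ld(_inOut4);
	const simd128_t scale  = simd_splat(_scale);
	const simd128_t bias   = simd_splat(_bias);
	const simd128_t result = simd_madd(val, scale, bias); // val*scale + bias per lane
	simd_st(_inOut4, result);
}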
- * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
- */
-
-#include "test.h"
-#include
-#include
-#include
-
-using namespace bx;
-
-union float4_cast
-{
-	bx::float4_t f4;
-	float f[4];
-	uint32_t ui[4];
-	int32_t i[4];
-	char c[16];
-};
-
-void float4_check_bool(const char* _str, bool _a, bool _0)
-{
-	DBG("%s %d == %d"
-		, _str
-		, _a
-		, _0
-		);
-
-	CHECK_EQUAL(_a, _0);
-}
-
-void float4_check_int32(const char* _str, bx::float4_t _a, int32_t _0, int32_t _1, int32_t _2, int32_t _3)
-{
-	float4_cast c; c.f4 = _a;
-	DBG("%s (%d, %d, %d, %d) == (%d, %d, %d, %d)"
-		, _str
-		, c.i[0], c.i[1], c.i[2], c.i[3]
-		, _0, _1, _2, _3
-		);
-
-	CHECK_EQUAL(c.i[0], _0);
-	CHECK_EQUAL(c.i[1], _1);
-	CHECK_EQUAL(c.i[2], _2);
-	CHECK_EQUAL(c.i[3], _3);
-}
-
-void float4_check_uint32(const char* _str, bx::float4_t _a, uint32_t _0, uint32_t _1, uint32_t _2, uint32_t _3)
-{
-	float4_cast c; c.f4 = _a;
-
-	DBG("%s (0x%08x, 0x%08x, 0x%08x, 0x%08x) == (0x%08x, 0x%08x, 0x%08x, 0x%08x)"
-		, _str
-		, c.ui[0], c.ui[1], c.ui[2], c.ui[3]
-		, _0, _1, _2, _3
-		);
-
-	CHECK_EQUAL(c.ui[0], _0);
-	CHECK_EQUAL(c.ui[1], _1);
-	CHECK_EQUAL(c.ui[2], _2);
-	CHECK_EQUAL(c.ui[3], _3);
-}
-
-void float4_check_float(const char* _str, bx::float4_t _a, float _0, float _1, float _2, float _3)
-{
-	float4_cast c; c.f4 = _a;
-
-	DBG("%s (%f, %f, %f, %f) == (%f, %f, %f, %f)"
-		, _str
-		, c.f[0], c.f[1], c.f[2], c.f[3]
-		, _0, _1, _2, _3
-		);
-
-	CHECK(bx::fequal(c.f[0], _0, 0.0001f) );
-	CHECK(bx::fequal(c.f[1], _1, 0.0001f) );
-	CHECK(bx::fequal(c.f[2], _2, 0.0001f) );
-	CHECK(bx::fequal(c.f[3], _3, 0.0001f) );
-}
-
-void float4_check_string(const char* _str, bx::float4_t _a)
-{
-	float4_cast c; c.f4 = _a;
-	const char test[5] = { c.c[0], c.c[4], c.c[8], c.c[12], '\0' };
-
-	DBG("%s %s", _str, test);
-
-	CHECK(0 == strcmp(_str, test) );
-}
-
-TEST(float4_swizzle)
-{
-	const float4_t xyzw = float4_ild(0x78787878, 0x79797979, 0x7a7a7a7a, 0x77777777);
-
-#define ELEMx 0
-#define ELEMy 1
-#define ELEMz 2
-#define ELEMw 3
-#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
-	float4_check_string("" #_x #_y #_z #_w "", float4_swiz_##_x##_y##_z##_w(xyzw) ); \
-
-#include
-
-#undef IMPLEMENT_SWIZZLE
-#undef ELEMw
-#undef ELEMz
-#undef ELEMy
-#undef ELEMx
-}
-
-TEST(float4_shuffle)
-{
-	const float4_t xyzw = float4_ild(0x78787878, 0x79797979, 0x7a7a7a7a, 0x77777777);
-	const float4_t ABCD = float4_ild(0x41414141, 0x42424242, 0x43434343, 0x44444444);
-	float4_check_string("xyAB", float4_shuf_xyAB(xyzw, ABCD) );
-	float4_check_string("ABxy", float4_shuf_ABxy(xyzw, ABCD) );
-	float4_check_string("zwCD", float4_shuf_zwCD(xyzw, ABCD) );
-	float4_check_string("CDzw", float4_shuf_CDzw(xyzw, ABCD) );
-	float4_check_string("xAyB", float4_shuf_xAyB(xyzw, ABCD) );
-	float4_check_string("zCwD", float4_shuf_zCwD(xyzw, ABCD) );
-	float4_check_string("xAzC", float4_shuf_xAzC(xyzw, ABCD) );
-	float4_check_string("yBwD", float4_shuf_yBwD(xyzw, ABCD) );
-	float4_check_string("CzDw", float4_shuf_CzDw(xyzw, ABCD) );
-}
-
-TEST(float4_compare)
-{
-	float4_check_uint32("cmpeq"
-		, float4_cmpeq(float4_ld(1.0f, 2.0f, 3.0f, 4.0f), float4_ld(0.0f, 2.0f, 0.0f, 3.0f) )
-		, 0, 0xffffffff, 0, 0
-		);
-
-	float4_check_uint32("cmplt"
-		, float4_cmplt(float4_ld(1.0f, 2.0f, 3.0f, 4.0f), float4_ld(0.0f, 2.0f, 0.0f, 3.0f) )
-		, 0, 0, 0, 0
-		);
-
-	float4_check_uint32("cmple"
-		, float4_cmple(float4_ld(1.0f, 2.0f, 3.0f, 4.0f), float4_ld(0.0f, 2.0f, 0.0f, 3.0f) )
-		, 0, 0xffffffff, 0, 0
-		);
-
-	float4_check_uint32("cmpgt"
-		, float4_cmpgt(float4_ld(1.0f, 2.0f, 3.0f, 4.0f), float4_ld(0.0f, 2.0f, 0.0f, 3.0f) )
-		, 0xffffffff, 0, 0xffffffff, 0xffffffff
-		);
-
-	float4_check_uint32("cmpge"
-		, float4_cmpge(float4_ld(1.0f, 2.0f, 3.0f, 4.0f), float4_ld(0.0f, 2.0f, 0.0f, 3.0f) )
-		, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
-		);
-
-	float4_check_uint32("icmpeq"
-		, float4_icmpeq(float4_ild(0, 1, 2, 3), float4_ild(0, uint32_t(-2), 1, 3) )
-		, 0xffffffff, 0, 0, 0xffffffff
-		);
-
-	float4_check_uint32("icmplt"
-		, float4_icmplt(float4_ild(0, 1, 2, 3), float4_ild(0, uint32_t(-2), 1, 3) )
-		, 0, 0, 0, 0
-		);
-
-	float4_check_uint32("icmpgt"
-		, float4_icmpgt(float4_ild(0, 1, 2, 3), float4_ild(0, uint32_t(-2), 1, 3) )
-		, 0, 0xffffffff, 0xffffffff, 0
-		);
-}
-
-TEST(float4_test)
-{
-	float4_check_bool("test_any_xyzw"
-		, float4_test_any_xyzw(float4_ild(0xffffffff, 0, 0, 0) )
-		, true
-		);
-
-	float4_check_bool("test_all_xyzw"
-		, float4_test_all_xyzw(float4_ild(0xffffffff, 0, 0xffffffff, 0) )
-		, false
-		);
-
-	float4_check_bool("test_all_xyzw"
-		, float4_test_all_xyzw(float4_ild(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff) )
-		, true
-		);
-
-	float4_check_bool("test_all_xw"
-		, float4_test_all_xw(float4_ild(0xffffffff, 0, 0, 0xffffffff) )
-		, true
-		);
-
-	float4_check_bool("test_all_xzw"
-		, float4_test_all_xzw(float4_ild(0xffffffff, 0, 0, 0xffffffff) )
-		, false
-		);
-}
-
-TEST(float4_load)
-{
-	float4_check_float("ld"
-		, float4_ld(0.0f, 1.0f, 2.0f, 3.0f)
-		, 0.0f, 1.0f, 2.0f, 3.0f
-		);
-
-	float4_check_int32("ild"
-		, float4_ild(uint32_t(-1), 0, 1, 2)
-		, uint32_t(-1), 0, 1, 2
-		);
-
-	float4_check_int32("ild"
-		, float4_ild(uint32_t(-1), uint32_t(-2), uint32_t(-3), uint32_t(-4) )
-		, uint32_t(-1), uint32_t(-2), uint32_t(-3), uint32_t(-4)
-		);
-
-	float4_check_uint32("zero", float4_zero()
-		, 0, 0, 0, 0
-		);
-
-	float4_check_uint32("isplat", float4_isplat(0x80000001)
-		, 0x80000001, 0x80000001, 0x80000001, 0x80000001
-		);
-
-	float4_check_float("isplat", float4_splat(1.0f)
-		, 1.0f, 1.0f, 1.0f, 1.0f
-		);
-}
-
-TEST(float4_arithmetic)
-{
-	float4_check_float("madd"
-		, float4_madd(float4_ld(0.0f, 1.0f, 2.0f, 3.0f), float4_ld(4.0f, 5.0f, 6.0f, 7.0f), float4_ld(8.0f, 9.0f, 10.0f, 11.0f) )
-		, 8.0f, 14.0f, 22.0f, 32.0f
-		);
-
-	float4_check_float("cross3"
-		, float4_cross3(float4_ld(1.0f, 0.0f, 0.0f, 0.0f), float4_ld(0.0f, 1.0f, 0.0f, 0.0f) )
-		, 0.0f, 0.0f, 1.0f, 0.0f
-		);
-}
-
-TEST(float4_sqrt)
-{
-	float4_check_float("float4_sqrt"
-		, float4_sqrt(float4_ld(1.0f, 16.0f, 65536.0f, 123456.0f) )
-		, 1.0f, 4.0f, 256.0f, 351.363060096f
-		);
-
-	float4_check_float("float4_sqrt_nr_ni"
-		, float4_sqrt_nr_ni(float4_ld(1.0f, 16.0f, 65536.0f, 123456.0f) )
-		, 1.0f, 4.0f, 256.0f, 351.363060096f
-		);
-
-	float4_check_float("float4_sqrt_nr1_ni"
-		, float4_sqrt_nr1_ni(float4_ld(1.0f, 16.0f, 65536.0f, 123456.0f) )
-		, 1.0f, 4.0f, 256.0f, 351.363060096f
-		);
-}
-
-TEST(float4)
-{
-	const float4_t isplat = float4_isplat(0x80000001);
-	float4_check_uint32("sll"
-		, float4_sll(isplat, 1)
-		, 0x00000002, 0x00000002, 0x00000002, 0x00000002
-		);
-
-	float4_check_uint32("srl"
-		, float4_srl(isplat, 1)
-		, 0x40000000, 0x40000000, 0x40000000, 0x40000000
-		);
-
-	float4_check_uint32("sra"
-		, float4_sra(isplat, 1)
-		, 0xc0000000, 0xc0000000, 0xc0000000, 0xc0000000
-		);
-
-	float4_check_uint32("and"
-		, float4_and(float4_isplat(0x55555555), float4_isplat(0xaaaaaaaa) )
-		, 0, 0, 0, 0
-		);
-
-	float4_check_uint32("or "
-		, float4_or(float4_isplat(0x55555555), float4_isplat(0xaaaaaaaa) )
-		, uint32_t(-1), uint32_t(-1), uint32_t(-1), uint32_t(-1)
-		);
-
-	float4_check_uint32("xor"
-		, float4_or(float4_isplat(0x55555555), float4_isplat(0xaaaaaaaa) )
-		, uint32_t(-1), uint32_t(-1), uint32_t(-1), uint32_t(-1)
-		);
-
-	float4_check_int32("imin"
-		, float4_imin(float4_ild(0, 1, 2, 3), float4_ild(uint32_t(-1), 2, uint32_t(-2), 1) )
-		, uint32_t(-1), 1, uint32_t(-2), 1
-		);
-
-	float4_check_float("min"
-		, float4_min(float4_ld(0.0f, 1.0f, 2.0f, 3.0f), float4_ld(-1.0f, 2.0f, -2.0f, 1.0f) )
-		, -1.0f, 1.0f, -2.0f, 1.0f
-		);
-
-	float4_check_int32("imax"
-		, float4_imax(float4_ild(0, 1, 2, 3), float4_ild(uint32_t(-1), 2, uint32_t(-2), 1) )
-		, 0, 2, 2, 3
-		);
-
-	float4_check_float("max"
-		, float4_max(float4_ld(0.0f, 1.0f, 2.0f, 3.0f), float4_ld(-1.0f, 2.0f, -2.0f, 1.0f) )
-		, 0.0f, 2.0f, 2.0f, 3.0f
-		);
-}
diff --git a/tests/simd_t.cpp b/tests/simd_t.cpp
new file mode 100644
index 0000000..e8f2ef0
--- /dev/null
+++ b/tests/simd_t.cpp
@@ -0,0 +1,309 @@
+/*
+ * Copyright 2010-2016 Branimir Karadzic. All rights reserved.
+ * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
+ */
+
+#include "test.h"
+#include
+#include
+#include
+
+using namespace bx;
+
+union simd_cast
+{
+	bx::simd128_t f4;
+	float f[4];
+	uint32_t ui[4];
+	int32_t i[4];
+	char c[16];
+};
+
+void simd_check_bool(const char* _str, bool _a, bool _0)
+{
+	DBG("%s %d == %d"
+		, _str
+		, _a
+		, _0
+		);
+
+	CHECK_EQUAL(_a, _0);
+}
+
+void simd_check_int32(const char* _str, bx::simd128_t _a, int32_t _0, int32_t _1, int32_t _2, int32_t _3)
+{
+	simd_cast c; c.f4 = _a;
+	DBG("%s (%d, %d, %d, %d) == (%d, %d, %d, %d)"
+		, _str
+		, c.i[0], c.i[1], c.i[2], c.i[3]
+		, _0, _1, _2, _3
+		);
+
+	CHECK_EQUAL(c.i[0], _0);
+	CHECK_EQUAL(c.i[1], _1);
+	CHECK_EQUAL(c.i[2], _2);
+	CHECK_EQUAL(c.i[3], _3);
+}
+
+void simd_check_uint32(const char* _str, bx::simd128_t _a, uint32_t _0, uint32_t _1, uint32_t _2, uint32_t _3)
+{
+	simd_cast c; c.f4 = _a;
+
+	DBG("%s (0x%08x, 0x%08x, 0x%08x, 0x%08x) == (0x%08x, 0x%08x, 0x%08x, 0x%08x)"
+		, _str
+		, c.ui[0], c.ui[1], c.ui[2], c.ui[3]
+		, _0, _1, _2, _3
+		);
+
+	CHECK_EQUAL(c.ui[0], _0);
+	CHECK_EQUAL(c.ui[1], _1);
+	CHECK_EQUAL(c.ui[2], _2);
+	CHECK_EQUAL(c.ui[3], _3);
+}
+
+void simd_check_float(const char* _str, bx::simd128_t _a, float _0, float _1, float _2, float _3)
+{
+	simd_cast c; c.f4 = _a;
+
+	DBG("%s (%f, %f, %f, %f) == (%f, %f, %f, %f)"
+		, _str
+		, c.f[0], c.f[1], c.f[2], c.f[3]
+		, _0, _1, _2, _3
+		);
+
+	CHECK(bx::fequal(c.f[0], _0, 0.0001f) );
+	CHECK(bx::fequal(c.f[1], _1, 0.0001f) );
+	CHECK(bx::fequal(c.f[2], _2, 0.0001f) );
+	CHECK(bx::fequal(c.f[3], _3, 0.0001f) );
+}
+
+void simd_check_string(const char* _str, bx::simd128_t _a)
+{
+	simd_cast c; c.f4 = _a;
+	const char test[5] = { c.c[0], c.c[4], c.c[8], c.c[12], '\0' };
+
+	DBG("%s %s", _str, test);
+
+	CHECK(0 == strcmp(_str, test) );
+}
+
+TEST(simd_swizzle)
+{
+	const simd128_t xyzw = simd_ild(0x78787878, 0x79797979, 0x7a7a7a7a, 0x77777777);
+
+#define ELEMx 0
+#define ELEMy 1
+#define ELEMz 2
+#define ELEMw 3
+#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
+	simd_check_string("" #_x #_y #_z #_w "", simd_swiz_##_x##_y##_z##_w(xyzw) ); \
+
+#include
+
+#undef IMPLEMENT_SWIZZLE
+#undef ELEMw
+#undef ELEMz
+#undef ELEMy
+#undef ELEMx
+}
+
+TEST(simd_shuffle)
+{
+	const simd128_t xyzw = simd_ild(0x78787878, 0x79797979, 0x7a7a7a7a, 0x77777777);
+	const simd128_t ABCD = simd_ild(0x41414141, 0x42424242, 0x43434343, 0x44444444);
+	simd_check_string("xyAB", simd_shuf_xyAB(xyzw, ABCD) );
+	simd_check_string("ABxy", simd_shuf_ABxy(xyzw, ABCD) );
+	simd_check_string("zwCD", simd_shuf_zwCD(xyzw, ABCD) );
simd_check_string("CDzw", simd_shuf_CDzw(xyzw, ABCD) ); + simd_check_string("xAyB", simd_shuf_xAyB(xyzw, ABCD) ); + simd_check_string("zCwD", simd_shuf_zCwD(xyzw, ABCD) ); + simd_check_string("xAzC", simd_shuf_xAzC(xyzw, ABCD) ); + simd_check_string("yBwD", simd_shuf_yBwD(xyzw, ABCD) ); + simd_check_string("CzDw", simd_shuf_CzDw(xyzw, ABCD) ); +} + +TEST(simd_compare) +{ + simd_check_uint32("cmpeq" + , simd_cmpeq(simd_ld(1.0f, 2.0f, 3.0f, 4.0f), simd_ld(0.0f, 2.0f, 0.0f, 3.0f) ) + , 0, 0xffffffff, 0, 0 + ); + + simd_check_uint32("cmplt" + , simd_cmplt(simd_ld(1.0f, 2.0f, 3.0f, 4.0f), simd_ld(0.0f, 2.0f, 0.0f, 3.0f) ) + , 0, 0, 0, 0 + ); + + simd_check_uint32("cmple" + , simd_cmple(simd_ld(1.0f, 2.0f, 3.0f, 4.0f), simd_ld(0.0f, 2.0f, 0.0f, 3.0f) ) + , 0, 0xffffffff, 0, 0 + ); + + simd_check_uint32("cmpgt" + , simd_cmpgt(simd_ld(1.0f, 2.0f, 3.0f, 4.0f), simd_ld(0.0f, 2.0f, 0.0f, 3.0f) ) + , 0xffffffff, 0, 0xffffffff, 0xffffffff + ); + + simd_check_uint32("cmpge" + , simd_cmpge(simd_ld(1.0f, 2.0f, 3.0f, 4.0f), simd_ld(0.0f, 2.0f, 0.0f, 3.0f) ) + , 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff + ); + + simd_check_uint32("icmpeq" + , simd_icmpeq(simd_ild(0, 1, 2, 3), simd_ild(0, uint32_t(-2), 1, 3) ) + , 0xffffffff, 0, 0, 0xffffffff + ); + + simd_check_uint32("icmplt" + , simd_icmplt(simd_ild(0, 1, 2, 3), simd_ild(0, uint32_t(-2), 1, 3) ) + , 0, 0, 0, 0 + ); + + simd_check_uint32("icmpgt" + , simd_icmpgt(simd_ild(0, 1, 2, 3), simd_ild(0, uint32_t(-2), 1, 3) ) + , 0, 0xffffffff, 0xffffffff, 0 + ); +} + +TEST(simd_test) +{ + simd_check_bool("test_any_xyzw" + , simd_test_any_xyzw(simd_ild(0xffffffff, 0, 0, 0) ) + , true + ); + + simd_check_bool("test_all_xyzw" + , simd_test_all_xyzw(simd_ild(0xffffffff, 0, 0xffffffff, 0) ) + , false + ); + + simd_check_bool("test_all_xyzw" + , simd_test_all_xyzw(simd_ild(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff) ) + , true + ); + + simd_check_bool("test_all_xw" + , simd_test_all_xw(simd_ild(0xffffffff, 0, 0, 0xffffffff) ) + , true + ); + + simd_check_bool("test_all_xzw" + , simd_test_all_xzw(simd_ild(0xffffffff, 0, 0, 0xffffffff) ) + , false + ); +} + +TEST(simd_load) +{ + simd_check_float("ld" + , simd_ld(0.0f, 1.0f, 2.0f, 3.0f) + , 0.0f, 1.0f, 2.0f, 3.0f + ); + + simd_check_int32("ild" + , simd_ild(uint32_t(-1), 0, 1, 2) + , uint32_t(-1), 0, 1, 2 + ); + + simd_check_int32("ild" + , simd_ild(uint32_t(-1), uint32_t(-2), uint32_t(-3), uint32_t(-4) ) + , uint32_t(-1), uint32_t(-2), uint32_t(-3), uint32_t(-4) + ); + + simd_check_uint32("zero", simd_zero() + , 0, 0, 0, 0 + ); + + simd_check_uint32("isplat", simd_isplat(0x80000001) + , 0x80000001, 0x80000001, 0x80000001, 0x80000001 + ); + + simd_check_float("isplat", simd_splat(1.0f) + , 1.0f, 1.0f, 1.0f, 1.0f + ); +} + +TEST(simd_arithmetic) +{ + simd_check_float("madd" + , simd_madd(simd_ld(0.0f, 1.0f, 2.0f, 3.0f), simd_ld(4.0f, 5.0f, 6.0f, 7.0f), simd_ld(8.0f, 9.0f, 10.0f, 11.0f) ) + , 8.0f, 14.0f, 22.0f, 32.0f + ); + + simd_check_float("cross3" + , simd_cross3(simd_ld(1.0f, 0.0f, 0.0f, 0.0f), simd_ld(0.0f, 1.0f, 0.0f, 0.0f) ) + , 0.0f, 0.0f, 1.0f, 0.0f + ); +} + +TEST(simd_sqrt) +{ + simd_check_float("simd_sqrt" + , simd_sqrt(simd_ld(1.0f, 16.0f, 65536.0f, 123456.0f) ) + , 1.0f, 4.0f, 256.0f, 351.363060096f + ); + + simd_check_float("simd_sqrt_nr_ni" + , simd_sqrt_nr_ni(simd_ld(1.0f, 16.0f, 65536.0f, 123456.0f) ) + , 1.0f, 4.0f, 256.0f, 351.363060096f + ); + + simd_check_float("simd_sqrt_nr1_ni" + , simd_sqrt_nr1_ni(simd_ld(1.0f, 16.0f, 65536.0f, 123456.0f) ) + , 1.0f, 4.0f, 256.0f, 351.363060096f + ); +} + 
+TEST(float4)
+{
+	const simd128_t isplat = simd_isplat(0x80000001);
+	simd_check_uint32("sll"
+		, simd_sll(isplat, 1)
+		, 0x00000002, 0x00000002, 0x00000002, 0x00000002
+		);
+
+	simd_check_uint32("srl"
+		, simd_srl(isplat, 1)
+		, 0x40000000, 0x40000000, 0x40000000, 0x40000000
+		);
+
+	simd_check_uint32("sra"
+		, simd_sra(isplat, 1)
+		, 0xc0000000, 0xc0000000, 0xc0000000, 0xc0000000
+		);
+
+	simd_check_uint32("and"
+		, simd_and(simd_isplat(0x55555555), simd_isplat(0xaaaaaaaa) )
+		, 0, 0, 0, 0
+		);
+
+	simd_check_uint32("or "
+		, simd_or(simd_isplat(0x55555555), simd_isplat(0xaaaaaaaa) )
+		, uint32_t(-1), uint32_t(-1), uint32_t(-1), uint32_t(-1)
+		);
+
+	simd_check_uint32("xor"
+		, simd_xor(simd_isplat(0x55555555), simd_isplat(0xaaaaaaaa) )
+		, uint32_t(-1), uint32_t(-1), uint32_t(-1), uint32_t(-1)
+		);
+
+	simd_check_int32("imin"
+		, simd_imin(simd_ild(0, 1, 2, 3), simd_ild(uint32_t(-1), 2, uint32_t(-2), 1) )
+		, uint32_t(-1), 1, uint32_t(-2), 1
+		);
+
+	simd_check_float("min"
+		, simd_min(simd_ld(0.0f, 1.0f, 2.0f, 3.0f), simd_ld(-1.0f, 2.0f, -2.0f, 1.0f) )
+		, -1.0f, 1.0f, -2.0f, 1.0f
+		);
+
+	simd_check_int32("imax"
+		, simd_imax(simd_ild(0, 1, 2, 3), simd_ild(uint32_t(-1), 2, uint32_t(-2), 1) )
+		, 0, 2, 2, 3
+		);
+
+	simd_check_float("max"
+		, simd_max(simd_ld(0.0f, 1.0f, 2.0f, 3.0f), simd_ld(-1.0f, 2.0f, -2.0f, 1.0f) )
+		, 0.0f, 2.0f, 2.0f, 3.0f
+		);
+}
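
For callers migrating along with this rename, the minimal sketch below shows the new simd_* entry points in use, reading lanes back through a union the same way tests/simd_t.cpp does. It is an illustration only, not part of the patch: the <bx/simd_t.h> include path, the Lanes union, and the printf output are assumptions made for the example.

// Usage sketch only -- not part of this patch.
// Assumption: the renamed header is reachable as <bx/simd_t.h>.
#include <stdio.h>
#include <bx/simd_t.h>

// Read lanes back through a union, the same trick tests/simd_t.cpp uses.
union Lanes
{
	bx::simd128_t v;
	float f[4];
};

int main()
{
	using namespace bx;

	const simd128_t a = simd_ld(1.0f, 2.0f, 3.0f, 0.0f);
	const simd128_t b = simd_ld(4.0f, 5.0f, 6.0f, 0.0f);
	const simd128_t c = simd_splat(10.0f);

	// Fused multiply-add: a*b + c, as exercised by TEST(simd_arithmetic).
	const simd128_t mad = simd_madd(a, b, c);

	// 3-component dot product; lane x holds the result.
	const simd128_t dot = simd_dot3(a, b);

	Lanes m; m.v = mad;
	Lanes d; d.v = dot;
	printf("madd: %f %f %f %f\n", m.f[0], m.f[1], m.f[2], m.f[3]);
	printf("dot3: %f\n", d.f[0]);

	return 0;
}

Note that simd_madd and simd_dot3 are called without explicit template arguments: the simd_* functions are templates on the vector type, so simd128_t is deduced from the operands, while the non-template overloads declared in simd_t.h (simd_ld, simd_splat, simd_isplat, simd_zero, simd_ild) cover the cases where nothing can be deduced.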