From 4eb80393d11e4cfa65be28bc01057d51d99863a2 Mon Sep 17 00:00:00 2001 From: Branimir Karadzic Date: Tue, 3 Apr 2012 20:17:55 -0700 Subject: [PATCH] Initial commit. --- LICENSE | 22 ++ README.md | 37 +++ include/bx/blockalloc.h | 99 +++++++ include/bx/bx.h | 23 ++ include/bx/commandline.h | 151 ++++++++++ include/bx/countof.h | 19 ++ include/bx/cpu.h | 110 +++++++ include/bx/debug.h | 31 ++ include/bx/endian.h | 71 +++++ include/bx/float4_neon.h | 227 +++++++++++++++ include/bx/float4_ni.h | 407 ++++++++++++++++++++++++++ include/bx/float4_ref.h | 522 ++++++++++++++++++++++++++++++++++ include/bx/float4_sse.h | 400 ++++++++++++++++++++++++++ include/bx/float4_swizzle.inl | 266 +++++++++++++++++ include/bx/float4_t.h | 22 ++ include/bx/float4x4_t.h | 168 +++++++++++ include/bx/foreach.h | 71 +++++ include/bx/handlealloc.h | 83 ++++++ include/bx/hash.h | 90 ++++++ include/bx/macros.h | 62 ++++ include/bx/maputil.h | 29 ++ include/bx/mutex.h | 171 +++++++++++ include/bx/os.h | 46 +++ include/bx/platform.h | 86 ++++++ include/bx/ringbuffer.h | 313 ++++++++++++++++++++ include/bx/rng.h | 97 +++++++ include/bx/sem.h | 107 +++++++ include/bx/spscqueue.h | 152 ++++++++++ include/bx/timer.h | 53 ++++ include/bx/uint32_t.h | 454 +++++++++++++++++++++++++++++ 30 files changed, 4389 insertions(+) create mode 100644 LICENSE create mode 100644 README.md create mode 100644 include/bx/blockalloc.h create mode 100644 include/bx/bx.h create mode 100644 include/bx/commandline.h create mode 100644 include/bx/countof.h create mode 100644 include/bx/cpu.h create mode 100644 include/bx/debug.h create mode 100644 include/bx/endian.h create mode 100644 include/bx/float4_neon.h create mode 100644 include/bx/float4_ni.h create mode 100644 include/bx/float4_ref.h create mode 100644 include/bx/float4_sse.h create mode 100644 include/bx/float4_swizzle.inl create mode 100644 include/bx/float4_t.h create mode 100644 include/bx/float4x4_t.h create mode 100644 include/bx/foreach.h create mode 
100644 include/bx/handlealloc.h create mode 100644 include/bx/hash.h create mode 100644 include/bx/macros.h create mode 100644 include/bx/maputil.h create mode 100644 include/bx/mutex.h create mode 100644 include/bx/os.h create mode 100644 include/bx/platform.h create mode 100644 include/bx/ringbuffer.h create mode 100644 include/bx/rng.h create mode 100644 include/bx/sem.h create mode 100644 include/bx/spscqueue.h create mode 100644 include/bx/timer.h create mode 100644 include/bx/uint32_t.h diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..ddd1deb --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ +Copyright 2010-2012 Branimir Karadzic. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..0863bd6 --- /dev/null +++ b/README.md @@ -0,0 +1,37 @@ +bx +== + +Base library. + +Contact +------- + +Twitter @bkaradzic + +Web http://www.stuckingeometry.com + +License +------- + +Copyright 2010-2012 Branimir Karadzic. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/include/bx/blockalloc.h b/include/bx/blockalloc.h new file mode 100644 index 0000000..e7d0704 --- /dev/null +++ b/include/bx/blockalloc.h @@ -0,0 +1,99 @@ +/* + * Copyright 2010-2011 Branimir Karadzic. All rights reserved. 
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_BLOCKALLOC_H__
+#define __BX_BLOCKALLOC_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	// Fixed-size block allocator working on caller-supplied memory.
+	//
+	// Free blocks form an intrusive singly linked list: the first two bytes
+	// of every free block store the index of the next free block and
+	// invalidIndex terminates the list. The allocator never allocates or
+	// releases the backing memory itself.
+	class BlockAlloc
+	{
+	public:
+		// Free-list terminator; also the free head of an empty/exhausted pool.
+		static const uint16_t invalidIndex = 0xffff;
+
+		// Smallest usable block size: a free block must hold a uint16_t link.
+		static const uint32_t minElementSize = 2;
+
+		// Creates an empty allocator; alloc() always returns NULL.
+		BlockAlloc()
+			: m_data(NULL)
+			, m_num(0)
+			, m_size(0)
+			, m_numFree(0)
+			, m_freeIndex(invalidIndex)
+		{
+		}
+
+		// Builds a pool of _num blocks of _size bytes each inside _data.
+		// _data must provide at least _num*_size bytes.
+		// NOTE(review): links are stored through uint16_t*, so _data and
+		// _size presumably need 2-byte alignment on strict-alignment CPUs.
+		BlockAlloc(void* _data, uint16_t _num, uint16_t _size)
+			: m_data(_data)
+			, m_num(_num)
+			, m_size(_size)
+			, m_numFree(_num)
+			, m_freeIndex(invalidIndex)
+		{
+			BX_CHECK(0 == _num || _size >= minElementSize, "_size %d is too small to hold a free list link", _size);
+
+			// An empty pool has no free block and must not touch _data
+			// (it may be NULL); only a non-empty pool starts at block 0.
+			if (0 != _num)
+			{
+				m_freeIndex = 0;
+			}
+
+			// Block ii links to block ii+1; the last block ends the list.
+			char* data = (char*)_data;
+			for (uint16_t ii = 0; ii < _num; ++ii)
+			{
+				uint16_t next = invalidIndex;
+				if (ii+1 < _num)
+				{
+					next = uint16_t(ii+1);
+				}
+
+				*( (uint16_t*)data) = next;
+				data += _size;
+			}
+		}
+
+		~BlockAlloc()
+		{
+		}
+
+		// Pops the head of the free list. Returns NULL when no block is free.
+		void* alloc()
+		{
+			if (invalidIndex == m_freeIndex)
+			{
+				return NULL;
+			}
+
+			// uint32_t math: uint16_t*uint16_t would be evaluated as signed
+			// int and can overflow for large pools.
+			void* obj = ( (char*)m_data) + uint32_t(m_freeIndex)*m_size;
+			m_freeIndex = *( (uint16_t*)obj);
+			--m_numFree;
+
+			return obj;
+		}
+
+		// Pushes _obj back onto the free list. _obj must have been returned
+		// by alloc() of this allocator; double frees are not detected.
+		void free(void* _obj)
+		{
+			uint16_t index = getIndex(_obj);
+			BX_CHECK(index < m_num, "index %d, m_num %d", index, m_num);
+
+			*( (uint16_t*)_obj) = m_freeIndex;
+			m_freeIndex = index;
+			++m_numFree;
+		}
+
+		// Returns the block index of _obj (byte offset from the pool start
+		// divided by the block size, truncated to 16 bits).
+		uint16_t getIndex(void* _obj) const
+		{
+			return (uint16_t)( ( (char*)_obj - (char*)m_data ) / m_size);
+		}
+
+		// Number of blocks currently on the free list.
+		uint16_t getNumFree() const
+		{
+			return m_numFree;
+		}
+
+		// Returns the address of block _index; no range check is performed.
+		void* getFromIndex(uint16_t _index)
+		{
+			return (char*)m_data + uint32_t(_index)*m_size;
+		}
+
+	private:
+		void* m_data;         // caller-owned backing memory
+		uint16_t m_num;       // total number of blocks
+		uint16_t m_size;      // size of one block in bytes
+		uint16_t m_numFree;   // blocks currently free
+		uint16_t m_freeIndex; // head of the free list or invalidIndex
+	};
+
+} // namespace bx
+
+#endif // __BX_BLOCKALLOC_H__
diff --git a/include/bx/bx.h b/include/bx/bx.h
new file mode 100644
index 0000000..1907430
--- /dev/null
+++ b/include/bx/bx.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_H__
+#define __BX_H__
+
+#include "platform.h"
+#include "macros.h"
+
+namespace bx
+{
+}// namespace bx
+
+// Defining BX_NAMESPACE to a non-zero value before including this header
+// pulls the whole bx namespace into the global namespace.
+#ifndef BX_NAMESPACE
+#	define BX_NAMESPACE 0
+#elif BX_NAMESPACE
+using namespace bx;
+#endif // BX_NAMESPACE
+
+#endif // __BX_H__
+
diff --git a/include/bx/commandline.h b/include/bx/commandline.h
new file mode 100644
index 0000000..5431e73
--- /dev/null
+++ b/include/bx/commandline.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_COMMANDLINE_H__
+#define __BX_COMMANDLINE_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	// Minimal command line parser for "-x [value]" and "--long [value]"
+	// style options. The argument vector is referenced, not copied, so it
+	// must outlive the CommandLine object.
+	class CommandLine
+	{
+	public:
+		// Uses the CRT globals __argc/__argv.
+		// NOTE(review): these look MSVC specific - confirm before using
+		// this constructor on other toolchains.
+		CommandLine()
+			: m_argc(__argc)
+			, m_argv(__argv)
+		{
+		}
+
+		CommandLine(int _argc, char const* const* _argv)
+			: m_argc(_argc)
+			, m_argv(_argv)
+		{
+		}
+
+		// Looks for "-<_short>" or "--<_long>" (long name compared case
+		// insensitively). Returns "" when the option is present and
+		// _numParams is 0, the argument following the option when
+		// _numParams is non-zero, and NULL when the option is missing or
+		// its parameter is missing / starts with '-'.
+		const char* findOption(const char _short, const char* _long = NULL, int _numParams = 1)
+		{
+			const char* result = _findOption(_short, _long, _numParams);
+			return result;
+		}
+
+		// Returns true when the flag is present (no parameter expected).
+		bool hasArg(const char _short, const char* _long = NULL)
+		{
+			const char* arg = findOption(_short, _long, 0);
+			return NULL != arg;
+		}
+
+		// Same as above for options that only have a long name.
+		bool hasArg(const char* _long)
+		{
+			const char* arg = findOption('\0', _long, 0);
+			return NULL != arg;
+		}
+
+		// Fetches the option's parameter as a string. _value is set to NULL
+		// when the option is not found.
+		bool hasArg(const char*& _value, const char _short, const char* _long = NULL)
+		{
+			const char* arg = findOption(_short, _long, 1);
+			_value = arg;
+			return NULL != arg;
+		}
+
+		// Fetches the option's parameter converted with atoi. _value is left
+		// untouched when the option is not found.
+		bool hasArg(int& _value, const char _short, const char* _long = NULL)
+		{
+			const char* arg = findOption(_short, _long, 1);
+			if (NULL != arg)
+			{
+				_value = atoi(arg);
+				return true;
+			}
+
+			return false;
+		}
+
+		// Unsigned variant; the text is still converted with atoi.
+		bool hasArg(unsigned int& _value, const char _short, const char* _long = NULL)
+		{
+			const char* arg = findOption(_short, _long, 1);
+			if (NULL != arg)
+			{
+				_value = atoi(arg);
+				return true;
+			}
+
+			return false;
+		}
+
+		// Fetches a boolean parameter: a value starting with '0' or equal
+		// to "false" (any case) yields false, every other value yields
+		// true. _value is left untouched when the option is not found.
+		bool hasArg(bool& _value, const char _short, const char* _long = NULL)
+		{
+			const char* arg = findOption(_short, _long, 1);
+			if (NULL != arg)
+			{
+				// _stricmp returns 0 on a match, so it has to be compared
+				// against 0; using it directly as a condition inverted the
+				// parsed value.
+				const bool isFalse = '0' == *arg || 0 == _stricmp(arg, "false");
+				_value = !isFalse;
+
+				return true;
+			}
+
+			return false;
+		}
+
+	private:
+		// Result for an option matched at m_argv[_index]: "" when no
+		// parameter is expected, the next argument when one is available
+		// and does not start with '-', NULL otherwise.
+		const char* _getParam(int _index, int _numParams)
+		{
+			if (0 == _numParams)
+			{
+				return "";
+			}
+
+			if (_index+_numParams < m_argc
+			&&  '-' != *m_argv[_index+1] )
+			{
+				return m_argv[_index+1];
+			}
+
+			return NULL;
+		}
+
+		const char* _findOption(const char _short, const char* _long, int _numParams)
+		{
+			for (int ii = 0; ii < m_argc; ++ii)
+			{
+				const char* arg = m_argv[ii];
+				if ('-' == *arg)
+				{
+					++arg;
+					if (_short == *arg)
+					{
+						// Only an exact "-x" matches; "-xyz" is skipped.
+						if (1 == strlen(arg) )
+						{
+							return _getParam(ii, _numParams);
+						}
+					}
+					else if (NULL != _long
+						 &&  '-' == *arg
+						 &&  0 == _stricmp(arg+1, _long) )
+					{
+						return _getParam(ii, _numParams);
+					}
+				}
+			}
+
+			return NULL;
+		}
+
+		int m_argc;
+		char const* const* m_argv;
+	};
+
+} // namespace bx
+
+#endif // __BX_COMMANDLINE_H__
diff --git a/include/bx/countof.h b/include/bx/countof.h
new file mode 100644
index 0000000..984f3b9
--- /dev/null
+++ b/include/bx/countof.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_COUNTOF_H__
+#define __BX_COUNTOF_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	// http://cnicholson.net/2011/01/stupid-c-tricks-a-better-sizeof_array/
+	// Declared only, never defined: the function returns a reference to a
+	// char[N], so sizeof() of a call yields the element count N, and passing
+	// anything that is not an array fails to compile.
+	template<typename T, size_t N> char (&COUNTOF_REQUIRES_ARRAY_ARGUMENT(const T(&)[N]) )[N];
+#define countof(x) sizeof(bx::COUNTOF_REQUIRES_ARRAY_ARGUMENT(x) )
+
+} // namespace bx
+
+#endif // __BX_COUNTOF_H__
diff --git a/include/bx/cpu.h b/include/bx/cpu.h
new file mode 100644
index 0000000..76b310f
--- /dev/null
+++ b/include/bx/cpu.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_CPU_H__
+#define __BX_CPU_H__
+
+#include "bx.h"
+
+// NOTE(review): the header names below were lost when this patch was
+// extracted; <math.h> and <intrin.h> follow from the comment, the others
+// are restored from the upstream file - confirm.
+#if BX_COMPILER_MSVC
+#	if BX_PLATFORM_XBOX360
+#		include <ppcintrinsics.h>
+#		include <xtl.h>
+#	else
+#		include <math.h> // math.h is included because VS bitches:
+						// warning C4985: 'ceil': attributes not present on previous declaration.
+						// must be included before intrin.h.
+#		include <intrin.h>
+#		include <windows.h>
+#	endif // !BX_PLATFORM_XBOX360
+extern "C" void _ReadBarrier();
+extern "C" void _WriteBarrier();
+extern "C" void _ReadWriteBarrier();
+#	pragma intrinsic(_ReadBarrier)
+#	pragma intrinsic(_WriteBarrier)
+#	pragma intrinsic(_ReadWriteBarrier)
+#	pragma intrinsic(_InterlockedIncrement)
+#	pragma intrinsic(_InterlockedDecrement)
+#endif // BX_COMPILER_MSVC
+
+namespace bx
+{
+	// Empty, cache-line aligned member used as padding marker.
+#if BX_COMPILER_MSVC
+#	define BX_CACHE_LINE_ALIGN_MARKER() __declspec(align(BX_CACHE_LINE_SIZE) ) struct {}
+#else
+#	define BX_CACHE_LINE_ALIGN_MARKER() struct {} __attribute__( (__aligned__(BX_CACHE_LINE_SIZE) ) )
+#endif // BX_COMPILER_
+
+	// Wraps the member declaration _def between two aligned markers.
+#define BX_CACHE_LINE_ALIGN(_def) BX_CACHE_LINE_ALIGN_MARKER(); _def; BX_CACHE_LINE_ALIGN_MARKER()
+
+	// Compiler-level barrier for reads. On GCC/Clang this is an empty asm
+	// statement with a "memory" clobber, i.e. no CPU fence is emitted.
+	inline void readBarrier()
+	{
+#if BX_COMPILER_MSVC
+		_ReadBarrier();
+#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
+		asm volatile("":::"memory");
+#endif // BX_COMPILER
+	}
+
+	// Compiler-level barrier for writes (see readBarrier).
+	inline void writeBarrier()
+	{
+#if BX_COMPILER_MSVC
+		_WriteBarrier();
+#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
+		asm volatile("":::"memory");
+#endif // BX_COMPILER
+	}
+
+	// Compiler-level barrier for reads and writes (see readBarrier).
+	inline void readWriteBarrier()
+	{
+#if BX_COMPILER_MSVC
+		_ReadWriteBarrier();
+#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
+		asm volatile("":::"memory");
+#endif // BX_COMPILER
+	}
+
+	// Hardware memory fence.
+	inline void memoryBarrier()
+	{
+#if BX_PLATFORM_XBOX360
+		__lwsync();
+#elif BX_COMPILER_MSVC
+		_mm_mfence();
+#else
+		__sync_synchronize();
+//		asm volatile("mfence":::"memory");
+#endif // BX_COMPILER
+	}
+
+	// Atomically increments the 32-bit integer at _var and returns the NEW
+	// value on every compiler. _InterlockedIncrement returns the incremented
+	// value, so the GCC/Clang path uses __sync_add_and_fetch;
+	// __sync_fetch_and_add would return the value before the increment.
+	inline int32_t atomicIncr(volatile void* _var)
+	{
+#if BX_COMPILER_MSVC
+		return _InterlockedIncrement( (volatile LONG*)(_var) );
+#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
+		return __sync_add_and_fetch( (volatile int32_t*)_var, 1);
+#endif // BX_COMPILER
+	}
+
+	// Atomically decrements the 32-bit integer at _var and returns the NEW
+	// value on every compiler (see atomicIncr).
+	inline int32_t atomicDecr(volatile void* _var)
+	{
+#if BX_COMPILER_MSVC
+		return _InterlockedDecrement( (volatile LONG*)(_var) );
+#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
+		return __sync_sub_and_fetch( (volatile int32_t*)_var, 1);
+#endif // BX_COMPILER
+	}
+
+	// Atomically stores _ptr into *_target and returns the previous pointer.
+	// NOTE(review): GCC documents __sync_lock_test_and_set as an acquire
+	// barrier only, while the Interlocked call is a full barrier - confirm
+	// no caller relies on release semantics here.
+	inline void* atomicExchangePtr(void** _target, void* _ptr)
+	{
+#if BX_COMPILER_MSVC
+		return InterlockedExchangePointer(_target, _ptr);
+#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
+		return __sync_lock_test_and_set(_target, _ptr);
+#endif // BX_COMPILER
+	}
+
+} // namespace bx
+
+#endif // __BX_CPU_H__
diff --git a/include/bx/debug.h b/include/bx/debug.h
new file mode 100644
index 0000000..1ff7a1e
--- /dev/null
+++ b/include/bx/debug.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_DEBUG_H__
+#define __BX_DEBUG_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	// Stops execution in an attached debugger using the best mechanism
+	// available for the compiler / CPU.
+	inline void debugBreak()
+	{
+#if BX_COMPILER_MSVC
+		__debugbreak();
+#elif BX_CPU_ARM
+		asm("bkpt 0");
+#elif !BX_PLATFORM_NACL && BX_CPU_X86 && (BX_COMPILER_GCC || BX_COMPILER_CLANG)
+		// NaCl doesn't like int 3:
+		// NativeClient: NaCl module load failed: Validation failure. File violates Native Client safety rules.
+		__asm__ ("int $3");
+#else // cross platform implementation
+		// Deliberate write to an invalid address. The pointer is volatile
+		// so the optimizer cannot discard the faulting store.
+		volatile int* int3 = (volatile int*)3L;
+		*int3 = 3;
+#endif // BX
+	}
+
+} // namespace bx
+
+#endif // __BX_DEBUG_H__
diff --git a/include/bx/endian.h b/include/bx/endian.h
new file mode 100644
index 0000000..0bca59d
--- /dev/null
+++ b/include/bx/endian.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_ENDIAN_H__
+#define __BX_ENDIAN_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	// Reverses the byte order of a 16-bit value.
+	inline uint16_t endianSwap(uint16_t _in)
+	{
+		return (_in>>8) | (_in<<8);
+	}
+
+	// Reverses the byte order of a 32-bit value.
+	inline uint32_t endianSwap(uint32_t _in)
+	{
+		return (_in>>24) | (_in<<24)
+			 | ( (_in&0x00ff0000)>>8) | ( (_in&0x0000ff00)<<8)
+			 ;
+	}
+
+	// Reverses the byte order of a 64-bit value.
+	inline uint64_t endianSwap(uint64_t _in)
+	{
+		return (_in>>56) | (_in<<56)
+			 | ( (_in&UINT64_C(0x00ff000000000000) )>>40) | ( (_in&UINT64_C(0x000000000000ff00) )<<40)
+			 | ( (_in&UINT64_C(0x0000ff0000000000) )>>24) | ( (_in&UINT64_C(0x0000000000ff0000) )<<24)
+			 | ( (_in&UINT64_C(0x000000ff00000000) )>>8)  | ( (_in&UINT64_C(0x00000000ff000000) )<<8)
+			 ;
+	}
+
+	// Signed overloads swap the bytes of the unsigned bit pattern.
+	inline int16_t endianSwap(int16_t _in)
+	{
+		return (int16_t)endianSwap( (uint16_t)_in);
+	}
+
+	inline int32_t endianSwap(int32_t _in)
+	{
+		return (int32_t)endianSwap( (uint32_t)_in);
+	}
+
+	inline int64_t endianSwap(int64_t _in)
+	{
+		return (int64_t)endianSwap( (uint64_t)_in);
+	}
+
+	// Converts between host byte order and little endian: swaps when
+	// BX_CPU_ENDIAN_BIG is set, otherwise returns _in unchanged.
+	// The big endian path previously discarded the swapped value and fell
+	// off the end of the function without returning anything.
+	template<typename Ty>
+	inline Ty littleEndian(Ty& _in)
+	{
+#if BX_CPU_ENDIAN_BIG
+		return endianSwap(_in);
+#else
+		return _in;
+#endif // BX_CPU_ENDIAN_BIG
+	}
+
+	// Converts between host byte order and big endian: swaps when
+	// BX_CPU_ENDIAN_LITTLE is set, otherwise returns _in unchanged.
+	template<typename Ty>
+	inline Ty bigEndian(Ty& _in)
+	{
+#if BX_CPU_ENDIAN_LITTLE
+		return endianSwap(_in);
+#else
+		return _in;
+#endif // BX_CPU_ENDIAN_LITTLE
+	}
+
+} // namespace bx
+
+#endif // __BX_ENDIAN_H__
diff --git a/include/bx/float4_neon.h b/include/bx/float4_neon.h
new file mode 100644
index 0000000..479bcba
--- /dev/null
+++ b/include/bx/float4_neon.h
@@ -0,0 +1,227 @@
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_FLOAT4_NEON_H__
+#define __BX_FLOAT4_NEON_H__
+
+#include <arm_neon.h>
+
+namespace bx
+{
+
+// Reference:
+// http://gcc.gnu.org/onlinedocs/gcc/ARM-NEON-Intrinsics.html
+// http://blogs.arm.com/software-enablement/161-coding-for-neon-part-1-load-and-stores/
+// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
+// http://blogs.arm.com/software-enablement/241-coding-for-neon-part-3-matrix-multiplication/
+// http://blogs.arm.com/software-enablement/277-coding-for-neon-part-4-shifting-left-and-right/
+// http://blogs.arm.com/software-enablement/684-coding-for-neon-part-5-rearranging-vectors/
+
+	// Four packed floats; the code below passes it directly to the
+	// float32x4_t intrinsics.
+	typedef __builtin_neon_sf float4_t __attribute__( (__vector_size__(16) ) );
+
+	// float4_swiz_<abcd>(_a) returns (_a.a, _a.b, _a.c, _a.d); one function
+	// is generated per IMPLEMENT_SWIZZLE entry in float4_swizzle.inl.
+	// NOTE(review): .ixyzw/.fxyzw member access is not available on a GCC
+	// vector type - presumably carried over from the reference
+	// implementation; confirm this header builds.
+#define ELEMx 0
+#define ELEMy 1
+#define ELEMz 2
+#define ELEMw 3
+#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
+			BX_FLOAT4_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \
+			{ \
+				float4_t result; \
+				result.ixyzw[0] = _a.ixyzw[ELEM##_x]; \
+				result.ixyzw[1] = _a.ixyzw[ELEM##_y]; \
+				result.ixyzw[2] = _a.ixyzw[ELEM##_z]; \
+				result.ixyzw[3] = _a.ixyzw[ELEM##_w]; \
+				return result; \
+			}
+
+#include "float4_swizzle.inl"
+
+#undef IMPLEMENT_SWIZZLE
+#undef ELEMw
+#undef ELEMz
+#undef ELEMy
+#undef ELEMx
+
+	// Two-source shuffles. In the function names lower case x/y/z/w select
+	// lanes of _a and upper case A/B/C/D select lanes 0..3 of _b.
+	// These were stubs returning _a; the SSE intrinsic each one mirrors is
+	// kept as a trailing comment.
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b)
+	{
+		return vcombine_f32(vget_low_f32(_a), vget_low_f32(_b) ); //_mm_movelh_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b)
+	{
+		return vcombine_f32(vget_low_f32(_b), vget_low_f32(_a) ); //_mm_movelh_ps(_b, _a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b)
+	{
+		return vcombine_f32(vget_high_f32(_b), vget_high_f32(_a) ); //_mm_movehl_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b)
+	{
+		return vcombine_f32(vget_high_f32(_a), vget_high_f32(_b) ); //_mm_movehl_ps(_b, _a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b)
+	{
+		return vzipq_f32(_a, _b).val[0]; //_mm_unpacklo_ps(_a, _b);
+	}
+
+	// NOTE(review): still a stub. The name asks for (a.y, b.y, a.x, b.x)
+	// but the SSE hint _mm_unpacklo_ps(_b, _a) yields (b.x, a.x, b.y, a.y);
+	// decide which order callers expect before implementing.
+	BX_FLOAT4_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b)
+	{
+		return _a; //_mm_unpacklo_ps(_b, _a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b)
+	{
+		return vzipq_f32(_a, _b).val[1]; //_mm_unpackhi_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b)
+	{
+		return vzipq_f32(_b, _a).val[1]; //_mm_unpackhi_ps(_b, _a);
+	}
+
+	// Lane accessors.
+	BX_FLOAT4_INLINE float float4_x(float4_t _a)
+	{
+		return _a.fxyzw[0];
+	}
+
+	BX_FLOAT4_INLINE float float4_y(float4_t _a)
+	{
+		return _a.fxyzw[1];
+	}
+
+	BX_FLOAT4_INLINE float float4_z(float4_t _a)
+	{
+		return _a.fxyzw[2];
+	}
+
+	BX_FLOAT4_INLINE float float4_w(float4_t _a)
+	{
+		return _a.fxyzw[3];
+	}
+
+	// Builds a vector from four floats.
+	BX_FLOAT4_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w)
+	{
+		const float32_t val[4] = {_x, _y, _z, _w};
+		return __builtin_neon_vld1v4sf(val);
+	}
+
+	// Builds a vector from four raw 32-bit patterns.
+	BX_FLOAT4_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
+	{
+		const uint32_t val[4] = {_x, _y, _z, _w};
+		return (float4_t)__builtin_neon_vld1v4si( (const __builtin_neon_si*)val);
+	}
+
+	// Replicates a float into all four lanes.
+	BX_FLOAT4_INLINE float4_t float4_splat(float _a)
+	{
+		return __builtin_neon_vdup_nv4sf(_a);
+	}
+
+	// Replicates a raw 32-bit pattern into all four lanes.
+	BX_FLOAT4_INLINE float4_t float4_isplat(uint32_t _a)
+	{
+		return (float4_t)__builtin_neon_vdup_nv4si( (__builtin_neon_si)_a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_zero()
+	{
+		return vdupq_n_f32(0.0f);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_add(float4_t _a, float4_t _b)
+	{
+		return vaddq_f32(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sub(float4_t _a, float4_t _b)
+	{
+		return vsubq_f32(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_mul(float4_t _a, float4_t _b)
+	{
+		return vmulq_f32(_a, _b);
+	}
+
+	// Reciprocal estimate (no refinement step).
+	BX_FLOAT4_INLINE float4_t float4_rcp_est(float4_t _a)
+	{
+		return vrecpeq_f32(_a);
+	}
+
+	// Reciprocal square root estimate (no refinement step).
+	BX_FLOAT4_INLINE float4_t float4_rsqrt_est(float4_t _a)
+	{
+		return vrsqrteq_f32(_a);
+	}
+
+	// Bitwise AND of the raw lane bits.
+	BX_FLOAT4_INLINE float4_t float4_and(float4_t _a, float4_t _b)
+	{
+		return (float4_t)__builtin_neon_vandv4si( (int32x4_t)_a, (int32x4_t)_b, 0);
+	}
+
+	//BX_FLOAT4_INLINE float4_t float4_andc(float4_t _a, float4_t _b)
+	//{
+	//	return _mm_andnot_ps(_b, _a);
+	//}
+
+	// Bitwise OR of the raw lane bits.
+	BX_FLOAT4_INLINE float4_t float4_or(float4_t _a, float4_t _b)
+	{
+		return (float4_t)__builtin_neon_vorrv4si( (int32x4_t)_a, (int32x4_t)_b, 0);
+	}
+
+	// Integer add of the raw 32-bit lanes.
+	BX_FLOAT4_INLINE float4_t float4_iadd(float4_t _a, float4_t _b)
+	{
+		const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
+		const uint32x4_t tmp1 = vreinterpretq_u32_f32(_b);
+		const uint32x4_t add = vaddq_u32(tmp0, tmp1);
+		const float4_t result = vreinterpretq_f32_u32(add);
+
+		return result;
+	}
+
+	// Integer subtract of the raw 32-bit lanes.
+	BX_FLOAT4_INLINE float4_t float4_isub(float4_t _a, float4_t _b)
+	{
+		const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
+		const uint32x4_t tmp1 = vreinterpretq_u32_f32(_b);
+		const uint32x4_t sub = vsubq_u32(tmp0, tmp1);
+		const float4_t result = vreinterpretq_f32_u32(sub);
+
+		return result;
+	}
+
+	// Logical shift left of each 32-bit lane.
+	// NOTE(review): the _n shift intrinsics take an immediate, so _count
+	// presumably has to be a compile-time constant after inlining.
+	BX_FLOAT4_INLINE float4_t float4_sll(float4_t _a, int _count)
+	{
+		const uint32x4_t tmp = vreinterpretq_u32_f32(_a);
+		const uint32x4_t shift = vshlq_n_u32(tmp, _count);
+		const float4_t result = vreinterpretq_f32_u32(shift);
+
+		return result;
+	}
+
+	// Logical shift right of each 32-bit lane.
+	BX_FLOAT4_INLINE float4_t float4_srl(float4_t _a, int _count)
+	{
+		// vreinterpretq_i32_f32 does not exist; the value is used as
+		// uint32x4_t, so the u32 reinterpret is the matching intrinsic.
+		const uint32x4_t tmp = vreinterpretq_u32_f32(_a);
+		const uint32x4_t shift = (uint32x4_t)__builtin_neon_vshr_nv4si( (int32x4_t)tmp, _count, 0);
+		const float4_t result = vreinterpretq_f32_u32(shift);
+
+		return result;
+	}
+
+	// Arithmetic shift right of each 32-bit lane.
+	BX_FLOAT4_INLINE float4_t float4_sra(float4_t _a, int _count)
+	{
+		const int32x4_t a = vreinterpretq_s32_f32(_a);
+		const int32x4_t shift = __builtin_neon_vshr_nv4si(a, _count, 1);
+		const float4_t result = vreinterpretq_f32_s32(shift);
+
+		return result;
+	}
+
+} // namespace bx
+
+// NEON has no divide instruction here; map division to the Newton-Raphson
+// helper from float4_ni.h.
+#define float4_div_nr float4_div_nr_ni
+#define float4_div float4_div_nr_ni
+#include "float4_ni.h"
+
+#endif // __BX_FLOAT4_NEON_H__
diff --git a/include/bx/float4_ni.h b/include/bx/float4_ni.h
new file mode 100644
index 0000000..328c51c
--- /dev/null
+++ b/include/bx/float4_ni.h
@@ -0,0 +1,407 @@
+/*
+ * Copyright 2010-2012 Branimir 
Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_FLOAT4_NI_H__
+#define __BX_FLOAT4_NI_H__
+
+// Portable ("_ni") fallbacks built only from other float4 operations.
+
+namespace bx
+{
+	// Returns (a.x, b.x, a.z, b.z).
+	BX_FLOAT4_INLINE float4_t float4_shuf_xAzC_ni(float4_t _a, float4_t _b)
+	{
+		const float4_t xAyB = float4_shuf_xAyB(_a, _b);
+		const float4_t zCwD = float4_shuf_zCwD(_a, _b);
+		const float4_t result = float4_shuf_xyAB(xAyB, zCwD);
+
+		return result;
+	}
+
+	// Returns (a.y, b.y, a.w, b.w).
+	BX_FLOAT4_INLINE float4_t float4_shuf_yBwD_ni(float4_t _a, float4_t _b)
+	{
+		const float4_t xAyB = float4_shuf_xAyB(_a, _b);
+		const float4_t zCwD = float4_shuf_zCwD(_a, _b);
+		const float4_t result = float4_shuf_zwCD(xAyB, zCwD);
+
+		return result;
+	}
+
+	// Multiply-add: _a*_b + _c.
+	BX_FLOAT4_INLINE float4_t float4_madd_ni(float4_t _a, float4_t _b, float4_t _c)
+	{
+		const float4_t mul = float4_mul(_a, _b);
+		const float4_t result = float4_add(mul, _c);
+
+		return result;
+	}
+
+	// Negated multiply-subtract: _c - _a*_b.
+	BX_FLOAT4_INLINE float4_t float4_nmsub_ni(float4_t _a, float4_t _b, float4_t _c)
+	{
+		const float4_t mul = float4_mul(_a, _b);
+		const float4_t result = float4_sub(_c, mul);
+
+		return result;
+	}
+
+	// _a/_b from a reciprocal estimate refined by one Newton-Raphson step:
+	// q0 = _a*est, result = q0 + q0*(oneish - _b*est).
+	// 0x3f800001 is the bit pattern of the float just above 1.0.
+	BX_FLOAT4_INLINE float4_t float4_div_nr_ni(float4_t _a, float4_t _b)
+	{
+		const float4_t oneish = float4_isplat(0x3f800001);
+		const float4_t est = float4_rcp_est(_b);
+		const float4_t iter0 = float4_mul(_a, est);
+		const float4_t tmp1 = float4_nmsub(_b, est, oneish);
+		const float4_t result = float4_madd(tmp1, iter0, iter0);
+
+		return result;
+	}
+
+	// Reciprocal computed as 1/_a through float4_div.
+	BX_FLOAT4_INLINE float4_t float4_rcp_ni(float4_t _a)
+	{
+		const float4_t one = float4_splat(1.0f);
+		const float4_t result = float4_div(one, _a);
+
+		return result;
+	}
+
+	// OR across lanes: returns (x|y|z|w, 0, 0, 0) of the raw lane bits.
+	BX_FLOAT4_INLINE float4_t float4_orx_ni(float4_t _a)
+	{
+		const float4_t zwxy = float4_swiz_zwxy(_a);
+		const float4_t tmp0 = float4_or(_a, zwxy);   // (x|z, y|w, z|x, w|y)
+		// Splat y|w from the partial result. Splatting y from _a instead
+		// left the w lane out of the reduction.
+		const float4_t tmp1 = float4_swiz_yyyy(tmp0);
+		const float4_t tmp2 = float4_or(tmp0, tmp1); // x lane: x|z|y|w
+		const float4_t mf000 = float4_ild(-1, 0, 0, 0);
+		const float4_t result = float4_and(tmp2, mf000);
+
+		return result;
+	}
+
+	
// Computes ~(_a | _b) on the raw lane bits.
+	// NOTE(review): by analogy with float4_andc (a & ~b) an "orc" would be
+	// _a | ~_b; as written this is a NOR - confirm against float4_ref.h.
+	BX_FLOAT4_INLINE float4_t float4_orc_ni(float4_t _a, float4_t _b)
+	{
+		const float4_t aorb = float4_or(_a, _b);
+		const float4_t mffff = float4_isplat(-1);
+		const float4_t result = float4_xor(aorb, mffff);
+
+		return result;
+	}
+
+	// Negation computed as 0 - _a.
+	BX_FLOAT4_INLINE float4_t float4_neg_ni(float4_t _a)
+	{
+		const float4_t zero = float4_zero();
+		const float4_t result = float4_sub(zero, _a);
+
+		return result;
+	}
+
+	// Bit select built from and/andc/or.
+	// NOTE(review): presumably takes _a where a _mask bit is set and _b
+	// elsewhere (assumes float4_andc(x, m) is x & ~m) - confirm.
+	BX_FLOAT4_INLINE float4_t float4_selb_ni(float4_t _mask, float4_t _a, float4_t _b)
+	{
+		const float4_t sel_a = float4_and(_a, _mask);
+		const float4_t sel_b = float4_andc(_b, _mask);
+		const float4_t result = float4_or(sel_a, sel_b);
+
+		return result;
+	}
+
+	// Select by sign: the arithmetic shift by 31 turns each lane's sign bit
+	// into an all-ones / all-zeros mask that is fed to float4_selb.
+	BX_FLOAT4_INLINE float4_t float4_sels_ni(float4_t _test, float4_t _a, float4_t _b)
+	{
+		const float4_t mask = float4_sra(_test, 31);
+		const float4_t result = float4_selb(mask, _a, _b);
+
+		return result;
+	}
+
+	// Bitwise NOT computed as XOR with all ones.
+	BX_FLOAT4_INLINE float4_t float4_not_ni(float4_t _a)
+	{
+		const float4_t mffff = float4_isplat(-1);
+		const float4_t result = float4_xor(_a, mffff);
+
+		return result;
+	}
+
+	// Absolute value computed as max(-_a, _a).
+	BX_FLOAT4_INLINE float4_t float4_abs_ni(float4_t _a)
+	{
+		const float4_t a_neg = float4_neg(_a);
+		const float4_t result = float4_max(a_neg, _a);
+
+		return result;
+	}
+
+	// Clamp computed as max(min(_a, _max), _min).
+	BX_FLOAT4_INLINE float4_t float4_clamp_ni(float4_t _a, float4_t _min, float4_t _max)
+	{
+		const float4_t tmp = float4_min(_a, _max);
+		const float4_t result = float4_max(tmp, _min);
+
+		return result;
+	}
+
+	// Linear interpolation; assuming float4_madd(a, b, c) is a*b + c this
+	// evaluates _a + _s*(_b - _a).
+	BX_FLOAT4_INLINE float4_t float4_lerp_ni(float4_t _a, float4_t _b, float4_t _s)
+	{
+		const float4_t ba = float4_sub(_b, _a);
+		const float4_t result = float4_madd(_s, ba, _a);
+
+		return result;
+	}
+
+	// Square root from the reciprocal square root estimate plus one
+	// refinement step.
+	// NOTE(review): starts from rsqrt_est(_a)*_a, so _a == 0 presumably
+	// produces inf*0 = NaN instead of 0 - confirm.
+	BX_FLOAT4_INLINE float4_t float4_sqrt_nr_ni(float4_t _a)
+	{
+		const float4_t half = float4_splat(0.5f);
+		const float4_t one = float4_splat(1.0f);
+		const float4_t zero = float4_zero();
+		const float4_t tmp0 = float4_rsqrt_est(_a);
+		const float4_t tmp1 = float4_madd(tmp0, _a, zero);
+		const float4_t tmp2 = float4_madd(tmp1, half, zero);
+		const float4_t tmp3 = float4_nmsub(tmp0, tmp1, one);
+		const float4_t result = float4_madd(tmp3, tmp2, tmp1);
+
+		return result;
+	}
+
+	// Reciprocal square root computed as 1 / float4_sqrt(_a).
+	BX_FLOAT4_INLINE float4_t float4_rsqrt_ni(float4_t _a)
+	{
+		const float4_t one = float4_splat(1.0f);
+		const float4_t sqrt = float4_sqrt(_a);
+		const float4_t result = float4_div(one, sqrt);
+
+		return result;
+	}
+
+	// Reciprocal square root estimate refined by one Newton-Raphson step:
+	// 0.5*e*(3 - _a*e*e) with e = float4_rsqrt_est(_a).
+	BX_FLOAT4_INLINE float4_t float4_rsqrt_nr_ni(float4_t _a)
+	{
+		const float4_t rsqrt = float4_rsqrt_est(_a);
+		const float4_t iter0 = float4_mul(_a, rsqrt);
+		const float4_t iter1 = float4_mul(iter0, rsqrt);
+		const float4_t half = float4_splat(0.5f);
+		const float4_t half_rsqrt = float4_mul(half, rsqrt);
+		const float4_t three = float4_splat(3.0f);
+		const float4_t three_sub_iter1 = float4_sub(three, iter1);
+		const float4_t result = float4_mul(half_rsqrt, three_sub_iter1);
+
+		return result;
+	}
+
+	// Reciprocal square root via the integer "magic constant" trick: the
+	// initial guess is 0x5f3759df - (bits(_a) >> 1), followed by one
+	// refinement step g*(1.5 - 0.5*_a*g*g).
+	BX_FLOAT4_INLINE float4_t float4_rsqrt_carmack_ni(float4_t _a)
+	{
+		const float4_t half = float4_splat(0.5f);
+		const float4_t ah = float4_mul(half, _a);
+		const float4_t ashift = float4_sra(_a, 1);
+		const float4_t magic = float4_isplat(0x5f3759df);
+		const float4_t msuba = float4_isub(magic, ashift);
+		const float4_t msubasq = float4_mul(msuba, msuba);
+		const float4_t tmp0 = float4_splat(1.5f);
+		const float4_t tmp1 = float4_mul(ah, msubasq);
+		const float4_t tmp2 = float4_sub(tmp0, tmp1);
+		const float4_t result = float4_mul(msuba, tmp2);
+
+		return result;
+	}
+
+	// Polynomial helpers. float4_polyN(_a, c0, ..., cN) nests one
+	// float4_madd per degree (Horner form); assuming float4_madd(a, b, c)
+	// is a*b + c this evaluates c0 + c1*_a + ... + cN*_a^N per lane.
+	namespace float4_logexp_detail
+	{
+		// Degree 0: the constant _b in every lane; _a is unused.
+		BX_FLOAT4_INLINE float4_t float4_poly0(float4_t _a, float _b)
+		{
+			return float4_splat(_b);
+		}
+
+		BX_FLOAT4_INLINE float4_t float4_poly1(float4_t _a, float _b, float _c)
+		{
+			const float4_t bbbb = float4_splat(_b);
+			const float4_t poly0 = float4_poly0(_a, _c);
+			const float4_t result = float4_madd(poly0, _a, bbbb);
+
+			return result;
+		}
+
+		BX_FLOAT4_INLINE float4_t float4_poly2(float4_t _a, float _b, float _c, float _d)
+		{
+			const float4_t bbbb = float4_splat(_b);
+			const float4_t poly = float4_poly1(_a, _c, _d);
+			const float4_t result = float4_madd(poly, _a, bbbb);
+
+			return result;
+		}
+
+		BX_FLOAT4_INLINE float4_t float4_poly3(float4_t _a, float _b, float _c, float _d, float _e)
+		{
+			const float4_t bbbb = float4_splat(_b);
+			const float4_t poly = float4_poly2(_a, _c, _d, _e);
+			const float4_t result = float4_madd(poly, _a, bbbb);
+
+			return result;
+		}
+
+		BX_FLOAT4_INLINE float4_t float4_poly4(float4_t _a, float _b, float _c, float _d, float _e, float _f)
+		{
+			const float4_t bbbb = float4_splat(_b);
+			const float4_t poly = float4_poly3(_a, _c, _d, _e, _f);
+			const float4_t result = float4_madd(poly, _a, bbbb);
+
+			return result;
+		}
+
+		BX_FLOAT4_INLINE float4_t float4_poly5(float4_t _a, float _b, float _c, float _d, float _e, float _f, float _g)
+		{
+			const float4_t bbbb = float4_splat(_b);
+			const float4_t poly = float4_poly4(_a, _c, _d, _e, _f, _g);
+			const float4_t result = float4_madd(poly, _a, bbbb);
+
+			return result;
+		}
+
+		// Fixed-coefficient polynomial; the #if chain selects the degree
+		// (5 is active, 4/3/2 are disabled alternatives).
+		// NOTE(review): presumably the approximation used by the log
+		// implementation - the caller is not visible here; confirm.
+		BX_FLOAT4_INLINE float4_t float4_logpoly(float4_t _a)
+		{
+#if 1
+			const float4_t result = float4_poly5(_a
+				, 3.11578814719469302614f, -3.32419399085241980044f
+				, 2.59883907202499966007f, -1.23152682416275988241f
+				, 0.318212422185251071475f, -0.0344359067839062357313f
+				);
+#elif 0
+			const float4_t result = float4_poly4(_a
+				, 2.8882704548164776201f, -2.52074962577807006663f
+				, 1.48116647521213171641f, -0.465725644288844778798f
+				, 0.0596515482674574969533f
+				);
+#elif 0
+			const float4_t result = float4_poly3(_a
+				, 2.61761038894603480148f, -1.75647175389045657003f
+				, 0.688243882994381274313f, -0.107254423828329604454f
+				);
+#else
+			const float4_t result = float4_poly2(_a
+				, 2.28330284476918490682f, -1.04913055217340124191f
+				, 0.204446009836232697516f
+				);
+#endif
+
+			return result;
+		}
+
+		BX_FLOAT4_INLINE float4_t float4_exppoly(float4_t _a)
+		{
+#if 1
+			const float4_t result = float4_poly5(_a
+				, 9.9999994e-1f, 6.9315308e-1f
+				, 2.4015361e-1f, 5.5826318e-2f
+				, 8.9893397e-3f, 1.8775767e-3f
+				);
+#elif 0
+			const float4_t result = 
float4_poly4(_a + , 1.0000026f, 6.9300383e-1f + , 2.4144275e-1f, 5.2011464e-2f + , 1.3534167e-2f + ); +#elif 0 + const float4_t result = float4_poly3(_a + , 9.9992520e-1f, 6.9583356e-1f + , 2.2606716e-1f, 7.8024521e-2f + ); +#else + const float4_t result = float4_poly2(_a + , 1.0017247f, 6.5763628e-1f + , 3.3718944e-1f + ); +#endif // 0 + + return result; + } + } // namespace float4_internal + + BX_FLOAT4_INLINE float4_t float4_log2_ni(float4_t _a) + { + const float4_t expmask = float4_isplat(0x7f800000); + const float4_t mantmask = float4_isplat(0x007fffff); + const float4_t one = float4_splat(1.0f); + + const float4_t c127 = float4_isplat(127); + const float4_t aexp = float4_and(_a, expmask); + const float4_t aexpsr = float4_srl(aexp, 23); + const float4_t tmp0 = float4_isub(aexpsr, c127); + const float4_t exp = float4_itof(tmp0); + + const float4_t amask = float4_and(_a, mantmask); + const float4_t mant = float4_or(amask, one); + + const float4_t poly = float4_logexp_detail::float4_logpoly(mant); + + const float4_t mandiff = float4_sub(mant, one); + const float4_t result = float4_madd(poly, mandiff, exp); + + return result; + } + + BX_FLOAT4_INLINE float4_t float4_exp2_ni(float4_t _a) + { + const float4_t min = float4_splat( 129.0f); + const float4_t max = float4_splat(-126.99999f); + const float4_t tmp0 = float4_min(_a, min); + const float4_t aaaa = float4_max(tmp0, max); + + const float4_t half = float4_splat(0.5f); + const float4_t tmp2 = float4_sub(aaaa, half); + const float4_t ipart = float4_ftoi(tmp2); + const float4_t iround = float4_itof(ipart); + const float4_t fpart = float4_sub(aaaa, iround); + + const float4_t c127 = float4_isplat(127); + const float4_t tmp5 = float4_iadd(ipart, c127); + const float4_t expipart = float4_sll(tmp5, 23); + + const float4_t expfpart = float4_logexp_detail::float4_exppoly(fpart); + + const float4_t result = float4_mul(expipart, expfpart); + + return result; + } + + BX_FLOAT4_INLINE float4_t float4_pow_ni(float4_t _a, 
float4_t _b) + { + const float4_t alog2 = float4_log2(_a); + const float4_t alog2b = float4_mul(alog2, _b); + const float4_t result = float4_exp2(alog2b); + + return result; + } + + BX_FLOAT4_INLINE float4_t float4_dot3_ni(float4_t _a, float4_t _b) + { + const float4_t xyzw = float4_mul(_a, _b); + const float4_t xxxx = float4_swiz_xxxx(xyzw); + const float4_t yyyy = float4_swiz_yyyy(xyzw); + const float4_t zzzz = float4_swiz_zzzz(xyzw); + const float4_t tmp1 = float4_add(xxxx, yyyy); + const float4_t result = float4_add(zzzz, tmp1); + return result; + } + + BX_FLOAT4_INLINE float4_t float4_cross3_ni(float4_t _a, float4_t _b) + { + const float4_t a_yzxw = float4_swiz_yzxw(_a); + const float4_t a_zxyw = float4_swiz_zxyw(_a); + const float4_t b_zxyw = float4_swiz_zxyw(_b); + const float4_t b_yzxw = float4_swiz_yzxw(_b); + const float4_t tmp = float4_mul(a_yzxw, b_zxyw); + const float4_t result = float4_nmsub(a_zxyw, b_yzxw, tmp); + + return result; + } + + BX_FLOAT4_INLINE float4_t float4_normalize3_ni(float4_t _a) + { + const float4_t dot3 = float4_dot3(_a, _a); + const float4_t invSqrt = float4_rsqrt(dot3); + const float4_t result = float4_mul(_a, invSqrt); + + return result; + } + + BX_FLOAT4_INLINE float4_t float4_dot_ni(float4_t _a, float4_t _b) + { + const float4_t xyzw = float4_mul(_a, _b); + const float4_t yzwx = float4_swiz_yzwx(xyzw); + const float4_t tmp0 = float4_add(xyzw, yzwx); + const float4_t zwxy = float4_swiz_zwxy(tmp0); + const float4_t result = float4_add(tmp0, zwxy); + + return result; + } + +} // namespace bx + +#endif // __BX_FLOAT4_NI_H__ diff --git a/include/bx/float4_ref.h b/include/bx/float4_ref.h new file mode 100644 index 0000000..bf74b40 --- /dev/null +++ b/include/bx/float4_ref.h @@ -0,0 +1,522 @@ +/* + * Copyright 2010-2012 Branimir Karadzic. All rights reserved. 
+ * License: http://www.opensource.org/licenses/BSD-2-Clause + */ + +#ifndef __BX_FLOAT4_REF_H__ +#define __BX_FLOAT4_REF_H__ + +#include // sqrtf + +namespace bx +{ + typedef union float4_t + { + int32_t ixyzw[4]; + uint32_t uxyzw[4]; + float fxyzw[4]; + + } float4_t; + +#define ELEMx 0 +#define ELEMy 1 +#define ELEMz 2 +#define ELEMw 3 +#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ + BX_FLOAT4_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \ + { \ + float4_t result; \ + result.ixyzw[0] = _a.ixyzw[ELEM##_x]; \ + result.ixyzw[1] = _a.ixyzw[ELEM##_y]; \ + result.ixyzw[2] = _a.ixyzw[ELEM##_z]; \ + result.ixyzw[3] = _a.ixyzw[ELEM##_w]; \ + return result; \ + } + +#include "float4_swizzle.inl" + +#undef IMPLEMENT_SWIZZLE +#undef ELEMw +#undef ELEMz +#undef ELEMy +#undef ELEMx + +#define IMPLEMENT_TEST(_xyzw, _mask) \ + BX_FLOAT4_INLINE bool float4_test_any_##_xyzw(float4_t _test) \ + { \ + uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \ + | ( (_test.uxyzw[2]>>31)<<2) \ + | ( (_test.uxyzw[1]>>31)<<1) \ + | (_test.uxyzw[0]>>31) \ + ; \ + return 0 != (tmp&(_mask) ); \ + } \ + \ + BX_FLOAT4_INLINE bool float4_test_all_##_xyzw(float4_t _test) \ + { \ + uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \ + | ( (_test.uxyzw[2]>>31)<<2) \ + | ( (_test.uxyzw[1]>>31)<<1) \ + | (_test.uxyzw[0]>>31) \ + ; \ + return (_mask) == (tmp&(_mask) ); \ + } + +IMPLEMENT_TEST(x , 0x1); +IMPLEMENT_TEST(y , 0x2); +IMPLEMENT_TEST(xy , 0x3); +IMPLEMENT_TEST(z , 0x4); +IMPLEMENT_TEST(xz , 0x5); +IMPLEMENT_TEST(yz , 0x6); +IMPLEMENT_TEST(xyz , 0x7); +IMPLEMENT_TEST(w , 0x8); +IMPLEMENT_TEST(xw , 0x9); +IMPLEMENT_TEST(yw , 0xa); +IMPLEMENT_TEST(xyw , 0xb); +IMPLEMENT_TEST(zw , 0xc); +IMPLEMENT_TEST(xzw , 0xd); +IMPLEMENT_TEST(yzw , 0xe); +IMPLEMENT_TEST(xyzw , 0xf); + +#undef IMPLEMENT_TEST + + BX_FLOAT4_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b) + { + float4_t result; + result.uxyzw[0] = _a.uxyzw[0]; + result.uxyzw[1] = _a.uxyzw[1]; + result.uxyzw[2] = _b.uxyzw[0]; + result.uxyzw[3] 
= _b.uxyzw[1]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b) + { + float4_t result; + result.uxyzw[0] = _b.uxyzw[0]; + result.uxyzw[1] = _b.uxyzw[1]; + result.uxyzw[2] = _a.uxyzw[0]; + result.uxyzw[3] = _a.uxyzw[1]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b) + { + float4_t result; + result.uxyzw[0] = _b.uxyzw[2]; + result.uxyzw[1] = _b.uxyzw[3]; + result.uxyzw[2] = _a.uxyzw[2]; + result.uxyzw[3] = _a.uxyzw[3]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b) + { + float4_t result; + result.uxyzw[0] = _a.uxyzw[2]; + result.uxyzw[1] = _a.uxyzw[3]; + result.uxyzw[2] = _b.uxyzw[2]; + result.uxyzw[3] = _b.uxyzw[3]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b) + { + float4_t result; + result.uxyzw[0] = _a.uxyzw[0]; + result.uxyzw[1] = _b.uxyzw[0]; + result.uxyzw[2] = _a.uxyzw[1]; + result.uxyzw[3] = _b.uxyzw[1]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b) + { + float4_t result; + result.uxyzw[0] = _a.uxyzw[1]; + result.uxyzw[1] = _b.uxyzw[1]; + result.uxyzw[2] = _a.uxyzw[0]; + result.uxyzw[3] = _b.uxyzw[0]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b) + { + float4_t result; + result.uxyzw[0] = _a.uxyzw[2]; + result.uxyzw[1] = _b.uxyzw[2]; + result.uxyzw[2] = _a.uxyzw[3]; + result.uxyzw[3] = _b.uxyzw[3]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b) + { + float4_t result; + result.uxyzw[0] = _b.uxyzw[2]; + result.uxyzw[1] = _a.uxyzw[2]; + result.uxyzw[2] = _b.uxyzw[3]; + result.uxyzw[3] = _a.uxyzw[3]; + return result; + } + + BX_FLOAT4_INLINE float float4_x(float4_t _a) + { + return _a.fxyzw[0]; + } + + BX_FLOAT4_INLINE float float4_y(float4_t _a) + { + return _a.fxyzw[1]; + } + + BX_FLOAT4_INLINE float float4_z(float4_t _a) + { + return 
_a.fxyzw[2]; + } + + BX_FLOAT4_INLINE float float4_w(float4_t _a) + { + return _a.fxyzw[3]; + } + + BX_FLOAT4_INLINE float4_t float4_ld(const void* _ptr) + { + return *reinterpret_cast(_ptr); + } + + BX_FLOAT4_INLINE void float4_st(void* _ptr, float4_t _a) + { + *reinterpret_cast(_ptr) = _a; + } + + BX_FLOAT4_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w) + { + float4_t result; + result.fxyzw[0] = _x; + result.fxyzw[1] = _y; + result.fxyzw[2] = _z; + result.fxyzw[3] = _w; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) + { + float4_t result; + result.uxyzw[0] = _x; + result.uxyzw[1] = _y; + result.uxyzw[2] = _z; + result.uxyzw[3] = _w; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_splat(const void* _ptr) + { + float val = *reinterpret_cast(_ptr); + return float4_ld(val, val, val, val); + } + + BX_FLOAT4_INLINE float4_t float4_splat(float _a) + { + return float4_ld(_a, _a, _a, _a); + } + + BX_FLOAT4_INLINE float4_t float4_isplat(uint32_t _a) + { + return float4_ild(_a, _a, _a, _a); + } + + BX_FLOAT4_INLINE float4_t float4_zero() + { + return float4_ild(0, 0, 0, 0); + } + + BX_FLOAT4_INLINE float4_t float4_itof(float4_t _a) + { + float4_t result; + result.fxyzw[0] = (float)result.ixyzw[0]; + result.fxyzw[1] = (float)result.ixyzw[1]; + result.fxyzw[2] = (float)result.ixyzw[2]; + result.fxyzw[3] = (float)result.ixyzw[3]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_ftoi(float4_t _a) + { + float4_t result; + result.ixyzw[0] = (int)result.fxyzw[0]; + result.ixyzw[1] = (int)result.fxyzw[1]; + result.ixyzw[2] = (int)result.fxyzw[2]; + result.ixyzw[3] = (int)result.fxyzw[3]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_round(float4_t _a) + { + const float4_t tmp = float4_ftoi(_a); + const float4_t result = float4_itof(tmp); + + return result; + } + + BX_FLOAT4_INLINE float4_t float4_add(float4_t _a, float4_t _b) + { + float4_t result; + result.fxyzw[0] 
= _a.fxyzw[0] + _b.fxyzw[0]; + result.fxyzw[1] = _a.fxyzw[1] + _b.fxyzw[1]; + result.fxyzw[2] = _a.fxyzw[2] + _b.fxyzw[2]; + result.fxyzw[3] = _a.fxyzw[3] + _b.fxyzw[3]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_sub(float4_t _a, float4_t _b) + { + float4_t result; + result.fxyzw[0] = _a.fxyzw[0] - _b.fxyzw[0]; + result.fxyzw[1] = _a.fxyzw[1] - _b.fxyzw[1]; + result.fxyzw[2] = _a.fxyzw[2] - _b.fxyzw[2]; + result.fxyzw[3] = _a.fxyzw[3] - _b.fxyzw[3]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_mul(float4_t _a, float4_t _b) + { + float4_t result; + result.fxyzw[0] = _a.fxyzw[0] * _b.fxyzw[0]; + result.fxyzw[1] = _a.fxyzw[1] * _b.fxyzw[1]; + result.fxyzw[2] = _a.fxyzw[2] * _b.fxyzw[2]; + result.fxyzw[3] = _a.fxyzw[3] * _b.fxyzw[3]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_div(float4_t _a, float4_t _b) + { + float4_t result; + result.fxyzw[0] = _a.fxyzw[0] * _b.fxyzw[0]; + result.fxyzw[1] = _a.fxyzw[1] * _b.fxyzw[1]; + result.fxyzw[2] = _a.fxyzw[2] * _b.fxyzw[2]; + result.fxyzw[3] = _a.fxyzw[3] * _b.fxyzw[3]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_rcp_est(float4_t _a) + { + float4_t result; + result.fxyzw[0] = 1.0f / _a.fxyzw[0]; + result.fxyzw[1] = 1.0f / _a.fxyzw[1]; + result.fxyzw[2] = 1.0f / _a.fxyzw[2]; + result.fxyzw[3] = 1.0f / _a.fxyzw[3]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_sqrt(float4_t _a) + { + float4_t result; + result.fxyzw[0] = sqrtf(_a.fxyzw[0]); + result.fxyzw[1] = sqrtf(_a.fxyzw[1]); + result.fxyzw[2] = sqrtf(_a.fxyzw[2]); + result.fxyzw[3] = sqrtf(_a.fxyzw[3]); + return result; + } + + BX_FLOAT4_INLINE float4_t float4_rsqrt_est(float4_t _a) + { + float4_t result; + result.fxyzw[0] = 1.0f / sqrtf(_a.fxyzw[0]); + result.fxyzw[1] = 1.0f / sqrtf(_a.fxyzw[1]); + result.fxyzw[2] = 1.0f / sqrtf(_a.fxyzw[2]); + result.fxyzw[3] = 1.0f / sqrtf(_a.fxyzw[3]); + return result; + } + + BX_FLOAT4_INLINE float4_t float4_cmpeq(float4_t _a, float4_t _b) + { + float4_t result; + 
result.ixyzw[0] = _a.fxyzw[0] == _b.fxyzw[0] ? 0xffffffff : 0x0; + result.ixyzw[1] = _a.fxyzw[1] == _b.fxyzw[1] ? 0xffffffff : 0x0; + result.ixyzw[2] = _a.fxyzw[2] == _b.fxyzw[2] ? 0xffffffff : 0x0; + result.ixyzw[3] = _a.fxyzw[3] == _b.fxyzw[3] ? 0xffffffff : 0x0; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_cmplt(float4_t _a, float4_t _b) + { + float4_t result; + result.ixyzw[0] = _a.fxyzw[0] < _b.fxyzw[0] ? 0xffffffff : 0x0; + result.ixyzw[1] = _a.fxyzw[1] < _b.fxyzw[1] ? 0xffffffff : 0x0; + result.ixyzw[2] = _a.fxyzw[2] < _b.fxyzw[2] ? 0xffffffff : 0x0; + result.ixyzw[3] = _a.fxyzw[3] < _b.fxyzw[3] ? 0xffffffff : 0x0; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_cmple(float4_t _a, float4_t _b) + { + float4_t result; + result.ixyzw[0] = _a.fxyzw[0] <= _b.fxyzw[0] ? 0xffffffff : 0x0; + result.ixyzw[1] = _a.fxyzw[1] <= _b.fxyzw[1] ? 0xffffffff : 0x0; + result.ixyzw[2] = _a.fxyzw[2] <= _b.fxyzw[2] ? 0xffffffff : 0x0; + result.ixyzw[3] = _a.fxyzw[3] <= _b.fxyzw[3] ? 0xffffffff : 0x0; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_cmpgt(float4_t _a, float4_t _b) + { + float4_t result; + result.ixyzw[0] = _a.fxyzw[0] > _b.fxyzw[0] ? 0xffffffff : 0x0; + result.ixyzw[1] = _a.fxyzw[1] > _b.fxyzw[1] ? 0xffffffff : 0x0; + result.ixyzw[2] = _a.fxyzw[2] > _b.fxyzw[2] ? 0xffffffff : 0x0; + result.ixyzw[3] = _a.fxyzw[3] > _b.fxyzw[3] ? 0xffffffff : 0x0; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_cmpge(float4_t _a, float4_t _b) + { + float4_t result; + result.ixyzw[0] = _a.fxyzw[0] >= _b.fxyzw[0] ? 0xffffffff : 0x0; + result.ixyzw[1] = _a.fxyzw[1] >= _b.fxyzw[1] ? 0xffffffff : 0x0; + result.ixyzw[2] = _a.fxyzw[2] >= _b.fxyzw[2] ? 0xffffffff : 0x0; + result.ixyzw[3] = _a.fxyzw[3] >= _b.fxyzw[3] ? 0xffffffff : 0x0; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_min(float4_t _a, float4_t _b) + { + float4_t result; + result.fxyzw[0] = _a.fxyzw[0] < _b.fxyzw[0] ? 
_a.fxyzw[0] : _b.fxyzw[0]; + result.fxyzw[1] = _a.fxyzw[1] < _b.fxyzw[1] ? _a.fxyzw[1] : _b.fxyzw[1]; + result.fxyzw[2] = _a.fxyzw[2] < _b.fxyzw[2] ? _a.fxyzw[2] : _b.fxyzw[2]; + result.fxyzw[3] = _a.fxyzw[3] < _b.fxyzw[3] ? _a.fxyzw[3] : _b.fxyzw[3]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_max(float4_t _a, float4_t _b) + { + float4_t result; + result.fxyzw[0] = _a.fxyzw[0] > _b.fxyzw[0] ? _a.fxyzw[0] : _b.fxyzw[0]; + result.fxyzw[1] = _a.fxyzw[1] > _b.fxyzw[1] ? _a.fxyzw[1] : _b.fxyzw[1]; + result.fxyzw[2] = _a.fxyzw[2] > _b.fxyzw[2] ? _a.fxyzw[2] : _b.fxyzw[2]; + result.fxyzw[3] = _a.fxyzw[3] > _b.fxyzw[3] ? _a.fxyzw[3] : _b.fxyzw[3]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_and(float4_t _a, float4_t _b) + { + float4_t result; + result.uxyzw[0] = _a.uxyzw[0] & _b.uxyzw[0]; + result.uxyzw[1] = _a.uxyzw[1] & _b.uxyzw[1]; + result.uxyzw[2] = _a.uxyzw[2] & _b.uxyzw[2]; + result.uxyzw[3] = _a.uxyzw[3] & _b.uxyzw[3]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_andc(float4_t _a, float4_t _b) + { + float4_t result; + result.uxyzw[0] = _a.uxyzw[0] & ~_b.uxyzw[0]; + result.uxyzw[1] = _a.uxyzw[1] & ~_b.uxyzw[1]; + result.uxyzw[2] = _a.uxyzw[2] & ~_b.uxyzw[2]; + result.uxyzw[3] = _a.uxyzw[3] & ~_b.uxyzw[3]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_or(float4_t _a, float4_t _b) + { + float4_t result; + result.uxyzw[0] = _a.uxyzw[0] | _b.uxyzw[0]; + result.uxyzw[1] = _a.uxyzw[1] | _b.uxyzw[1]; + result.uxyzw[2] = _a.uxyzw[2] | _b.uxyzw[2]; + result.uxyzw[3] = _a.uxyzw[3] | _b.uxyzw[3]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_xor(float4_t _a, float4_t _b) + { + float4_t result; + result.uxyzw[0] = _a.uxyzw[0] ^ _b.uxyzw[0]; + result.uxyzw[1] = _a.uxyzw[1] ^ _b.uxyzw[1]; + result.uxyzw[2] = _a.uxyzw[2] ^ _b.uxyzw[2]; + result.uxyzw[3] = _a.uxyzw[3] ^ _b.uxyzw[3]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_sll(float4_t _a, int _count) + { + float4_t result; + result.uxyzw[0] = 
_a.uxyzw[0] << _count; + result.uxyzw[1] = _a.uxyzw[1] << _count; + result.uxyzw[2] = _a.uxyzw[2] << _count; + result.uxyzw[3] = _a.uxyzw[3] << _count; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_srl(float4_t _a, int _count) + { + float4_t result; + result.uxyzw[0] = _a.uxyzw[0] >> _count; + result.uxyzw[1] = _a.uxyzw[1] >> _count; + result.uxyzw[2] = _a.uxyzw[2] >> _count; + result.uxyzw[3] = _a.uxyzw[3] >> _count; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_sra(float4_t _a, int _count) + { + float4_t result; + result.ixyzw[0] = _a.ixyzw[0] >> _count; + result.ixyzw[1] = _a.ixyzw[1] >> _count; + result.ixyzw[2] = _a.ixyzw[2] >> _count; + result.ixyzw[3] = _a.ixyzw[3] >> _count; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_iadd(float4_t _a, float4_t _b) + { + float4_t result; + result.ixyzw[0] = _a.ixyzw[0] + _b.ixyzw[0]; + result.ixyzw[1] = _a.ixyzw[1] + _b.ixyzw[1]; + result.ixyzw[2] = _a.ixyzw[2] + _b.ixyzw[2]; + result.ixyzw[3] = _a.ixyzw[3] + _b.ixyzw[3]; + return result; + } + + BX_FLOAT4_INLINE float4_t float4_isub(float4_t _a, float4_t _b) + { + float4_t result; + result.ixyzw[0] = _a.ixyzw[0] - _b.ixyzw[0]; + result.ixyzw[1] = _a.ixyzw[1] - _b.ixyzw[1]; + result.ixyzw[2] = _a.ixyzw[2] - _b.ixyzw[2]; + result.ixyzw[3] = _a.ixyzw[3] - _b.ixyzw[3]; + return result; + } + +} // namespace bx + +#define float4_shuf_xAzC float4_shuf_xAzC_ni +#define float4_shuf_yBwD float4_shuf_yBwD_ni +#define float4_rcp float4_rcp_ni +#define float4_orx float4_orx_ni +#define float4_orc float4_orc_ni +#define float4_neg float4_neg_ni +#define float4_madd float4_madd_ni +#define float4_nmsub float4_nmsub_ni +#define float4_div_nr float4_div_nr_ni +#define float4_selb float4_selb_ni +#define float4_sels float4_sels_ni +#define float4_not float4_not_ni +#define float4_abs float4_abs_ni +#define float4_clamp float4_clamp_ni +#define float4_lerp float4_lerp_ni +#define float4_rsqrt float4_rsqrt_ni +#define float4_rsqrt_nr float4_rsqrt_nr_ni 
+#define float4_rsqrt_carmack float4_rsqrt_carmack_ni +#define float4_sqrt_nr float4_sqrt_nr_ni +#define float4_log2 float4_log2_ni +#define float4_exp2 float4_exp2_ni +#define float4_pow float4_pow_ni +#define float4_cross3 float4_cross3_ni +#define float4_normalize3 float4_normalize3_ni +#define float4_dot3 float4_dot3_ni +#define float4_dot float4_dot_ni +#include "float4_ni.h" + +#endif // __BX_FLOAT4_REF_H__ diff --git a/include/bx/float4_sse.h b/include/bx/float4_sse.h new file mode 100644 index 0000000..ff14710 --- /dev/null +++ b/include/bx/float4_sse.h @@ -0,0 +1,400 @@ +/* + * Copyright 2010-2011 Branimir Karadzic. All rights reserved. + * License: http://www.opensource.org/licenses/BSD-2-Clause + */ + +#ifndef __BX_FLOAT4_SSE_H__ +#define __BX_FLOAT4_SSE_H__ + +#if !defined(__SSE2__) +# error "float4_t requires at least SSE2" +#endif // !defined(__SSE2__) + +#include + +#include // __m128i +#if defined(__SSE4_1__) +# include +#endif // defined(__SSE4_1__) +#include // __m128 + +namespace bx +{ + + typedef __m128 float4_t; + +#define ELEMx 0 +#define ELEMy 1 +#define ELEMz 2 +#define ELEMw 3 +#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ + BX_FLOAT4_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \ + { \ + return _mm_shuffle_ps( _a, _a, _MM_SHUFFLE(ELEM##_w, ELEM##_z, ELEM##_y, ELEM##_x ) ); \ + } + +#include "float4_swizzle.inl" + +#undef IMPLEMENT_SWIZZLE +#undef ELEMw +#undef ELEMz +#undef ELEMy +#undef ELEMx + +#define IMPLEMENT_TEST(_xyzw, _mask) \ + BX_FLOAT4_INLINE bool float4_test_any_##_xyzw(float4_t _test) \ + { \ + return 0x0 != (_mm_movemask_ps(_test)&(_mask) ); \ + } \ + \ + BX_FLOAT4_INLINE bool float4_test_all_##_xyzw(float4_t _test) \ + { \ + return (_mask) == (_mm_movemask_ps(_test)&(_mask) ); \ + } + +IMPLEMENT_TEST(x , 0x1); +IMPLEMENT_TEST(y , 0x2); +IMPLEMENT_TEST(xy , 0x3); +IMPLEMENT_TEST(z , 0x4); +IMPLEMENT_TEST(xz , 0x5); +IMPLEMENT_TEST(yz , 0x6); +IMPLEMENT_TEST(xyz , 0x7); +IMPLEMENT_TEST(w , 0x8); +IMPLEMENT_TEST(xw , 
0x9); +IMPLEMENT_TEST(yw , 0xa); +IMPLEMENT_TEST(xyw , 0xb); +IMPLEMENT_TEST(zw , 0xc); +IMPLEMENT_TEST(xzw , 0xd); +IMPLEMENT_TEST(yzw , 0xe); +IMPLEMENT_TEST(xyzw , 0xf); + +#undef IMPLEMENT_TEST + + BX_FLOAT4_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b) + { + return _mm_movelh_ps(_a, _b); + } + + BX_FLOAT4_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b) + { + return _mm_movelh_ps(_b, _a); + } + + BX_FLOAT4_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b) + { + return _mm_movehl_ps(_a, _b); + } + + BX_FLOAT4_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b) + { + return _mm_movehl_ps(_b, _a); + } + + BX_FLOAT4_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b) + { + return _mm_unpacklo_ps(_a, _b); + } + + BX_FLOAT4_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b) + { + return _mm_unpacklo_ps(_b, _a); + } + + BX_FLOAT4_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b) + { + return _mm_unpackhi_ps(_a, _b); + } + + BX_FLOAT4_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b) + { + return _mm_unpackhi_ps(_b, _a); + } + + BX_FLOAT4_INLINE float float4_x(float4_t _a) + { + return _mm_cvtss_f32(_a); + } + + BX_FLOAT4_INLINE float float4_y(float4_t _a) + { + const float4_t yyyy = float4_swiz_yyyy(_a); + const float result = _mm_cvtss_f32(yyyy); + + return result; + } + + BX_FLOAT4_INLINE float float4_z(float4_t _a) + { + const float4_t zzzz = float4_swiz_zzzz(_a); + const float result = _mm_cvtss_f32(zzzz); + + return result; + } + + BX_FLOAT4_INLINE float float4_w(float4_t _a) + { + const float4_t wwww = float4_swiz_wwww(_a); + const float result = _mm_cvtss_f32(wwww); + + return result; + } + + BX_FLOAT4_INLINE float4_t float4_ld(const void* _ptr) + { + return _mm_load_ps(reinterpret_cast(_ptr) ); + } + + BX_FLOAT4_INLINE void float4_st(void* _ptr, float4_t _a) + { + _mm_store_ps(reinterpret_cast(_ptr), _a); + } + + BX_FLOAT4_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w) 
+ { + return _mm_set_ps(_w, _z, _y, _x); + } + + BX_FLOAT4_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) + { + const __m128i set = _mm_set_epi32(_w, _z, _y, _x); + const float4_t result = _mm_castsi128_ps(set); + + return result; + } + + BX_FLOAT4_INLINE float4_t float4_splat(const void* _ptr) + { + const float4_t x___ = _mm_load_ss(reinterpret_cast(_ptr) ); + const float4_t result = float4_swiz_xxxx(x___); + + return result; + } + + BX_FLOAT4_INLINE float4_t float4_splat(float _a) + { + return _mm_set1_ps(_a); + } + + BX_FLOAT4_INLINE float4_t float4_isplat(uint32_t _a) + { + const __m128i splat = _mm_set1_epi32(_a); + const float4_t result = _mm_castsi128_ps(splat); + + return result; + } + + BX_FLOAT4_INLINE float4_t float4_zero() + { + return _mm_setzero_ps(); + } + + BX_FLOAT4_INLINE float4_t float4_itof(float4_t _a) + { + const __m128i itof = _mm_castps_si128(_a); + const float4_t result = _mm_cvtepi32_ps(itof); + + return result; + } + + BX_FLOAT4_INLINE float4_t float4_ftoi(float4_t _a) + { + const __m128i ftoi = _mm_cvtps_epi32(_a); + const float4_t result = _mm_castsi128_ps(ftoi); + + return result; + } + + BX_FLOAT4_INLINE float4_t float4_round(float4_t _a) + { +#if defined(__SSE4_1__) + return _mm_round_ps(_a, _MM_FROUND_NINT); +#else + const __m128i round = _mm_cvtps_epi32(_a); + const float4_t result = _mm_cvtepi32_ps(round); + + return result; +#endif // defined(__SSE4_1__) + } + + BX_FLOAT4_INLINE float4_t float4_add(float4_t _a, float4_t _b) + { + return _mm_add_ps(_a, _b); + } + + BX_FLOAT4_INLINE float4_t float4_sub(float4_t _a, float4_t _b) + { + return _mm_sub_ps(_a, _b); + } + + BX_FLOAT4_INLINE float4_t float4_mul(float4_t _a, float4_t _b) + { + return _mm_mul_ps(_a, _b); + } + + BX_FLOAT4_INLINE float4_t float4_div(float4_t _a, float4_t _b) + { + return _mm_div_ps(_a, _b); + } + + BX_FLOAT4_INLINE float4_t float4_rcp_est(float4_t _a) + { + return _mm_rcp_ps(_a); + } + + BX_FLOAT4_INLINE float4_t 
float4_sqrt(float4_t _a) + { + return _mm_sqrt_ps(_a); + } + + BX_FLOAT4_INLINE float4_t float4_rsqrt_est(float4_t _a) + { + return _mm_rsqrt_ps(_a); + } + +#if defined(__SSE4_1__) + BX_FLOAT4_INLINE float4_t float4_dot3(float4_t _a, float4_t _b) + { + return _mm_dp_ps(_a, _b, 0x77); + } + + BX_FLOAT4_INLINE float4_t float4_dot(float4_t _a, float4_t _b) + { + return _mm_dp_ps(_a, _b, 0xFF); + } +#endif // defined(__SSE4__) + + BX_FLOAT4_INLINE float4_t float4_cmpeq(float4_t _a, float4_t _b) + { + return _mm_cmpeq_ps(_a, _b); + } + + BX_FLOAT4_INLINE float4_t float4_cmplt(float4_t _a, float4_t _b) + { + return _mm_cmplt_ps(_a, _b); + } + + BX_FLOAT4_INLINE float4_t float4_cmple(float4_t _a, float4_t _b) + { + return _mm_cmple_ps(_a, _b); + } + + BX_FLOAT4_INLINE float4_t float4_cmpgt(float4_t _a, float4_t _b) + { + return _mm_cmpgt_ps(_a, _b); + } + + BX_FLOAT4_INLINE float4_t float4_cmpge(float4_t _a, float4_t _b) + { + return _mm_cmpge_ps(_a, _b); + } + + BX_FLOAT4_INLINE float4_t float4_min(float4_t _a, float4_t _b) + { + return _mm_min_ps(_a, _b); + } + + BX_FLOAT4_INLINE float4_t float4_max(float4_t _a, float4_t _b) + { + return _mm_max_ps(_a, _b); + } + + BX_FLOAT4_INLINE float4_t float4_and(float4_t _a, float4_t _b) + { + return _mm_and_ps(_a, _b); + } + + BX_FLOAT4_INLINE float4_t float4_andc(float4_t _a, float4_t _b) + { + return _mm_andnot_ps(_b, _a); + } + + BX_FLOAT4_INLINE float4_t float4_or(float4_t _a, float4_t _b) + { + return _mm_or_ps(_a, _b); + } + + BX_FLOAT4_INLINE float4_t float4_xor(float4_t _a, float4_t _b) + { + return _mm_xor_ps(_a, _b); + } + + BX_FLOAT4_INLINE float4_t float4_sll(float4_t _a, int _count) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i shift = _mm_slli_epi32(a, _count); + const float4_t result = _mm_castsi128_ps(shift); + + return result; + } + + BX_FLOAT4_INLINE float4_t float4_srl(float4_t _a, int _count) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i shift = _mm_srli_epi32(a, _count); + 
const float4_t result = _mm_castsi128_ps(shift); + + return result; + } + + BX_FLOAT4_INLINE float4_t float4_sra(float4_t _a, int _count) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i shift = _mm_srai_epi32(a, _count); // arithmetic right shift: the sign bit of each 32-bit lane is replicated + const float4_t result = _mm_castsi128_ps(shift); + + return result; + } + + BX_FLOAT4_INLINE float4_t float4_iadd(float4_t _a, float4_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i add = _mm_add_epi32(a, b); // 32-bit integer add on the raw lane bits + const float4_t result = _mm_castsi128_ps(add); + + return result; + } + + BX_FLOAT4_INLINE float4_t float4_isub(float4_t _a, float4_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i sub = _mm_sub_epi32(a, b); // 32-bit integer subtract on the raw lane bits + const float4_t result = _mm_castsi128_ps(sub); + + return result; + } + +} // namespace bx + +#define float4_shuf_xAzC float4_shuf_xAzC_ni +#define float4_shuf_yBwD float4_shuf_yBwD_ni +#define float4_rcp float4_rcp_ni +#define float4_orx float4_orx_ni +#define float4_orc float4_orc_ni +#define float4_neg float4_neg_ni +#define float4_madd float4_madd_ni +#define float4_nmsub float4_nmsub_ni +#define float4_div_nr float4_div_nr_ni +#define float4_selb float4_selb_ni +#define float4_sels float4_sels_ni +#define float4_not float4_not_ni +#define float4_abs float4_abs_ni +#define float4_clamp float4_clamp_ni +#define float4_lerp float4_lerp_ni +#define float4_rsqrt float4_rsqrt_ni +#define float4_rsqrt_nr float4_rsqrt_nr_ni +#define float4_rsqrt_carmack float4_rsqrt_carmack_ni +#define float4_sqrt_nr float4_sqrt_nr_ni +#define float4_log2 float4_log2_ni +#define float4_exp2 float4_exp2_ni +#define float4_pow float4_pow_ni +#define float4_cross3 float4_cross3_ni +#define float4_normalize3 float4_normalize3_ni +#if !defined(__SSE4_1__) +#define float4_dot3 float4_dot3_ni +#define float4_dot float4_dot_ni +#endif // !defined(__SSE4_1__) +#include "float4_ni.h" + +#endif // __FLOAT4_SSE_H__ diff --git
a/include/bx/float4_swizzle.inl b/include/bx/float4_swizzle.inl new file mode 100644 index 0000000..559cfe0 --- /dev/null +++ b/include/bx/float4_swizzle.inl @@ -0,0 +1,266 @@ +/* + * Copyright 2010-2012 Branimir Karadzic. All rights reserved. + * License: http://www.opensource.org/licenses/BSD-2-Clause + */ + +#ifndef __BX_FLOAT4_T_H__ +# error "xmacro file, must be included from float4_*.h" +#endif // __BX_FLOAT4_T_H__ + +// included from float4_t.h +IMPLEMENT_SWIZZLE(x, x, x, x) +IMPLEMENT_SWIZZLE(x, x, x, y) +IMPLEMENT_SWIZZLE(x, x, x, z) +IMPLEMENT_SWIZZLE(x, x, x, w) +IMPLEMENT_SWIZZLE(x, x, y, x) +IMPLEMENT_SWIZZLE(x, x, y, y) +IMPLEMENT_SWIZZLE(x, x, y, z) +IMPLEMENT_SWIZZLE(x, x, y, w) +IMPLEMENT_SWIZZLE(x, x, z, x) +IMPLEMENT_SWIZZLE(x, x, z, y) +IMPLEMENT_SWIZZLE(x, x, z, z) +IMPLEMENT_SWIZZLE(x, x, z, w) +IMPLEMENT_SWIZZLE(x, x, w, x) +IMPLEMENT_SWIZZLE(x, x, w, y) +IMPLEMENT_SWIZZLE(x, x, w, z) +IMPLEMENT_SWIZZLE(x, x, w, w) +IMPLEMENT_SWIZZLE(x, y, x, x) +IMPLEMENT_SWIZZLE(x, y, x, y) +IMPLEMENT_SWIZZLE(x, y, x, z) +IMPLEMENT_SWIZZLE(x, y, x, w) +IMPLEMENT_SWIZZLE(x, y, y, x) +IMPLEMENT_SWIZZLE(x, y, y, y) +IMPLEMENT_SWIZZLE(x, y, y, z) +IMPLEMENT_SWIZZLE(x, y, y, w) +IMPLEMENT_SWIZZLE(x, y, z, x) +IMPLEMENT_SWIZZLE(x, y, z, y) +IMPLEMENT_SWIZZLE(x, y, z, z) +// IMPLEMENT_SWIZZLE(x, y, z, w) +IMPLEMENT_SWIZZLE(x, y, w, x) +IMPLEMENT_SWIZZLE(x, y, w, y) +IMPLEMENT_SWIZZLE(x, y, w, z) +IMPLEMENT_SWIZZLE(x, y, w, w) +IMPLEMENT_SWIZZLE(x, z, x, x) +IMPLEMENT_SWIZZLE(x, z, x, y) +IMPLEMENT_SWIZZLE(x, z, x, z) +IMPLEMENT_SWIZZLE(x, z, x, w) +IMPLEMENT_SWIZZLE(x, z, y, x) +IMPLEMENT_SWIZZLE(x, z, y, y) +IMPLEMENT_SWIZZLE(x, z, y, z) +IMPLEMENT_SWIZZLE(x, z, y, w) +IMPLEMENT_SWIZZLE(x, z, z, x) +IMPLEMENT_SWIZZLE(x, z, z, y) +IMPLEMENT_SWIZZLE(x, z, z, z) +IMPLEMENT_SWIZZLE(x, z, z, w) +IMPLEMENT_SWIZZLE(x, z, w, x) +IMPLEMENT_SWIZZLE(x, z, w, y) +IMPLEMENT_SWIZZLE(x, z, w, z) +IMPLEMENT_SWIZZLE(x, z, w, w) +IMPLEMENT_SWIZZLE(x, w, x, x) +IMPLEMENT_SWIZZLE(x, 
w, x, y) +IMPLEMENT_SWIZZLE(x, w, x, z) +IMPLEMENT_SWIZZLE(x, w, x, w) +IMPLEMENT_SWIZZLE(x, w, y, x) +IMPLEMENT_SWIZZLE(x, w, y, y) +IMPLEMENT_SWIZZLE(x, w, y, z) +IMPLEMENT_SWIZZLE(x, w, y, w) +IMPLEMENT_SWIZZLE(x, w, z, x) +IMPLEMENT_SWIZZLE(x, w, z, y) +IMPLEMENT_SWIZZLE(x, w, z, z) +IMPLEMENT_SWIZZLE(x, w, z, w) +IMPLEMENT_SWIZZLE(x, w, w, x) +IMPLEMENT_SWIZZLE(x, w, w, y) +IMPLEMENT_SWIZZLE(x, w, w, z) +IMPLEMENT_SWIZZLE(x, w, w, w) +IMPLEMENT_SWIZZLE(y, x, x, x) +IMPLEMENT_SWIZZLE(y, x, x, y) +IMPLEMENT_SWIZZLE(y, x, x, z) +IMPLEMENT_SWIZZLE(y, x, x, w) +IMPLEMENT_SWIZZLE(y, x, y, x) +IMPLEMENT_SWIZZLE(y, x, y, y) +IMPLEMENT_SWIZZLE(y, x, y, z) +IMPLEMENT_SWIZZLE(y, x, y, w) +IMPLEMENT_SWIZZLE(y, x, z, x) +IMPLEMENT_SWIZZLE(y, x, z, y) +IMPLEMENT_SWIZZLE(y, x, z, z) +IMPLEMENT_SWIZZLE(y, x, z, w) +IMPLEMENT_SWIZZLE(y, x, w, x) +IMPLEMENT_SWIZZLE(y, x, w, y) +IMPLEMENT_SWIZZLE(y, x, w, z) +IMPLEMENT_SWIZZLE(y, x, w, w) +IMPLEMENT_SWIZZLE(y, y, x, x) +IMPLEMENT_SWIZZLE(y, y, x, y) +IMPLEMENT_SWIZZLE(y, y, x, z) +IMPLEMENT_SWIZZLE(y, y, x, w) +IMPLEMENT_SWIZZLE(y, y, y, x) +IMPLEMENT_SWIZZLE(y, y, y, y) +IMPLEMENT_SWIZZLE(y, y, y, z) +IMPLEMENT_SWIZZLE(y, y, y, w) +IMPLEMENT_SWIZZLE(y, y, z, x) +IMPLEMENT_SWIZZLE(y, y, z, y) +IMPLEMENT_SWIZZLE(y, y, z, z) +IMPLEMENT_SWIZZLE(y, y, z, w) +IMPLEMENT_SWIZZLE(y, y, w, x) +IMPLEMENT_SWIZZLE(y, y, w, y) +IMPLEMENT_SWIZZLE(y, y, w, z) +IMPLEMENT_SWIZZLE(y, y, w, w) +IMPLEMENT_SWIZZLE(y, z, x, x) +IMPLEMENT_SWIZZLE(y, z, x, y) +IMPLEMENT_SWIZZLE(y, z, x, z) +IMPLEMENT_SWIZZLE(y, z, x, w) +IMPLEMENT_SWIZZLE(y, z, y, x) +IMPLEMENT_SWIZZLE(y, z, y, y) +IMPLEMENT_SWIZZLE(y, z, y, z) +IMPLEMENT_SWIZZLE(y, z, y, w) +IMPLEMENT_SWIZZLE(y, z, z, x) +IMPLEMENT_SWIZZLE(y, z, z, y) +IMPLEMENT_SWIZZLE(y, z, z, z) +IMPLEMENT_SWIZZLE(y, z, z, w) +IMPLEMENT_SWIZZLE(y, z, w, x) +IMPLEMENT_SWIZZLE(y, z, w, y) +IMPLEMENT_SWIZZLE(y, z, w, z) +IMPLEMENT_SWIZZLE(y, z, w, w) +IMPLEMENT_SWIZZLE(y, w, x, x) +IMPLEMENT_SWIZZLE(y, w, x, y) 
+IMPLEMENT_SWIZZLE(y, w, x, z) +IMPLEMENT_SWIZZLE(y, w, x, w) +IMPLEMENT_SWIZZLE(y, w, y, x) +IMPLEMENT_SWIZZLE(y, w, y, y) +IMPLEMENT_SWIZZLE(y, w, y, z) +IMPLEMENT_SWIZZLE(y, w, y, w) +IMPLEMENT_SWIZZLE(y, w, z, x) +IMPLEMENT_SWIZZLE(y, w, z, y) +IMPLEMENT_SWIZZLE(y, w, z, z) +IMPLEMENT_SWIZZLE(y, w, z, w) +IMPLEMENT_SWIZZLE(y, w, w, x) +IMPLEMENT_SWIZZLE(y, w, w, y) +IMPLEMENT_SWIZZLE(y, w, w, z) +IMPLEMENT_SWIZZLE(y, w, w, w) +IMPLEMENT_SWIZZLE(z, x, x, x) +IMPLEMENT_SWIZZLE(z, x, x, y) +IMPLEMENT_SWIZZLE(z, x, x, z) +IMPLEMENT_SWIZZLE(z, x, x, w) +IMPLEMENT_SWIZZLE(z, x, y, x) +IMPLEMENT_SWIZZLE(z, x, y, y) +IMPLEMENT_SWIZZLE(z, x, y, z) +IMPLEMENT_SWIZZLE(z, x, y, w) +IMPLEMENT_SWIZZLE(z, x, z, x) +IMPLEMENT_SWIZZLE(z, x, z, y) +IMPLEMENT_SWIZZLE(z, x, z, z) +IMPLEMENT_SWIZZLE(z, x, z, w) +IMPLEMENT_SWIZZLE(z, x, w, x) +IMPLEMENT_SWIZZLE(z, x, w, y) +IMPLEMENT_SWIZZLE(z, x, w, z) +IMPLEMENT_SWIZZLE(z, x, w, w) +IMPLEMENT_SWIZZLE(z, y, x, x) +IMPLEMENT_SWIZZLE(z, y, x, y) +IMPLEMENT_SWIZZLE(z, y, x, z) +IMPLEMENT_SWIZZLE(z, y, x, w) +IMPLEMENT_SWIZZLE(z, y, y, x) +IMPLEMENT_SWIZZLE(z, y, y, y) +IMPLEMENT_SWIZZLE(z, y, y, z) +IMPLEMENT_SWIZZLE(z, y, y, w) +IMPLEMENT_SWIZZLE(z, y, z, x) +IMPLEMENT_SWIZZLE(z, y, z, y) +IMPLEMENT_SWIZZLE(z, y, z, z) +IMPLEMENT_SWIZZLE(z, y, z, w) +IMPLEMENT_SWIZZLE(z, y, w, x) +IMPLEMENT_SWIZZLE(z, y, w, y) +IMPLEMENT_SWIZZLE(z, y, w, z) +IMPLEMENT_SWIZZLE(z, y, w, w) +IMPLEMENT_SWIZZLE(z, z, x, x) +IMPLEMENT_SWIZZLE(z, z, x, y) +IMPLEMENT_SWIZZLE(z, z, x, z) +IMPLEMENT_SWIZZLE(z, z, x, w) +IMPLEMENT_SWIZZLE(z, z, y, x) +IMPLEMENT_SWIZZLE(z, z, y, y) +IMPLEMENT_SWIZZLE(z, z, y, z) +IMPLEMENT_SWIZZLE(z, z, y, w) +IMPLEMENT_SWIZZLE(z, z, z, x) +IMPLEMENT_SWIZZLE(z, z, z, y) +IMPLEMENT_SWIZZLE(z, z, z, z) +IMPLEMENT_SWIZZLE(z, z, z, w) +IMPLEMENT_SWIZZLE(z, z, w, x) +IMPLEMENT_SWIZZLE(z, z, w, y) +IMPLEMENT_SWIZZLE(z, z, w, z) +IMPLEMENT_SWIZZLE(z, z, w, w) +IMPLEMENT_SWIZZLE(z, w, x, x) +IMPLEMENT_SWIZZLE(z, w, x, y) 
+IMPLEMENT_SWIZZLE(z, w, x, z) +IMPLEMENT_SWIZZLE(z, w, x, w) +IMPLEMENT_SWIZZLE(z, w, y, x) +IMPLEMENT_SWIZZLE(z, w, y, y) +IMPLEMENT_SWIZZLE(z, w, y, z) +IMPLEMENT_SWIZZLE(z, w, y, w) +IMPLEMENT_SWIZZLE(z, w, z, x) +IMPLEMENT_SWIZZLE(z, w, z, y) +IMPLEMENT_SWIZZLE(z, w, z, z) +IMPLEMENT_SWIZZLE(z, w, z, w) +IMPLEMENT_SWIZZLE(z, w, w, x) +IMPLEMENT_SWIZZLE(z, w, w, y) +IMPLEMENT_SWIZZLE(z, w, w, z) +IMPLEMENT_SWIZZLE(z, w, w, w) +IMPLEMENT_SWIZZLE(w, x, x, x) +IMPLEMENT_SWIZZLE(w, x, x, y) +IMPLEMENT_SWIZZLE(w, x, x, z) +IMPLEMENT_SWIZZLE(w, x, x, w) +IMPLEMENT_SWIZZLE(w, x, y, x) +IMPLEMENT_SWIZZLE(w, x, y, y) +IMPLEMENT_SWIZZLE(w, x, y, z) +IMPLEMENT_SWIZZLE(w, x, y, w) +IMPLEMENT_SWIZZLE(w, x, z, x) +IMPLEMENT_SWIZZLE(w, x, z, y) +IMPLEMENT_SWIZZLE(w, x, z, z) +IMPLEMENT_SWIZZLE(w, x, z, w) +IMPLEMENT_SWIZZLE(w, x, w, x) +IMPLEMENT_SWIZZLE(w, x, w, y) +IMPLEMENT_SWIZZLE(w, x, w, z) +IMPLEMENT_SWIZZLE(w, x, w, w) +IMPLEMENT_SWIZZLE(w, y, x, x) +IMPLEMENT_SWIZZLE(w, y, x, y) +IMPLEMENT_SWIZZLE(w, y, x, z) +IMPLEMENT_SWIZZLE(w, y, x, w) +IMPLEMENT_SWIZZLE(w, y, y, x) +IMPLEMENT_SWIZZLE(w, y, y, y) +IMPLEMENT_SWIZZLE(w, y, y, z) +IMPLEMENT_SWIZZLE(w, y, y, w) +IMPLEMENT_SWIZZLE(w, y, z, x) +IMPLEMENT_SWIZZLE(w, y, z, y) +IMPLEMENT_SWIZZLE(w, y, z, z) +IMPLEMENT_SWIZZLE(w, y, z, w) +IMPLEMENT_SWIZZLE(w, y, w, x) +IMPLEMENT_SWIZZLE(w, y, w, y) +IMPLEMENT_SWIZZLE(w, y, w, z) +IMPLEMENT_SWIZZLE(w, y, w, w) +IMPLEMENT_SWIZZLE(w, z, x, x) +IMPLEMENT_SWIZZLE(w, z, x, y) +IMPLEMENT_SWIZZLE(w, z, x, z) +IMPLEMENT_SWIZZLE(w, z, x, w) +IMPLEMENT_SWIZZLE(w, z, y, x) +IMPLEMENT_SWIZZLE(w, z, y, y) +IMPLEMENT_SWIZZLE(w, z, y, z) +IMPLEMENT_SWIZZLE(w, z, y, w) +IMPLEMENT_SWIZZLE(w, z, z, x) +IMPLEMENT_SWIZZLE(w, z, z, y) +IMPLEMENT_SWIZZLE(w, z, z, z) +IMPLEMENT_SWIZZLE(w, z, z, w) +IMPLEMENT_SWIZZLE(w, z, w, x) +IMPLEMENT_SWIZZLE(w, z, w, y) +IMPLEMENT_SWIZZLE(w, z, w, z) +IMPLEMENT_SWIZZLE(w, z, w, w) +IMPLEMENT_SWIZZLE(w, w, x, x) +IMPLEMENT_SWIZZLE(w, w, x, y) 
+IMPLEMENT_SWIZZLE(w, w, x, z) +IMPLEMENT_SWIZZLE(w, w, x, w) +IMPLEMENT_SWIZZLE(w, w, y, x) +IMPLEMENT_SWIZZLE(w, w, y, y) +IMPLEMENT_SWIZZLE(w, w, y, z) +IMPLEMENT_SWIZZLE(w, w, y, w) +IMPLEMENT_SWIZZLE(w, w, z, x) +IMPLEMENT_SWIZZLE(w, w, z, y) +IMPLEMENT_SWIZZLE(w, w, z, z) +IMPLEMENT_SWIZZLE(w, w, z, w) +IMPLEMENT_SWIZZLE(w, w, w, x) +IMPLEMENT_SWIZZLE(w, w, w, y) +IMPLEMENT_SWIZZLE(w, w, w, z) +IMPLEMENT_SWIZZLE(w, w, w, w) diff --git a/include/bx/float4_t.h b/include/bx/float4_t.h new file mode 100644 index 0000000..bdeeb91 --- /dev/null +++ b/include/bx/float4_t.h @@ -0,0 +1,22 @@ +/* + * Copyright 2010-2011 Branimir Karadzic. All rights reserved. + * License: http://www.opensource.org/licenses/BSD-2-Clause + */ + +#ifndef __BX_FLOAT4_T_H__ +#define __BX_FLOAT4_T_H__ + +#include +#include "bx.h" + +#define BX_FLOAT4_INLINE BX_FORCE_INLINE + +#if 0 // defined(__SSE2__) +# include "float4_sse.h" +#elif 0 // __ARM_NEON__ +# include "float4_neon.h" +#else +# include "float4_ref.h" +#endif // + +#endif // __BX_FLOAT4_T_H__ diff --git a/include/bx/float4x4_t.h b/include/bx/float4x4_t.h new file mode 100644 index 0000000..a552425 --- /dev/null +++ b/include/bx/float4x4_t.h @@ -0,0 +1,168 @@ +/* + * Copyright 2010-2012 Branimir Karadzic. All rights reserved. 
+ * License: http://www.opensource.org/licenses/BSD-2-Clause + */ + +#ifndef __BX_FLOAT4X4_H__ +#define __BX_FLOAT4X4_H__ + +#include "float4_t.h" + +namespace bx +{ + typedef BX_ALIGN_STRUCT_16(struct) + { + float4_t col[4]; + + } float4x4_t; + + BX_FLOAT4_INLINE float4_t float4_mul_xyz1(float4_t _a, const float4x4_t& _b) + { + const float4_t xxxx = float4_swiz_xxxx(_a); + const float4_t yyyy = float4_swiz_yyyy(_a); + const float4_t zzzz = float4_swiz_zzzz(_a); + const float4_t col0 = float4_mul(_b.col[0], xxxx); + const float4_t col1 = float4_mul(_b.col[1], yyyy); + const float4_t col2 = float4_madd(_b.col[2], zzzz, col0); + const float4_t col3 = float4_add(_b.col[3], col1); // _b.col[3] is added unscaled; the w lane of _a is never read here + const float4_t result = float4_add(col2, col3); + + return result; + } + + BX_FLOAT4_INLINE float4_t float4_mul(float4_t _a, const float4x4_t& _b) + { + const float4_t xxxx = float4_swiz_xxxx(_a); + const float4_t yyyy = float4_swiz_yyyy(_a); + const float4_t zzzz = float4_swiz_zzzz(_a); + const float4_t wwww = float4_swiz_wwww(_a); + const float4_t col0 = float4_mul(_b.col[0], xxxx); + const float4_t col1 = float4_mul(_b.col[1], yyyy); + const float4_t col2 = float4_madd(_b.col[2], zzzz, col0); + const float4_t col3 = float4_madd(_b.col[3], wwww, col1); // unlike float4_mul_xyz1, _b.col[3] is combined with the w lane of _a + const float4_t result = float4_add(col2, col3); + + return result; + } + + BX_FLOAT4_INLINE float4x4_t float4x4_mul(const float4x4_t& _a, const float4x4_t& _b) + { + float4x4_t result; + result.col[0] = float4_mul(_a.col[0], _b); // each col of _a is pushed through the vector-matrix float4_mul above + result.col[1] = float4_mul(_a.col[1], _b); + result.col[2] = float4_mul(_a.col[2], _b); + result.col[3] = float4_mul(_a.col[3], _b); + + return result; + } + + BX_FLOAT4_INLINE float4x4_t float4x4_transpose(const float4x4_t& _mtx) + { + const float4_t aibj = float4_shuf_xAyB(_mtx.col[0], _mtx.col[2]); // aibj + const float4_t emfn = float4_shuf_xAyB(_mtx.col[1], _mtx.col[3]); // emfn + const float4_t ckdl = float4_shuf_zCwD(_mtx.col[0], _mtx.col[2]); // ckdl + const float4_t gohp = float4_shuf_zCwD(_mtx.col[1],
_mtx.col[3]); // gohp + float4x4_t result; + result.col[0] = float4_shuf_xAyB(aibj, emfn); // aeim + result.col[1] = float4_shuf_zCwD(aibj, emfn); // bfjn + result.col[2] = float4_shuf_xAyB(ckdl, gohp); // cgko + result.col[3] = float4_shuf_zCwD(ckdl, gohp); // dhlp + + return result; + } + + BX_FLOAT4_INLINE float4x4_t float4x4_inverse(const float4x4_t& _a) + { + const float4_t tmp0 = float4_shuf_xAzC(_a.col[0], _a.col[1]); + const float4_t tmp1 = float4_shuf_xAzC(_a.col[2], _a.col[3]); + const float4_t tmp2 = float4_shuf_yBwD(_a.col[0], _a.col[1]); + const float4_t tmp3 = float4_shuf_yBwD(_a.col[2], _a.col[3]); + const float4_t t0 = float4_shuf_xyAB(tmp0, tmp1); + const float4_t t1 = float4_shuf_xyAB(tmp3, tmp2); + const float4_t t2 = float4_shuf_zwCD(tmp0, tmp1); + const float4_t t3 = float4_shuf_zwCD(tmp3, tmp2); + + const float4_t t23 = float4_mul(t2, t3); + const float4_t t23_yxwz = float4_swiz_yxwz(t23); + const float4_t t23_wzyx = float4_swiz_wzyx(t23); + + float4_t cof0, cof1, cof2, cof3; + + const float4_t zero = float4_zero(); + cof0 = float4_nmsub(t1, t23_yxwz, zero); + cof0 = float4_madd(t1, t23_wzyx, cof0); + + cof1 = float4_nmsub(t0, t23_yxwz, zero); + cof1 = float4_madd(t0, t23_wzyx, cof1); + cof1 = float4_swiz_zwxy(cof1); + + const float4_t t12 = float4_mul(t1, t2); + const float4_t t12_yxwz = float4_swiz_yxwz(t12); + const float4_t t12_wzyx = float4_swiz_wzyx(t12); + + cof0 = float4_madd(t3, t12_yxwz, cof0); + cof0 = float4_nmsub(t3, t12_wzyx, cof0); + + cof3 = float4_mul(t0, t12_yxwz); + cof3 = float4_nmsub(t0, t12_wzyx, cof3); + cof3 = float4_swiz_zwxy(cof3); + + const float4_t t1_zwxy = float4_swiz_zwxy(t1); + const float4_t t2_zwxy = float4_swiz_zwxy(t2); + + const float4_t t13 = float4_mul(t1_zwxy, t3); + const float4_t t13_yxwz = float4_swiz_yxwz(t13); + const float4_t t13_wzyx = float4_swiz_wzyx(t13); + + cof0 = float4_madd(t2_zwxy, t13_yxwz, cof0); + cof0 = float4_nmsub(t2_zwxy, t13_wzyx, cof0); + + cof2 = float4_mul(t0, t13_yxwz); + cof2 
= float4_nmsub(t0, t13_wzyx, cof2); + cof2 = float4_swiz_zwxy(cof2); + + const float4_t t01 = float4_mul(t0, t1); + const float4_t t01_yxwz = float4_swiz_yxwz(t01); + const float4_t t01_wzyx = float4_swiz_wzyx(t01); + + cof2 = float4_nmsub(t3, t01_yxwz, cof2); + cof2 = float4_madd(t3, t01_wzyx, cof2); + + cof3 = float4_madd(t2_zwxy, t01_yxwz, cof3); + cof3 = float4_nmsub(t2_zwxy, t01_wzyx, cof3); + + const float4_t t03 = float4_mul(t0, t3); + const float4_t t03_yxwz = float4_swiz_yxwz(t03); + const float4_t t03_wzyx = float4_swiz_wzyx(t03); + + cof1 = float4_nmsub(t2_zwxy, t03_yxwz, cof1); + cof1 = float4_madd(t2_zwxy, t03_wzyx, cof1); + + cof2 = float4_madd(t1, t03_yxwz, cof2); + cof2 = float4_nmsub(t1, t03_wzyx, cof2); + + const float4_t t02 = float4_mul(t0, t2_zwxy); + const float4_t t02_yxwz = float4_swiz_yxwz(t02); + const float4_t t02_wzyx = float4_swiz_wzyx(t02); + + cof1 = float4_madd(t3, t02_yxwz, cof1); + cof1 = float4_nmsub(t3, t02_wzyx, cof1); + + cof3 = float4_nmsub(t1, t02_yxwz, cof3); + cof3 = float4_madd(t1, t02_wzyx, cof3); + + const float4_t det = float4_dot(t0, cof0); + const float4_t invdet = float4_rcp(det); + + float4x4_t result; + result.col[0] = float4_mul(cof0, invdet); + result.col[1] = float4_mul(cof1, invdet); + result.col[2] = float4_mul(cof2, invdet); + result.col[3] = float4_mul(cof3, invdet); + + return result; + } + +} // namespace bx + +#endif // __BX_FLOAT4X4_H__ diff --git a/include/bx/foreach.h b/include/bx/foreach.h new file mode 100644 index 0000000..f30315b --- /dev/null +++ b/include/bx/foreach.h @@ -0,0 +1,71 @@ +/* + * Copyright 2010-2011 Branimir Karadzic. All rights reserved. 
+ * License: http://www.opensource.org/licenses/BSD-2-Clause + */ + +#ifndef __BX_FOREACH_H__ +#define __BX_FOREACH_H__ + +#include "bx.h" + +namespace bx +{ + namespace foreach_ns + { + struct ContainerBase + { + }; + + template<typename Ty> + class Container : public ContainerBase + { + public: + inline Container(const Ty& _container) + : m_container(_container) + , m_break(0) + , m_it( _container.begin() ) + , m_itEnd( _container.end() ) + { + } + + inline bool condition() const + { + return (!m_break++ && m_it != m_itEnd); // false when m_break is still non-zero, i.e. the loop body left via break + } + + const Ty& m_container; + mutable int m_break; + mutable typename Ty::const_iterator m_it; + mutable typename Ty::const_iterator m_itEnd; + }; + + template<typename Ty> + inline Ty* pointer(const Ty&) + { + return 0; // only the pointer type matters; container() ignores the value + } + + template<typename Ty> + inline Container<Ty> containerNew(const Ty& _container) + { + return Container<Ty>(_container); + } + + template<typename Ty> + inline const Container<Ty>* container(const ContainerBase* _base, const Ty*) + { + return static_cast<const Container<Ty>*>(_base); // recover the concrete Container type from the type-erased base + } + } // namespace foreach_ns + +#define foreach(_variable, _container) \ + for (const bx::foreach_ns::ContainerBase &__temp_container__ = bx::foreach_ns::containerNew(_container); \ + bx::foreach_ns::container(&__temp_container__, true ? 0 : bx::foreach_ns::pointer(_container))->condition(); \ + ++bx::foreach_ns::container(&__temp_container__, true ? 0 : bx::foreach_ns::pointer(_container))->m_it) \ + for (_variable = *bx::foreach_ns::container(&__temp_container__, true ? 0 : bx::foreach_ns::pointer(_container))->m_it; \ + bx::foreach_ns::container(&__temp_container__, true ? 0 : bx::foreach_ns::pointer(_container))->m_break; \ + --bx::foreach_ns::container(&__temp_container__, true ? 0 : bx::foreach_ns::pointer(_container))->m_break) + +} // namespace bx + +#endif // __BX_FOREACH_H__ diff --git a/include/bx/handlealloc.h b/include/bx/handlealloc.h new file mode 100644 index 0000000..cd6f54b --- /dev/null +++ b/include/bx/handlealloc.h @@ -0,0 +1,83 @@ +/* + * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause + */ + +#ifndef __BX_HANDLE_ALLOC_H__ +#define __BX_HANDLE_ALLOC_H__ + +#include "bx.h" + +namespace bx +{ + class HandleAlloc + { + public: + static const uint16_t invalid = 0xffff; + + HandleAlloc(uint16_t _maxHandles) + : m_dense(new uint16_t[_maxHandles*2]) // one allocation backs both tables + , m_sparse(&m_dense[_maxHandles]) // second half of the same array + , m_numHandles(0) + , m_maxHandles(_maxHandles) + { + for (uint16_t ii = 0; ii < _maxHandles; ++ii) + { + m_dense[ii] = ii; // every handle value starts out in m_dense, in ascending order + } + } + + ~HandleAlloc() + { + delete [] m_dense; // NOTE(review): no copy constructor or assignment is declared, so a copied HandleAlloc would delete this array twice + } + + const uint16_t* getHandles() const + { + return m_dense; // the first getNumHandles() entries are the handles currently allocated + } + + uint16_t getNumHandles() const + { + return m_numHandles; + } + + uint16_t getMaxHandles() const + { + return m_maxHandles; + } + + uint16_t alloc() + { + if (m_numHandles < m_maxHandles) + { + uint16_t index = m_numHandles; + ++m_numHandles; + + uint16_t handle = m_dense[index]; + m_sparse[handle] = index; // remember which m_dense slot holds this handle + return handle; + } + + return invalid; // all m_maxHandles handles are in use + } + + void free(uint16_t _handle) // NOTE(review): _handle is not range-checked and a double free is not detected + { + uint16_t index = m_sparse[_handle]; + --m_numHandles; + uint16_t temp = m_dense[m_numHandles]; // last handle still allocated + m_dense[m_numHandles] = _handle; // freed handle becomes the next one alloc() hands out + m_sparse[temp] = index; + m_dense[index] = temp; // the last allocated handle takes over the freed slot + } + + private: + uint16_t* m_dense; + uint16_t* m_sparse; + uint16_t m_numHandles; + uint16_t m_maxHandles; + }; +} // namespace bx + +#endif // __BX_HANDLE_ALLOC_H__ diff --git a/include/bx/hash.h b/include/bx/hash.h new file mode 100644 index 0000000..1722515 --- /dev/null +++ b/include/bx/hash.h @@ -0,0 +1,90 @@ +/* + * Copyright 2010-2011 Branimir Karadzic. All rights reserved. + * License: http://www.opensource.org/licenses/BSD-2-Clause + */ + +#ifndef __BX_HASH_H__ +#define __BX_HASH_H__ + +#include "bx.h" + +namespace bx +{ + // MurmurHash2 was written by Austin Appleby, and is placed in the public + // domain. The author hereby disclaims copyright to this source code.
+ #define MURMUR_M 0x5bd1e995 + #define MURMUR_R 24 + + #define mmix(_h, _k) { _k *= MURMUR_M; _k ^= _k >> MURMUR_R; _k *= MURMUR_M; _h *= MURMUR_M; _h ^= _k; } + + class HashMurmur2A + { + public: + void begin(uint32_t _seed = 0) + { + m_hash = _seed; + m_tail = 0; + m_count = 0; + m_size = 0; + } + + void add(const void* _data, int _len) + { + const uint8_t* data = (uint8_t*)_data; + m_size += _len; + + mixTail(data, _len); + + while(_len >= 4) + { + uint32_t kk = *(uint32_t*)data; + + mmix(m_hash, kk); + + data += 4; + _len -= 4; + } + + mixTail(data, _len); + } + + uint32_t end() + { + mmix(m_hash, m_tail); + mmix(m_hash, m_size); + + m_hash ^= m_hash >> 13; + m_hash *= MURMUR_M; + m_hash ^= m_hash >> 15; + + return m_hash; + } + + private: + void mixTail(const uint8_t*& _data, int& _len) + { + while( _len && ((_len<4) || m_count) ) + { + m_tail |= (*_data++) << (m_count * 8); + + m_count++; + _len--; + + if(m_count == 4) + { + mmix(m_hash, m_tail); + m_tail = 0; + m_count = 0; + } + } + } + + uint32_t m_hash; + uint32_t m_tail; + uint32_t m_count; + uint32_t m_size; + }; + +} // namespace bx + +#endif // __BX_HASH_H__ diff --git a/include/bx/macros.h b/include/bx/macros.h new file mode 100644 index 0000000..9f8253d --- /dev/null +++ b/include/bx/macros.h @@ -0,0 +1,62 @@ +/* + * Copyright 2010-2011 Branimir Karadzic. All rights reserved. + * License: http://www.opensource.org/licenses/BSD-2-Clause + */ + +#ifndef __BX_MACROS_H__ +#define __BX_MACROS_H__ + +#include "bx.h" + +#define BX_VA_ARGS_COUNT_DETAIL(_a1, _a2, _a3, _a4, _a5, _a6, _a7, _a8, _a9, _a10, _a11, _a12, _a13, _a14, _a15, _a16, _last, ...) _last +#define BX_VA_ARGS_COUNT(...) BX_VA_ARGS_COUNT_DETAIL(__VA_ARGS__, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1) + +#define BX_MACRO_DISPATCHER_DETAIL1(_func, _argCount) _func ## _argCount +#define BX_MACRO_DISPATCHER_DETAIL2(_func, _argCount) BX_MACRO_DISPATCHER_DETAIL1(_func, _argCount) +#define BX_MACRO_DISPATCHER(_func, ...) 
BX_MACRO_DISPATCHER_DETAIL2(_func, BX_VA_ARGS_COUNT(__VA_ARGS__) ) + +#define BX_STRINGIZE(_x) BX_STRINGIZE_(_x) +#define BX_STRINGIZE_(_x) #_x + +#define BX_FILE_LINE_LITERAL "" __FILE__ "(" BX_STRINGIZE(__LINE__) "): " + +#define BX_ALIGN_MASK(_value, _mask) ( ( (_value)+(_mask) ) & ( (~0)&(~(_mask) ) ) ) +#define BX_ALIGN_16(_value) BX_ALIGN_MASK(_value, 0xf) +#define BX_ALIGN_256(_value) BX_ALIGN_MASK(_value, 0xff) + +#if BX_COMPILER_GCC || BX_COMPILER_CLANG +# define BX_ALIGN_STRUCT(_align, _struct) _struct __attribute__( (aligned(_align) ) ) +# define BX_FUNCTION __PRETTY_FUNCTION__ +# define BX_NO_INLINE __attribute__( (noinline) ) +# define BX_FORCE_INLINE __extension__ static __inline __attribute__( (__always_inline__) ) +# if BX_COMPILER_CLANG +# define BX_THREAD /* not supported right now */ +# else +# define BX_THREAD __thread +# endif // BX_COMPILER_CLANG +#elif BX_COMPILER_MSVC +# define BX_ALIGN_STRUCT(_align, _struct) __declspec(align(_align) ) _struct +# define BX_FUNCTION __FUNCTION__ +# define BX_NO_INLINE __declspec(noinline) +# define BX_FORCE_INLINE __forceinline +# define BX_THREAD __declspec(thread) +#else +# error "Unknown BX_COMPILER_?" +#endif // BX_COMPILER_ + +#define BX_ALIGN_STRUCT_16(_struct) BX_ALIGN_STRUCT(16, _struct) +#define BX_ALIGN_STRUCT_256(_struct) BX_ALIGN_STRUCT(256, _struct) + +#ifndef BX_CHECK +# define BX_CHECK(...) do {} while(0) +#endif // BX_CHECK + +#ifndef BX_TRACE +# define BX_TRACE(...) do {} while(0) +#endif // BX_TRACE + +#ifndef BX_CONFIG_SPSCQUEUE_USE_NAIVE +# define BX_CONFIG_SPSCQUEUE_USE_NAIVE 0 +#endif // BX_CONFIG_SPSCQUEUE_USE_NAIVE + +#endif // __BX_MACROS_H__ diff --git a/include/bx/maputil.h b/include/bx/maputil.h new file mode 100644 index 0000000..daa7248 --- /dev/null +++ b/include/bx/maputil.h @@ -0,0 +1,29 @@ +/* + * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause + */ + +#ifndef __BX_MAPUTIL_H__ +#define __BX_MAPUTIL_H__ + +#include "bx.h" + +namespace bx +{ + template + typename MapType::iterator mapInsertOrUpdate(MapType& _map, const typename MapType::key_type& _key, const typename MapType::mapped_type& _value) + { + typename MapType::iterator it = _map.lower_bound(_key); + if (it != _map.end() + && !_map.key_comp()(_key, it->first) ) + { + it->second = _value; + return it; + } + + typename MapType::value_type pair(_key, _value); + return _map.insert(it, pair); + } +} // namespace bx + +#endif // __BX_MAPUTIL_H__ diff --git a/include/bx/mutex.h b/include/bx/mutex.h new file mode 100644 index 0000000..7c1e906 --- /dev/null +++ b/include/bx/mutex.h @@ -0,0 +1,171 @@ +/* + * Copyright 2010-2012 Branimir Karadzic. All rights reserved. + * License: http://www.opensource.org/licenses/BSD-2-Clause + */ + +#ifndef __BX_MUTEX_H__ +#define __BX_MUTEX_H__ + +#include "bx.h" +#include "cpu.h" +#include "sem.h" + +#if BX_PLATFORM_NACL || BX_PLATFORM_LINUX || BX_PLATFORM_ANDROID +# include +#elif BX_PLATFORM_WINDOWS || BX_PLATFORM_XBOX360 +# include +#endif // BX_PLATFORM_ + +namespace bx +{ +#if BX_PLATFORM_WINDOWS || BX_PLATFORM_XBOX360 + typedef CRITICAL_SECTION pthread_mutex_t; + typedef unsigned pthread_mutexattr_t; + + inline int pthread_mutex_lock(pthread_mutex_t* _mutex) + { + EnterCriticalSection(_mutex); + return 0; + } + + inline int pthread_mutex_unlock(pthread_mutex_t* _mutex) + { + LeaveCriticalSection(_mutex); + return 0; + } + + inline int pthread_mutex_trylock(pthread_mutex_t* _mutex) + { + return TryEnterCriticalSection(_mutex) ? 
0 : EBUSY; + } + + inline int pthread_mutex_init(pthread_mutex_t* _mutex, pthread_mutexattr_t* /*_attr*/) + { + InitializeCriticalSection(_mutex); + return 0; + } + + inline int pthread_mutex_destroy(pthread_mutex_t* _mutex) + { + DeleteCriticalSection(_mutex); + return 0; + } +#endif // BX_PLATFORM_ + + class Mutex + { + public: + Mutex() + { + pthread_mutex_init(&m_handle, NULL); + } + + ~Mutex() + { + pthread_mutex_destroy(&m_handle); + } + + void lock() + { + pthread_mutex_lock(&m_handle); + } + + void unlock() + { + pthread_mutex_unlock(&m_handle); + } + + private: + Mutex(const Mutex& _rhs); // no copy constructor + Mutex& operator=(const Mutex& _rhs); // no assignment operator + + pthread_mutex_t m_handle; + }; + + class MutexScope + { + public: + MutexScope(Mutex& _mutex) + : m_mutex(_mutex) + { + m_mutex.lock(); + } + + ~MutexScope() + { + m_mutex.unlock(); + } + + private: + MutexScope(); // no default constructor + MutexScope(const MutexScope& _rhs); // no copy constructor + MutexScope& operator=(const MutexScope& _rhs); // no assignment operator + + Mutex& m_mutex; + }; + +#if 1 + typedef Mutex LwMutex; +#else + class LwMutex + { + public: + LwMutex() + : m_count(0) + { + } + + ~LwMutex() + { + } + + void lock() + { + if (atomicIncr(&m_count) > 1) + { + m_sem.wait(); + } + } + + void unlock() + { + if (atomicDecr(&m_count) > 0) + { + m_sem.post(); + } + } + + private: + LwMutex(const LwMutex& _rhs); // no copy constructor + LwMutex& operator=(const LwMutex& _rhs); // no assignment operator + + Semaphore m_sem; + volatile int32_t m_count; + }; +#endif // 0 + + class LwMutexScope + { + public: + LwMutexScope(LwMutex& _mutex) + : m_mutex(_mutex) + { + m_mutex.lock(); + } + + ~LwMutexScope() + { + m_mutex.unlock(); + } + + private: + LwMutexScope(); // no default constructor + LwMutexScope(const LwMutexScope& _rhs); // no copy constructor + LwMutexScope& operator=(const LwMutexScope& _rhs); // no assignment operator + + LwMutex& m_mutex; + }; + +} // 
namespace bx
+
+#endif // __BX_MUTEX_H__
diff --git a/include/bx/os.h b/include/bx/os.h
new file mode 100644
index 0000000..ed0ced4
--- /dev/null
+++ b/include/bx/os.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_OS_H__
+#define __BX_OS_H__
+
+#include "bx.h"
+
+// NOTE(review): the include targets were missing from this copy of the patch.
+// They are restored from the trailing comments naming the functions they
+// provide; confirm against the repository.
+#if BX_PLATFORM_NACL || BX_PLATFORM_ANDROID || BX_PLATFORM_LINUX
+#   include <sched.h> // sched_yield
+#   if BX_PLATFORM_NACL
+#       include <sys/nanosleep.h> // nanosleep
+#   else
+#       include <time.h> // nanosleep
+#   endif // BX_PLATFORM_NACL
+#endif // BX_PLATFORM_
+
+namespace bx
+{
+    /// Suspend the calling thread for `_ms` milliseconds.
+    inline void sleep(uint32_t _ms)
+    {
+#if BX_PLATFORM_WINDOWS || BX_PLATFORM_XBOX360
+        Sleep(_ms);
+#else
+        // Assign the fields with explicit casts: brace-initializing timespec
+        // from uint32_t expressions is a narrowing conversion.
+        timespec req;
+        req.tv_sec = (time_t)(_ms/1000);
+        req.tv_nsec = (long)( (_ms%1000)*1000000); // milliseconds -> nanoseconds
+        // The remaining time was never inspected, so none is requested.
+        nanosleep(&req, NULL);
+#endif // BX_PLATFORM_
+    }
+
+    /// Give up the remainder of the calling thread's time slice.
+    inline void yield()
+    {
+#if BX_PLATFORM_WINDOWS
+        SwitchToThread();
+#elif BX_PLATFORM_XBOX360
+        Sleep(0);
+#else
+        sched_yield();
+#endif // BX_PLATFORM_
+    }
+
+} // namespace bx
+
+#endif // __BX_OS_H__
diff --git a/include/bx/platform.h b/include/bx/platform.h
new file mode 100644
index 0000000..c4d92cb
--- /dev/null
+++ b/include/bx/platform.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_PLATFORM_H__
+#define __BX_PLATFORM_H__
+
+// Every BX_COMPILER_*, BX_PLATFORM_* and BX_CPU_* switch is first defined as 0
+// so it can be tested with plain `#if`; the detection chains below redefine
+// the matching one to 1.
+#define BX_COMPILER_CLANG 0
+#define BX_COMPILER_GCC 0
+#define BX_COMPILER_MSVC 0
+
+#define BX_PLATFORM_ANDROID 0
+#define BX_PLATFORM_LINUX 0
+#define BX_PLATFORM_NACL 0
+#define BX_PLATFORM_WINDOWS 0
+#define BX_PLATFORM_XBOX360 0
+
+#define BX_CPU_ARM 0
+#define BX_CPU_PPC 0
+#define BX_CPU_X86 0
+
+#define BX_CPU_ENDIAN_BIG 0
+#define BX_CPU_ENDIAN_LITTLE 0
+
+// Compiler detection. The order matters: __clang__ is tested before __GNUC__
+// because clang defines both. Exactly one BX_COMPILER_* ends up as 1.
+// http://sourceforge.net/apps/mediawiki/predef/index.php?title=Compilers
+#if defined(_MSC_VER)
+#   undef BX_COMPILER_MSVC
+#   define BX_COMPILER_MSVC 1
+#elif defined(__clang__)
+// clang defines __GNUC__
+#   undef BX_COMPILER_CLANG
+#   define BX_COMPILER_CLANG 1
+#elif defined(__GNUC__)
+#   undef BX_COMPILER_GCC
+#   define BX_COMPILER_GCC 1
+#else
+#   error "BX_COMPILER_* is not defined!"
+#endif //
+
+// Platform detection. The order matters: the NaCl and Android toolchains also
+// define __linux__, so they are tested before it.
+// http://sourceforge.net/apps/mediawiki/predef/index.php?title=Operating_Systems
+#if defined(_XBOX_VER)
+#   undef BX_PLATFORM_XBOX360
+#   define BX_PLATFORM_XBOX360 1
+#elif defined(_WIN32) || defined(_WIN64)
+#   undef BX_PLATFORM_WINDOWS
+#   define BX_PLATFORM_WINDOWS 1
+#elif defined(__native_client__)
+// NaCl compiler defines __linux__
+#   undef BX_PLATFORM_NACL
+#   define BX_PLATFORM_NACL 1
+#elif defined(__ANDROID__)
+// Android compiler defines __linux__
+#   undef BX_PLATFORM_ANDROID
+#   define BX_PLATFORM_ANDROID 1
+#elif defined(__linux__)
+#   undef BX_PLATFORM_LINUX
+#   define BX_PLATFORM_LINUX 1
+#else
+#   error "BX_PLATFORM_* is not defined!"
+#endif //
+
+// CPU detection. BX_CACHE_LINE_SIZE is only defined for the architectures
+// listed here.
+// http://sourceforge.net/apps/mediawiki/predef/index.php?title=Architectures
+#if defined(__arm__)
+#   undef BX_CPU_ARM
+#   define BX_CPU_ARM 1
+#   define BX_CACHE_LINE_SIZE 64
+#elif defined(_M_PPC) || defined(__powerpc__) || defined(__powerpc64__)
+#   undef BX_CPU_PPC
+#   define BX_CPU_PPC 1
+#   define BX_CACHE_LINE_SIZE 128
+#elif defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__)
+#   undef BX_CPU_X86
+#   define BX_CPU_X86 1
+#   define BX_CACHE_LINE_SIZE 64
+#endif //
+
+// PPC is treated as big endian; everything else as little endian.
+#if BX_CPU_PPC
+#   undef BX_CPU_ENDIAN_BIG
+#   define BX_CPU_ENDIAN_BIG 1
+#else
+#   undef BX_CPU_ENDIAN_LITTLE
+#   define BX_CPU_ENDIAN_LITTLE 1
+#endif // BX_CPU_PPC
+
+#endif // __BX_PLATFORM_H__
diff --git a/include/bx/ringbuffer.h b/include/bx/ringbuffer.h
new file mode 100644
index 0000000..44bf849
--- /dev/null
+++ b/include/bx/ringbuffer.h
@@ -0,0 +1,313 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_RINGBUFFER_H__
+#define __BX_RINGBUFFER_H__
+
+#include "bx.h"
+#include "cpu.h"
+#include "uint32_t.h"
+
+namespace bx
+{
+    /// Index bookkeeping for a ring buffer of `m_size` bytes. It owns no
+    /// storage and tracks three cursors:
+    ///   m_read    - next byte the consumer will take,
+    ///   m_current - end of committed data (readable up to here),
+    ///   m_write   - end of reserved, not yet committed, space.
+    class RingBufferControl
+    {
+    public:
+        RingBufferControl(uint32_t _size)
+            : m_size(_size)
+            , m_current(0)
+            , m_write(0)
+            , m_read(0)
+        {
+        }
+
+        ~RingBufferControl()
+        {
+        }
+
+        /// Number of committed bytes that have not been consumed yet.
+        uint32_t available() const
+        {
+            return distance(m_read, m_current);
+        }
+
+        /// Advance the read cursor by at most `_size` bytes, clamped to the
+        /// committed amount. Returns the number of bytes actually consumed.
+        uint32_t consume(uint32_t _size) // consumer only
+        {
+            const uint32_t maxSize = distance(m_read, m_current);
+            // Branchless min(_size, maxSize): the sign of (size - maxSize)
+            // selects between the two.
+            const uint32_t sizeNoSign = uint32_and(_size, 0x7FFFFFFF);
+            const uint32_t test = uint32_sub(sizeNoSign, maxSize);
+            const uint32_t size = uint32_sels(test, _size, maxSize);
+            const uint32_t advance = uint32_add(m_read, size);
+            const uint32_t read = uint32_mod(advance, m_size);
+            m_read = read;
+            return size;
+        }
+
+        /// Advance the write cursor by at most `_size` bytes, clamped to the
+        /// free space. Returns the number of bytes actually reserved.
+        uint32_t reserve(uint32_t _size) // producer only
+        {
+            // One slot always stays unused. When write == read the distance
+            // is 0, the -1 wraps negative and m_size-1 is selected.
+            const uint32_t dist = distance(m_write, m_read)-1;
+            const uint32_t maxSize = uint32_sels(dist, m_size-1, dist);
+            const uint32_t sizeNoSign = uint32_and(_size, 0x7FFFFFFF);
+            const uint32_t test = uint32_sub(sizeNoSign, maxSize);
+            const uint32_t size = uint32_sels(test, _size, maxSize);
+            const uint32_t advance = uint32_add(m_write, size);
+            const uint32_t write = uint32_mod(advance, m_size);
+            m_write = write;
+            return size;
+        }
+
+        /// Publish at most `_size` reserved bytes by advancing m_current.
+        /// Returns the number of bytes actually committed.
+        uint32_t commit(uint32_t _size) // producer only
+        {
+            const uint32_t maxSize = distance(m_current, m_write);
+            const uint32_t sizeNoSign = uint32_and(_size, 0x7FFFFFFF);
+            const uint32_t test = uint32_sub(sizeNoSign, maxSize);
+            const uint32_t size = uint32_sels(test, _size, maxSize);
+            const uint32_t advance = uint32_add(m_current, size);
+            const uint32_t current = uint32_mod(advance, m_size);
+            m_current = current;
+            return size;
+        }
+
+        /// Forward distance from `_from` to `_to`, wrapping at m_size.
+        uint32_t distance(uint32_t _from, uint32_t _to) const // both
+        {
+            const uint32_t diff = uint32_sub(_to, _from);
+            const uint32_t le = uint32_add(m_size, diff);
+            // A negative difference means _to is behind _from: add m_size.
+            const uint32_t result = uint32_sels(diff, le, diff);
+
+            return result;
+        }
+
+        const uint32_t m_size;
+        uint32_t m_current;
+        uint32_t m_write;
+        uint32_t m_read;
+    };
+
+    /// Same bookkeeping as RingBufferControl, except that commit() issues a
+    /// memory barrier before it moves m_current, for use with one producer
+    /// thread and one consumer thread.
+    class SpScRingBufferControl
+    {
+    public:
+        SpScRingBufferControl(uint32_t _size)
+            : m_size(_size)
+            , m_current(0)
+            , m_write(0)
+            , m_read(0)
+        {
+        }
+
+        ~SpScRingBufferControl()
+        {
+        }
+
+        /// Number of committed bytes that have not been consumed yet.
+        uint32_t available() const
+        {
+            return distance(m_read, m_current);
+        }
+
+        /// Advance the read cursor by at most `_size` bytes, clamped to the
+        /// committed amount. Returns the number of bytes actually consumed.
+        uint32_t consume(uint32_t _size) // consumer only
+        {
+            const uint32_t maxSize = distance(m_read, m_current);
+            const uint32_t sizeNoSign = uint32_and(_size, 0x7FFFFFFF);
+            const uint32_t test = uint32_sub(sizeNoSign, maxSize);
+            const uint32_t size = uint32_sels(test, _size, maxSize);
+            const uint32_t advance = uint32_add(m_read, size);
+            const uint32_t read = uint32_mod(advance, m_size);
+            m_read = read;
+            return size;
+        }
+
+        /// Advance the write cursor by at most `_size` bytes, clamped to the
+        /// free space. Returns the number of bytes actually reserved.
+        uint32_t reserve(uint32_t _size) // producer only
+        {
+            const uint32_t dist = distance(m_write, m_read)-1;
+            const uint32_t maxSize = uint32_sels(dist, m_size-1, dist);
+            const uint32_t sizeNoSign = uint32_and(_size, 0x7FFFFFFF);
+            const uint32_t test = uint32_sub(sizeNoSign, maxSize);
+            const uint32_t size = uint32_sels(test, _size, maxSize);
+            const uint32_t advance = uint32_add(m_write, size);
+            const uint32_t write = uint32_mod(advance, m_size);
+            m_write = write;
+            return size;
+        }
+
+        /// Publish at most `_size` reserved bytes by advancing m_current.
+        /// Returns the number of bytes actually committed.
+        uint32_t commit(uint32_t _size) // producer only
+        {
+            const uint32_t maxSize = distance(m_current, m_write);
+            const uint32_t sizeNoSign = uint32_and(_size, 0x7FFFFFFF);
+            const uint32_t test = uint32_sub(sizeNoSign, maxSize);
+            const uint32_t size = uint32_sels(test, _size, maxSize);
+            const uint32_t advance = uint32_add(m_current, size);
+            const uint32_t current = uint32_mod(advance, m_size);
+
+            // must commit all memory writes before moving m_current pointer
+            // once m_current pointer moves data is used by consumer thread
+            memoryBarrier();
+            m_current = current;
+            return size;
+        }
+
+        /// Forward distance from `_from` to `_to`, wrapping at m_size.
+        uint32_t distance(uint32_t _from, uint32_t _to) const // both
+        {
+            const uint32_t diff = uint32_sub(_to, _from);
+            const uint32_t le = uint32_add(m_size, diff);
+            const uint32_t result = uint32_sels(diff, le, diff);
+
+            return result;
+        }
+
+        const uint32_t m_size;
+        uint32_t m_current;
+        uint32_t m_write;
+        uint32_t m_read;
+    };
+
+    /// Reads `_size` committed bytes from `_buffer` starting at the control's
+    /// read cursor. Nothing is consumed from the control until end() is called.
+    template <typename Control>
+    class ReadRingBufferT
+    {
+    public:
+        ReadRingBufferT(Control& _control, const char* _buffer, uint32_t _size)
+            : m_control(_control)
+            , m_read(_control.m_read)
+            , m_end(m_read+_size)
+            , m_size(_size)
+            , m_buffer(_buffer)
+        {
+            BX_CHECK(_control.available() >= _size, "%d >= %d", _control.available(), _size);
+        }
+
+        ~ReadRingBufferT()
+        {
+        }
+
+        /// Release the whole window back to the control.
+        void end()
+        {
+            m_control.consume(m_size);
+        }
+
+        /// Copy `_len` bytes into `_data`, splitting the copy in two when the
+        /// range wraps past the end of the buffer.
+        void read(char* _data, uint32_t _len)
+        {
+            const uint32_t end = (m_read + _len) % m_control.m_size;
+            uint32_t wrap = 0;
+            const char* from = &m_buffer[m_read];
+
+            if (end < m_read)
+            {
+                // first part: from the cursor up to the end of the buffer
+                wrap = m_control.m_size - m_read;
+                memcpy(_data, from, wrap);
+                _data += wrap;
+                from = (const char*)&m_buffer[0];
+            }
+
+            memcpy(_data, from, _len-wrap);
+
+            m_read = end;
+        }
+
+        /// Move the local read cursor forward without copying.
+        void skip(uint32_t _len)
+        {
+            m_read += _len;
+            m_read %= m_control.m_size;
+        }
+
+    private:
+        // WriteRingBufferT::write(ReadRingBufferT&, ...) reads our cursor directly.
+        template <typename Ty>
+        friend class WriteRingBufferT;
+
+        // Non-copyable. These previously took `const Control&`, which left the
+        // implicit copy constructor available.
+        ReadRingBufferT();
+        ReadRingBufferT(const ReadRingBufferT&);
+        void operator=(const ReadRingBufferT&);
+
+        Control& m_control;
+        uint32_t m_read;
+        uint32_t m_end;
+        const uint32_t m_size;
+        const char* m_buffer;
+    };
+
+    typedef ReadRingBufferT<RingBufferControl> ReadRingBuffer;
+    typedef ReadRingBufferT<SpScRingBufferControl> SpScReadRingBuffer;
+
+    /// Reserves `_size` bytes from the control on construction and writes into
+    /// `_buffer`. The data is published when end() is called.
+    template <typename Control>
+    class WriteRingBufferT
+    {
+    public:
+        WriteRingBufferT(Control& _control, char* _buffer, uint32_t _size)
+            : m_control(_control)
+            , m_size(_size)
+            , m_buffer(_buffer)
+        {
+            uint32_t size = m_control.reserve(_size);
+            BX_CHECK(size == _size, "%d == %d", size, _size);
+            m_write = m_control.m_current;
+            m_end = m_write+_size;
+        }
+
+        ~WriteRingBufferT()
+        {
+        }
+
+        /// Commit the whole window so it becomes visible to readers.
+        void end()
+        {
+            m_control.commit(m_size);
+        }
+
+        /// Copy `_len` bytes from `_data`, splitting the copy in two when the
+        /// range wraps past the end of the buffer.
+        void write(const char* _data, uint32_t _len)
+        {
+            const uint32_t end = (m_write + _len) % m_control.m_size;
+            uint32_t wrap = 0;
+            char* to = &m_buffer[m_write];
+
+            if (end < m_write)
+            {
+                wrap = m_control.m_size - m_write;
+                memcpy(to, _data, wrap);
+                _data += wrap;
+                to = (char*)&m_buffer[0];
+            }
+
+            memcpy(to, _data, _len-wrap);
+
+            m_write = end;
+        }
+
+        /// Copy `_len` bytes out of another ring buffer's read window and
+        /// advance that reader's cursor.
+        void write(ReadRingBufferT<Control>& _read, uint32_t _len)
+        {
+            const uint32_t end = (_read.m_read + _len) % _read.m_control.m_size;
+            uint32_t wrap = 0;
+            const char* from = &_read.m_buffer[_read.m_read];
+
+            if (end < _read.m_read)
+            {
+                wrap = _read.m_control.m_size - _read.m_read;
+                write(from, wrap);
+                from = (const char*)&_read.m_buffer[0];
+            }
+
+            write(from, _len-wrap);
+
+            _read.m_read = end;
+        }
+
+        /// Move the local write cursor forward without copying.
+        void skip(uint32_t _len)
+        {
+            m_write += _len;
+            m_write %= m_control.m_size;
+        }
+
+    private:
+        WriteRingBufferT();
+        WriteRingBufferT(const WriteRingBufferT&);
+        void operator=(const WriteRingBufferT&);
+
+        Control& m_control;
+        uint32_t m_write;
+        uint32_t m_end;
+        const uint32_t m_size;
+        char* m_buffer;
+    };
+
+    typedef WriteRingBufferT<RingBufferControl> WriteRingBuffer;
+    typedef WriteRingBufferT<SpScRingBufferControl> SpScWriteRingBuffer;
+
+} // namespace bx
+
+#endif // __BX_RINGBUFFER_H__
diff --git a/include/bx/rng.h b/include/bx/rng.h
new file mode 100644
index 0000000..1bcd894
--- /dev/null
+++ b/include/bx/rng.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_RNG_H__
+#define __BX_RNG_H__
+
+#include "bx.h"
+
+namespace bx
+{
+    // George Marsaglia's MWC
+    /// Two 16-bit multiply-with-carry streams combined into one 32-bit
+    /// output. The same seeds always produce the same sequence.
+    class RngMwc
+    {
+    public:
+        RngMwc(uint32_t _z = 12345, uint32_t _w = 65435)
+            : m_z(_z)
+            , m_w(_w)
+        {
+        }
+
+        /// Restart the sequence from the given seeds.
+        void reset(uint32_t _z = 12345, uint32_t _w = 65435)
+        {
+            m_z = _z;
+            m_w = _w;
+        }
+
+        /// Advance both streams and return the next 32-bit value.
+        uint32_t gen()
+        {
+            m_z = 36969*(m_z&65535)+(m_z>>16);
+            m_w = 18000*(m_w&65535)+(m_w>>16);
+            return (m_z<<16)+m_w;
+        }
+
+    private:
+        uint32_t m_z;
+        uint32_t m_w;
+    };
+
+    // George Marsaglia's FIB
+    /// Additive (Fibonacci-style) generator with fixed seeds.
+    class RngFib
+    {
+    public:
+        RngFib()
+            : m_a(9983651)
+            , m_b(95746118)
+        {
+        }
+
+        /// Restart the sequence from the built-in seeds.
+        void reset()
+        {
+            m_a = 9983651;
+            m_b = 95746118;
+        }
+
+        /// Advance the pair (a, b) -> (b, a+b) and return the new `a`.
+        uint32_t gen()
+        {
+            m_b = m_a+m_b;
+            m_a = m_b-m_a;
+            return m_a;
+        }
+
+    private:
+        uint32_t m_a;
+        uint32_t m_b;
+    };
+
+    // George Marsaglia's SHR3
+    /// 3-shift xorshift register generator.
+    class RngShr3
+    {
+    public:
+        RngShr3(uint32_t _jsr = 34221)
+            : m_jsr(_jsr)
+        {
+        }
+
+        /// Restart the sequence from the given seed.
+        void reset(uint32_t _jsr = 34221)
+        {
+            m_jsr = _jsr;
+        }
+
+        /// Apply the three xor-shifts and return the new state.
+        uint32_t gen()
+        {
+            m_jsr ^= m_jsr<<17;
+            m_jsr ^= m_jsr>>13;
+            m_jsr ^= m_jsr<<5;
+            return m_jsr;
+        }
+
+    private:
+        uint32_t m_jsr;
+    };
+
+} // namespace bx
+
+#endif // __BX_RNG_H__
diff --git a/include/bx/sem.h b/include/bx/sem.h
new file mode 100644
index 0000000..8ca50ab
--- /dev/null
+++ b/include/bx/sem.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_SEM_H__
+#define __BX_SEM_H__
+
+#include "bx.h"
+
+#define BX_SEM_CONFIG_POSIX (BX_PLATFORM_NACL || BX_PLATFORM_ANDROID || BX_PLATFORM_LINUX)
+
+// NOTE(review): the include targets were missing from this copy of the patch.
+// They are restored from what the code below uses; confirm against the
+// repository.
+#if BX_SEM_CONFIG_POSIX
+#   include <semaphore.h> // sem_init, sem_post, sem_wait, sem_timedwait
+#   include <time.h> // timespec, clock_gettime
+#elif BX_PLATFORM_WINDOWS || BX_PLATFORM_XBOX360
+#   include <limits.h> // LONG_MAX
+#endif // BX_PLATFORM_
+
+namespace bx
+{
+#if BX_SEM_CONFIG_POSIX
+    /// Counting semaphore on top of POSIX sem_t; the initial count is 0.
+    class Semaphore
+    {
+    public:
+        Semaphore()
+        {
+            sem_init(&m_handle, 0, 0);
+        }
+
+        ~Semaphore()
+        {
+            sem_destroy(&m_handle);
+        }
+
+        /// Increment the count `_count` times.
+        void post(uint32_t _count = 1)
+        {
+            for (uint32_t ii = 0; ii < _count; ++ii)
+            {
+                sem_post(&m_handle);
+            }
+        }
+
+        /// Decrement the count, blocking while it is zero.
+        ///
+        /// @param _msecs Timeout in milliseconds; a negative value waits forever.
+        /// @returns true when the semaphore was acquired, false on timeout or error.
+        bool wait(int32_t _msecs = -1)
+        {
+#if BX_PLATFORM_NACL
+            BX_CHECK(-1 == _msecs, "NaCl doesn't support sem_timedwait at this moment.");
+            return 0 == sem_wait(&m_handle);
+#else
+            if (0 > _msecs)
+            {
+                return 0 == sem_wait(&m_handle);
+            }
+
+            // sem_timedwait() takes an absolute CLOCK_REALTIME deadline, not
+            // a relative interval, so start from the current time.
+            timespec ts;
+            clock_gettime(CLOCK_REALTIME, &ts);
+            ts.tv_sec += _msecs/1000;
+            ts.tv_nsec += (_msecs%1000)*1000000; // milliseconds -> nanoseconds
+            if (ts.tv_nsec >= 1000000000)
+            {
+                // tv_nsec must stay below one second or the call fails with EINVAL
+                ts.tv_nsec -= 1000000000;
+                ++ts.tv_sec;
+            }
+            return 0 == sem_timedwait(&m_handle, &ts);
+#endif // BX_PLATFORM_
+        }
+
+    private:
+        Semaphore(const Semaphore& _rhs); // no copy constructor
+        Semaphore& operator=(const Semaphore& _rhs); // no assignment operator
+
+        sem_t m_handle;
+    };
+
+#elif BX_PLATFORM_WINDOWS || BX_PLATFORM_XBOX360
+
+    /// Counting semaphore on top of a Win32 semaphore handle; the initial
+    /// count is 0.
+    class Semaphore
+    {
+    public:
+        Semaphore()
+        {
+            m_handle = CreateSemaphore(NULL, 0, LONG_MAX, NULL);
+            BX_CHECK(NULL != m_handle, "Failed to create Semaphore!");
+        }
+
+        ~Semaphore()
+        {
+            CloseHandle(m_handle);
+        }
+
+        /// Increase the count by `_count`.
+        void post(uint32_t _count = 1) const
+        {
+            ReleaseSemaphore(m_handle, _count, NULL);
+        }
+
+        /// Decrement the count, blocking while it is zero.
+        ///
+        /// @param _msecs Timeout in milliseconds; a negative value waits forever.
+        /// @returns true when the semaphore was acquired.
+        bool wait(int32_t _msecs = -1) const
+        {
+            DWORD milliseconds = (0 > _msecs) ? INFINITE : _msecs;
+            return WAIT_OBJECT_0 == WaitForSingleObject(m_handle, milliseconds);
+        }
+
+    private:
+        Semaphore(const Semaphore& _rhs); // no copy constructor
+        Semaphore& operator=(const Semaphore& _rhs); // no assignment operator
+
+        HANDLE m_handle;
+    };
+
+#endif // BX_PLATFORM_
+
+} // namespace bx
+
+#endif // __BX_SEM_H__
diff --git a/include/bx/spscqueue.h b/include/bx/spscqueue.h
new file mode 100644
index 0000000..700834e
--- /dev/null
+++ b/include/bx/spscqueue.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_SPSCQUEUE_H__
+#define __BX_SPSCQUEUE_H__
+
+#include <list>
+
+#include "bx.h"
+#include "cpu.h"
+#include "mutex.h"
+#include "uint32_t.h"
+
+namespace bx
+{
+    // http://drdobbs.com/article/print?articleId=210604448&siteSectionName=
+    /// Unbounded queue of `Ty*` built on a singly linked list with a dummy
+    /// head node. push() is for the producer, peek()/pop() for the consumer.
+    ///   m_first   - oldest node still allocated (producer side),
+    ///   m_divider - last node the consumer has already taken,
+    ///   m_last    - newest node.
+    /// The queue does not own the pointed-to objects.
+    template<typename Ty>
+    class SpScUnboundedQueueOptimized
+    {
+    public:
+        SpScUnboundedQueueOptimized()
+            : m_first(new Node(NULL) )
+            , m_divider(m_first)
+            , m_last(m_first)
+        {
+        }
+
+        ~SpScUnboundedQueueOptimized()
+        {
+            while (NULL != m_first)
+            {
+                Node* node = m_first;
+                m_first = node->m_next;
+                delete node;
+            }
+        }
+
+        /// Append `_ptr`, then free the nodes the consumer has already passed.
+        void push(Ty* _ptr) // producer only
+        {
+            m_last->m_next = new Node(_ptr);
+            // publish the new node
+            atomicExchangePtr((void**)&m_last, m_last->m_next);
+            while (m_first != m_divider)
+            {
+                Node* node = m_first;
+                m_first = m_first->m_next;
+                delete node;
+            }
+        }
+
+        /// Returns the oldest item without removing it, or NULL when empty.
+        Ty* peek() // consumer only
+        {
+            if (m_divider != m_last)
+            {
+                Ty* ptr = m_divider->m_next->m_ptr;
+                return ptr;
+            }
+
+            return NULL;
+        }
+
+        /// Removes and returns the oldest item, or NULL when empty.
+        Ty* pop() // consumer only
+        {
+            if (m_divider != m_last)
+            {
+                Ty* ptr = m_divider->m_next->m_ptr;
+                // hand the node back to the producer for deletion
+                atomicExchangePtr((void**)&m_divider, m_divider->m_next);
+                return ptr;
+            }
+
+            return NULL;
+        }
+
+    private:
+        SpScUnboundedQueueOptimized(const SpScUnboundedQueueOptimized& _rhs); // no copy constructor
+        SpScUnboundedQueueOptimized& operator=(const SpScUnboundedQueueOptimized& _rhs); // no assignment operator
+
+        struct Node
+        {
+            Node(Ty* _ptr)
+                : m_ptr(_ptr)
+                , m_next(NULL)
+            {
+            }
+
+            Ty* m_ptr;
+            Node* m_next;
+        };
+
+        Node* m_first;
+        Node* m_divider;
+        Node* m_last;
+    };
+
+    /// Same interface as SpScUnboundedQueueOptimized, implemented with a
+    /// std::list guarded by a mutex.
+    template<typename Ty>
+    class SpScUnboundedQueueNaive
+    {
+    public:
+        SpScUnboundedQueueNaive()
+        {
+        }
+
+        ~SpScUnboundedQueueNaive()
+        {
+            BX_CHECK(m_queue.empty(), "Queue is not empty!");
+        }
+
+        /// Append `_item` to the back of the queue.
+        void push(Ty* _item)
+        {
+            bx::LwMutexScope lock(m_mutex);
+            m_queue.push_back(_item);
+        }
+
+        /// Returns the oldest item without removing it, or NULL when empty.
+        Ty* peek()
+        {
+            bx::LwMutexScope lock(m_mutex);
+            if (!m_queue.empty() )
+            {
+                return m_queue.front();
+            }
+
+            return NULL;
+        }
+
+        /// Removes and returns the oldest item, or NULL when empty.
+        Ty* pop()
+        {
+            bx::LwMutexScope lock(m_mutex);
+            if (!m_queue.empty() )
+            {
+                Ty* item = m_queue.front();
+                m_queue.pop_front();
+                return item;
+            }
+
+            return NULL;
+        }
+
+    private:
+        bx::LwMutex m_mutex;
+        std::list<Ty*> m_queue;
+    };
+
+// SpScUnboundedQueue names whichever implementation the build selects.
+#if BX_CONFIG_SPSCQUEUE_USE_NAIVE
+#   define SpScUnboundedQueue SpScUnboundedQueueNaive
+#else
+#   define SpScUnboundedQueue SpScUnboundedQueueOptimized
+#endif // BX_CONFIG_SPSCQUEUE_USE_NAIVE
+
+} // namespace bx
+
+#endif // __BX_SPSCQUEUE_H__
diff --git a/include/bx/timer.h b/include/bx/timer.h
new file mode 100644
index 0000000..dd335f8
--- /dev/null
+++ b/include/bx/timer.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_TIMER_H__
+#define __BX_TIMER_H__
+
+#include "bx.h"
+
+// NOTE(review): the include targets were missing from this copy of the patch.
+// They are restored from the trailing comments naming the functions they
+// provide; confirm against the repository.
+#if BX_PLATFORM_ANDROID
+#   include <time.h> // clock, clock_gettime
+#elif BX_PLATFORM_NACL || BX_PLATFORM_LINUX
+#   include <sys/time.h> // gettimeofday
+#endif // BX_PLATFORM_
+
+namespace bx
+{
+    /// Counter value in ticks of getHPFrequency(), measured relative to the
+    /// first call of this function (so the first call returns 0).
+    inline int64_t getHPCounter()
+    {
+#if BX_PLATFORM_WINDOWS || BX_PLATFORM_XBOX360
+        LARGE_INTEGER li;
+        // Performance counter value may unexpectedly leap forward
+        // http://support.microsoft.com/kb/274323
+        QueryPerformanceCounter(&li);
+        int64_t i64 = li.QuadPart;
+#elif BX_PLATFORM_ANDROID
+        int64_t i64 = clock();
+#else
+        struct timeval now;
+        gettimeofday(&now, 0);
+        // Widen before multiplying: tv_sec*1000000 overflows where time_t
+        // is 32 bits wide.
+        int64_t i64 = (int64_t)now.tv_sec*1000000 + now.tv_usec;
+#endif // BX_PLATFORM_
+        // The static captures the first sample and serves as the time origin.
+        static int64_t offset = i64;
+        return i64 - offset;
+    }
+
+    /// Number of getHPCounter() ticks per second.
+    inline int64_t getHPFrequency()
+    {
+#if BX_PLATFORM_WINDOWS || BX_PLATFORM_XBOX360
+        LARGE_INTEGER li;
+        QueryPerformanceFrequency(&li);
+        return li.QuadPart;
+#elif BX_PLATFORM_ANDROID
+        return CLOCKS_PER_SEC;
+#else
+        return 1000000; // gettimeofday() resolution: microseconds
+#endif // BX_PLATFORM_
+    }
+
+} // namespace bx
+
+#endif // __BX_TIMER_H__
diff --git a/include/bx/uint32_t.h b/include/bx/uint32_t.h
new file mode 100644
index 0000000..3817c7a
--- /dev/null
+++ b/include/bx/uint32_t.h
@@ -0,0 +1,454 @@
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+// Copyright 2006 Mike Acton
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE
+
+#ifndef __BX_UINT32_T_H__
+#define __BX_UINT32_T_H__
+
+#include "bx.h"
+
+// NOTE(review): the include targets were missing from this copy of the patch.
+// They are restored from the comments next to them; confirm against the
+// repository.
+#if BX_COMPILER_MSVC
+#   if BX_PLATFORM_WINDOWS
+#       include <math.h> // math.h is included because VS otherwise reports:
+                         // warning C4985: 'ceil': attributes not present on previous declaration.
+                         // must be included before intrin.h.
+#       include <intrin.h>
+#       pragma intrinsic(_BitScanForward)
+#       pragma intrinsic(_BitScanReverse)
+#   endif // BX_PLATFORM_WINDOWS
+#endif // BX_COMPILER_MSVC
+
+namespace bx
+{
+    // Thin wrappers around 32-bit integer operations so that branch-free
+    // sequences can be written in a uniform instruction-like notation.
+    // Comparison helpers return an all-ones mask (0xffffffff) for true and
+    // 0 for false.
+
+    /// Load immediate: returns `_a` unchanged.
+    inline uint32_t uint32_li(uint32_t _a)
+    {
+        return _a;
+    }
+
+    inline uint32_t uint32_dec(uint32_t _a)
+    {
+        return _a - 1;
+    }
+
+    inline uint32_t uint32_inc(uint32_t _a)
+    {
+        return _a + 1;
+    }
+
+    inline uint32_t uint32_not(uint32_t _a)
+    {
+        return ~_a;
+    }
+
+    /// Two's complement negation.
+    inline uint32_t uint32_neg(uint32_t _a)
+    {
+        // Negate in unsigned arithmetic: negating through int32_t overflows
+        // for 0x80000000.
+        return uint32_t(0) - _a;
+    }
+
+    /// Sign extend: all ones when bit 31 of `_a` is set, 0 otherwise.
+    inline uint32_t uint32_ext(uint32_t _a)
+    {
+        return ( (int32_t)_a)>>31;
+    }
+
+    inline uint32_t uint32_and(uint32_t _a, uint32_t _b)
+    {
+        return _a & _b;
+    }
+
+    inline uint32_t uint32_xor(uint32_t _a, uint32_t _b)
+    {
+        return _a ^ _b;
+    }
+
+    /// Logical xor: 1 when exactly one argument is non-zero, 0 otherwise.
+    inline uint32_t uint32_xorl(uint32_t _a, uint32_t _b)
+    {
+        return !_a != !_b;
+    }
+
+    /// And with complement: `_a & ~_b`.
+    inline uint32_t uint32_andc(uint32_t _a, uint32_t _b)
+    {
+        return _a & ~_b;
+    }
+
+    inline uint32_t uint32_or(uint32_t _a, uint32_t _b)
+    {
+        return _a | _b;
+    }
+
+    /// Shift left logical.
+    inline uint32_t uint32_sll(uint32_t _a, int _sa)
+    {
+        return _a << _sa;
+    }
+
+    /// Shift right logical.
+    inline uint32_t uint32_srl(uint32_t _a, int _sa)
+    {
+        return _a >> _sa;
+    }
+
+    /// Shift right arithmetic (sign bit is replicated).
+    inline uint32_t uint32_sra(uint32_t _a, int _sa)
+    {
+        return ( (int32_t)_a) >> _sa;
+    }
+
+    /// Rotate left by `_sa` bits, 0 <= _sa < 32.
+    inline uint32_t uint32_rol(uint32_t _a, int _sa)
+    {
+        // Mask the complementary count so a rotate by 0 does not shift by 32.
+        return ( _a << _sa) | (_a >> ( (32-_sa) & 31) );
+    }
+
+    /// Rotate right by `_sa` bits, 0 <= _sa < 32.
+    inline uint32_t uint32_ror(uint32_t _a, int _sa)
+    {
+        // Mask the complementary count so a rotate by 0 does not shift by 32.
+        return ( _a >> _sa) | (_a << ( (32-_sa) & 31) );
+    }
+
+    inline uint32_t uint32_add(uint32_t _a, uint32_t _b)
+    {
+        return _a + _b;
+    }
+
+    inline uint32_t uint32_sub(uint32_t _a, uint32_t _b)
+    {
+        return _a - _b;
+    }
+
+    inline uint32_t uint32_mul(uint32_t _a, uint32_t _b)
+    {
+        return _a * _b;
+    }
+
+    inline uint32_t uint32_div(uint32_t _a, uint32_t _b)
+    {
+        return (_a / _b);
+    }
+
+    inline uint32_t uint32_mod(uint32_t _a, uint32_t _b)
+    {
+        return (_a % _b);
+    }
+
+    inline uint32_t uint32_cmpeq(uint32_t _a, uint32_t _b)
+    {
+        return -(_a == _b);
+    }
+
+    inline uint32_t uint32_cmpneq(uint32_t _a, uint32_t _b)
+    {
+        return -(_a != _b);
+    }
+
+    inline uint32_t uint32_cmplt(uint32_t _a, uint32_t _b)
+    {
+        return -(_a < _b);
+    }
+
+    inline uint32_t uint32_cmple(uint32_t _a, uint32_t _b)
+    {
+        return -(_a <= _b);
+    }
+
+    inline uint32_t uint32_cmpgt(uint32_t _a, uint32_t _b)
+    {
+        return -(_a > _b);
+    }
+
+    inline uint32_t uint32_cmpge(uint32_t _a, uint32_t _b)
+    {
+        return -(_a >= _b);
+    }
+
+    /// All ones when `_a` is non-zero, 0 otherwise.
+    inline uint32_t uint32_setnz(uint32_t _a)
+    {
+        return -!!_a;
+    }
+
+    /// Saturating add: clamps to 0xffffffff on overflow.
+    inline uint32_t uint32_satadd(uint32_t _a, uint32_t _b)
+    {
+        const uint32_t add = uint32_add(_a, _b);
+        // the sum wrapped if it is smaller than an operand
+        const uint32_t lt = uint32_cmplt(add, _a);
+        const uint32_t result = uint32_or(add, lt);
+
+        return result;
+    }
+
+    /// Saturating subtract: clamps to 0 on underflow.
+    inline uint32_t uint32_satsub(uint32_t _a, uint32_t _b)
+    {
+        const uint32_t sub = uint32_sub(_a, _b);
+        // the difference wrapped if it is larger than the minuend
+        const uint32_t le = uint32_cmple(sub, _a);
+        const uint32_t result = uint32_and(sub, le);
+
+        return result;
+    }
+
+    /// Saturating multiply: clamps to 0xffffffff on overflow.
+    inline uint32_t uint32_satmul(uint32_t _a, uint32_t _b)
+    {
+        const uint64_t mul = (uint64_t)_a * (uint64_t)_b;
+        const uint32_t hi = mul >> 32;
+        const uint32_t nz = uint32_setnz(hi);
+        const uint32_t result = uint32_or(uint32_t(mul), nz);
+
+        return result;
+    }
+
+    /// Select on sign: returns `_a` when bit 31 of `test` is set, else `_b`.
+    inline uint32_t uint32_sels(uint32_t test, uint32_t _a, uint32_t _b)
+    {
+        const uint32_t mask = uint32_ext(test);
+        const uint32_t sel_a = uint32_and(_a, mask);
+        const uint32_t sel_b = uint32_andc(_b, mask);
+        const uint32_t result = uint32_or(sel_a, sel_b);
+
+        return (result);
+    }
+
+    /// Select bits: takes bits of `_a` where `_mask` is set, of `_b` elsewhere.
+    inline uint32_t uint32_selb(uint32_t _mask, uint32_t _a, uint32_t _b)
+    {
+        const uint32_t sel_a = uint32_and(_a, _mask);
+        const uint32_t sel_b = uint32_andc(_b, _mask);
+        const uint32_t result = uint32_or(sel_a, sel_b);
+
+        return (result);
+    }
+
+    /// Branch-free minimum based on the sign of `_a - _b`; only meaningful
+    /// while that difference fits in 31 bits.
+    inline uint32_t uint32_imin(uint32_t _a, uint32_t _b)
+    {
+        const uint32_t a_sub_b = uint32_sub(_a, _b);
+        const uint32_t result = uint32_sels(a_sub_b, _a, _b);
+
+        return result;
+    }
+
+    /// Branch-free maximum based on the sign of `_b - _a`; only meaningful
+    /// while that difference fits in 31 bits.
+    inline uint32_t uint32_imax(uint32_t _a, uint32_t _b)
+    {
+        const uint32_t b_sub_a = uint32_sub(_b, _a);
+        const uint32_t result = uint32_sels(b_sub_a, _a, _b);
+
+        return result;
+    }
+
+    inline uint32_t uint32_min(uint32_t _a, uint32_t _b)
+    {
+        return _a > _b ? _b : _a;
+    }
+
+    inline uint32_t uint32_max(uint32_t _a, uint32_t _b)
+    {
+        return _a > _b ? _a : _b;
+    }
+
+    /// Returns `_val + 1`, or `_min` when `_val == _max`.
+    inline uint32_t uint32_incwrap(uint32_t _val, uint32_t _min, uint32_t _max)
+    {
+        const uint32_t inc = uint32_inc(_val);
+        const uint32_t max_diff = uint32_sub(_max, _val);
+        const uint32_t neg_max_diff = uint32_neg(max_diff);
+        // x | -x has bit 31 set exactly when x != 0
+        const uint32_t max_or = uint32_or(max_diff, neg_max_diff);
+        const uint32_t max_diff_nz = uint32_ext(max_or);
+        const uint32_t result = uint32_selb(max_diff_nz, inc, _min);
+
+        return result;
+    }
+
+    /// Returns `_val - 1`, or `_max` when `_val == _min`.
+    inline uint32_t uint32_decwrap(uint32_t _val, uint32_t _min, uint32_t _max)
+    {
+        const uint32_t dec = uint32_dec(_val);
+        const uint32_t min_diff = uint32_sub(_min, _val);
+        const uint32_t neg_min_diff = uint32_neg(min_diff);
+        const uint32_t min_or = uint32_or(min_diff, neg_min_diff);
+        const uint32_t min_diff_nz = uint32_ext(min_or);
+        const uint32_t result = uint32_selb(min_diff_nz, dec, _max);
+
+        return result;
+    }
+
+    /// Portable population count used when no compiler builtin is available.
+    inline uint32_t uint32_cntbits_ref(uint32_t _val)
+    {
+        const uint32_t tmp0 = uint32_srl(_val, 1);
+        const uint32_t tmp1 = uint32_and(tmp0, 0x55555555);
+        const uint32_t tmp2 = uint32_sub(_val, tmp1);
+        const uint32_t tmp3 = uint32_and(tmp2, 0xc30c30c3);
+        const uint32_t tmp4 = uint32_srl(tmp2, 2);
+        const uint32_t tmp5 = uint32_and(tmp4, 0xc30c30c3);
+        const uint32_t tmp6 = uint32_srl(tmp2, 4);
+        const uint32_t tmp7 = uint32_and(tmp6, 0xc30c30c3);
+        const uint32_t tmp8 = uint32_add(tmp3, tmp5);
+        const uint32_t tmp9 = uint32_add(tmp7, tmp8);
+        const uint32_t tmpA = uint32_srl(tmp9, 6);
+        const uint32_t tmpB = uint32_add(tmp9, tmpA);
+        const uint32_t tmpC = uint32_srl(tmpB, 12);
+        const uint32_t tmpD = uint32_srl(tmpB, 24);
+        const uint32_t tmpE = uint32_add(tmpB, tmpC);
+        const uint32_t tmpF = uint32_add(tmpD, tmpE);
+        const uint32_t result = uint32_and(tmpF, 0x3f);
+
+        return result;
+    }
+
+    /// Count number of bits set.
+    inline uint32_t uint32_cntbits(uint32_t _val)
+    {
+        // clang provides the same builtin as GCC, so it takes this path too.
+#if BX_COMPILER_GCC || BX_COMPILER_CLANG
+        return __builtin_popcount(_val);
+#elif BX_COMPILER_MSVC && BX_PLATFORM_WINDOWS
+        return __popcnt(_val);
+#else
+        return uint32_cntbits_ref(_val);
+#endif // BX_COMPILER_GCC
+    }
+
+    /// Portable leading-zero count: smears the highest set bit to the right,
+    /// then counts the zero bits that remain. Returns 32 for 0.
+    inline uint32_t uint32_cntlz_ref(uint32_t _val)
+    {
+        const uint32_t tmp0 = uint32_srl(_val, 1);
+        const uint32_t tmp1 = uint32_or(tmp0, _val);
+        const uint32_t tmp2 = uint32_srl(tmp1, 2);
+        const uint32_t tmp3 = uint32_or(tmp2, tmp1);
+        const uint32_t tmp4 = uint32_srl(tmp3, 4);
+        const uint32_t tmp5 = uint32_or(tmp4, tmp3);
+        const uint32_t tmp6 = uint32_srl(tmp5, 8);
+        const uint32_t tmp7 = uint32_or(tmp6, tmp5);
+        const uint32_t tmp8 = uint32_srl(tmp7, 16);
+        const uint32_t tmp9 = uint32_or(tmp8, tmp7);
+        const uint32_t tmpA = uint32_not(tmp9);
+        const uint32_t result = uint32_cntbits(tmpA);
+
+        return result;
+    }
+
+    /// Count number of leading zeros.
+    inline uint32_t uint32_cntlz(uint32_t _val)
+    {
+        // The builtin and the intrinsic leave the result undefined for 0;
+        // return 32 there so every path agrees with uint32_cntlz_ref().
+#if BX_COMPILER_GCC || BX_COMPILER_CLANG
+        return 0 == _val ? 32 : __builtin_clz(_val);
+#elif BX_COMPILER_MSVC && BX_PLATFORM_WINDOWS
+        unsigned long index;
+        // _BitScanReverse returns 0 when no bit is set
+        return _BitScanReverse(&index, _val) ? 31 - index : 32;
+#else
+        return uint32_cntlz_ref(_val);
+#endif // BX_COMPILER_
+    }
+
+    /// Portable trailing-zero count: `~v & (v-1)` keeps exactly the zero bits
+    /// below the lowest set bit. Returns 32 for 0.
+    inline uint32_t uint32_cnttz_ref(uint32_t _val)
+    {
+        const uint32_t tmp0 = uint32_not(_val);
+        const uint32_t tmp1 = uint32_dec(_val);
+        const uint32_t tmp2 = uint32_and(tmp0, tmp1);
+        const uint32_t result = uint32_cntbits(tmp2);
+
+        return result;
+    }
+
+    /// Count number of trailing zeros. Returns 32 for 0 on every path.
+    inline uint32_t uint32_cnttz(uint32_t _val)
+    {
+#if BX_COMPILER_GCC || BX_COMPILER_CLANG
+        return 0 == _val ? 32 : __builtin_ctz(_val);
+#elif BX_COMPILER_MSVC && BX_PLATFORM_WINDOWS
+        unsigned long index;
+        // _BitScanForward returns 0 when no bit is set
+        return _BitScanForward(&index, _val) ? index : 32;
+#else
+        return uint32_cnttz_ref(_val);
+#endif // BX_COMPILER_
+    }
+
+    // shuffle:
+    // ---- ---- ---- ---- fedc ba98 7654 3210
+    // to:
+    // -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
+    /// Spread the low 16 bits of `_a` so that one zero bit separates each.
+    inline uint32_t uint32_part1by1(uint32_t _a)
+    {
+        const uint32_t val = uint32_and(_a, 0xffff);
+
+        const uint32_t tmp0 = uint32_sll(val, 8);
+        const uint32_t tmp1 = uint32_xor(val, tmp0);
+        const uint32_t tmp2 = uint32_and(tmp1, 0x00ff00ff);
+
+        const uint32_t tmp3 = uint32_sll(tmp2, 4);
+        const uint32_t tmp4 = uint32_xor(tmp2, tmp3);
+        const uint32_t tmp5 = uint32_and(tmp4, 0x0f0f0f0f);
+
+        const uint32_t tmp6 = uint32_sll(tmp5, 2);
+        const uint32_t tmp7 = uint32_xor(tmp5, tmp6);
+        const uint32_t tmp8 = uint32_and(tmp7, 0x33333333);
+
+        const uint32_t tmp9 = uint32_sll(tmp8, 1);
+        const uint32_t tmpA = uint32_xor(tmp8, tmp9);
+        const uint32_t result = uint32_and(tmpA, 0x55555555);
+
+        return result;
+    }
+
+    // shuffle:
+    // ---- ---- ---- ---- ---- --98 7654 3210
+    // to:
+    // ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
+    /// Spread the low 10 bits of `_a` so that two zero bits separate each.
+    inline uint32_t uint32_part1by2(uint32_t _a)
+    {
+        const uint32_t val = uint32_and(_a, 0x3ff);
+
+        const uint32_t tmp0 = uint32_sll(val, 16);
+        const uint32_t tmp1 = uint32_xor(val, tmp0);
+        const uint32_t tmp2 = uint32_and(tmp1, 0xff0000ff);
+
+        const uint32_t tmp3 = uint32_sll(tmp2, 8);
+        const uint32_t tmp4 = uint32_xor(tmp2, tmp3);
+        const uint32_t tmp5 = uint32_and(tmp4, 0x0300f00f);
+
+        const uint32_t tmp6 = uint32_sll(tmp5, 4);
+        const uint32_t tmp7 = uint32_xor(tmp5, tmp6);
+        const uint32_t tmp8 = uint32_and(tmp7, 0x030c30c3);
+
+        const uint32_t tmp9 = uint32_sll(tmp8, 2);
+        const uint32_t tmpA = uint32_xor(tmp8, tmp9);
+        const uint32_t result = uint32_and(tmpA, 0x09249249);
+
+        return result;
+    }
+
+    /// All ones when `_a` is a power of two, 0 otherwise (including `_a == 0`).
+    inline uint32_t uint32_testpow2(uint32_t _a)
+    {
+        // ~a + 1 is -a; a & -a isolates the lowest set bit, which equals a
+        // only when a has a single bit set (or is 0, excluded below).
+        const uint32_t tmp0 = uint32_not(_a);
+        const uint32_t tmp1 = uint32_inc(tmp0);
+        const uint32_t tmp2 = uint32_and(_a, tmp1);
+        const uint32_t tmp3 = uint32_cmpeq(tmp2, _a);
+        const uint32_t tmp4 = uint32_cmpneq(_a, 0);
+        const uint32_t result = uint32_and(tmp3, tmp4);
+
+        return result;
+    }
+
+    /// Smallest power of two that is >= `_a`. The result wraps to 0 for
+    /// `_a == 0` and for `_a > 0x80000000`.
+    inline uint32_t uint32_nextpow2(uint32_t _a)
+    {
+        // Smear the highest set bit of (_a - 1) into every lower position,
+        // then add one.
+        const uint32_t tmp0 = uint32_dec(_a);
+        const uint32_t tmp1 = uint32_srl(tmp0, 1);
+        const uint32_t tmp2 = uint32_or(tmp0, tmp1);
+        const uint32_t tmp3 = uint32_srl(tmp2, 2);
+        const uint32_t tmp4 = uint32_or(tmp2, tmp3);
+        const uint32_t tmp5 = uint32_srl(tmp4, 4);
+        const uint32_t tmp6 = uint32_or(tmp4, tmp5);
+        const uint32_t tmp7 = uint32_srl(tmp6, 8);
+        const uint32_t tmp8 = uint32_or(tmp6, tmp7);
+        const uint32_t tmp9 = uint32_srl(tmp8, 16);
+        const uint32_t tmpA = uint32_or(tmp8, tmp9);
+        const uint32_t result = uint32_inc(tmpA);
+
+        return result;
+    }
+} // namespace bx
+
+#endif // __BX_UINT32_T_H__