Initial commit.

2026-02-17 12:42:34 +01:00 · 2012-04-03 20:17:55 -07:00
commit 4eb80393d1
30 changed files with 4389 additions and 0 deletions
--- a/22
+++ b/22
@@ -0,0 +1,22 @@
+Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice, this
+      list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimer in the documentation
+      and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+SHALL COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,37 @@
+bx
+==
+
+Base library.
+
+Contact
+-------
+
+Twitter @bkaradzic
+
+Web http://www.stuckingeometry.com
+
+License
+-------
+
+Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice, this
+      list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimer in the documentation
+      and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+SHALL COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/include/bx/blockalloc.h
+++ b/include/bx/blockalloc.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_BLOCKALLOC_H__
+#define __BX_BLOCKALLOC_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	class BlockAlloc
+	{
+	public:
+		static const uint16_t invalidIndex = 0xffff;
+		static const uint32_t minElementSize = 2;
+
+		BlockAlloc()
+			: m_data(NULL)
+			, m_num(0)
+			, m_size(0)
+			, m_numFree(0)
+			, m_freeIndex(invalidIndex)
+		{
+		}
+
+		BlockAlloc(void* _data, uint16_t _num, uint16_t _size)
+			: m_data(_data)
+			, m_num(_num)
+			, m_size(_size)
+			, m_numFree(_num)
+			, m_freeIndex(0)
+		{
+			char* data = (char*)_data;
+			uint16_t* index = (uint16_t*)_data;
+			for (uint16_t ii = 0; ii < m_num-1; ++ii)
+			{
+				*index = ii+1;
+				data += m_size;
+				index = (uint16_t*)data;
+			}
+			*index = invalidIndex;
+		}
+
+		~BlockAlloc()
+		{
+		}
+
+		void* alloc()
+		{
+			if (invalidIndex == m_freeIndex)
+			{
+				return NULL;
+			}
+
+			void* obj = ( (char*)m_data) + m_freeIndex*m_size;
+			m_freeIndex = *( (uint16_t*)obj);
+			--m_numFree;
+
+			return obj;
+		}
+
+		void free(void* _obj)
+		{
+			uint16_t index = getIndex(_obj);
+			BX_CHECK(index >= 0 && index < m_num, "index %d, m_num %d", index, m_num);
+
+			*( (uint16_t*)_obj) = m_freeIndex;
+			m_freeIndex = index;
+			++m_numFree;
+		}
+
+		uint16_t getIndex(void* _obj) const
+		{
+			return (uint16_t)( ( (char*)_obj - (char*)m_data ) / m_size);
+		}
+
+		uint16_t getNumFree() const
+		{
+			return m_numFree;
+		}
+
+		void* getFromIndex(uint16_t _index)
+		{
+			return (char*)m_data + _index*m_size;
+		}
+
+	private:
+		void* m_data;
+		uint16_t m_num;
+		uint16_t m_size;
+		uint16_t m_numFree;
+		uint16_t m_freeIndex;
+	};
+
+} // namespace bx
+
+#endif // __BX_BLOCKALLOC_H__
--- a/include/bx/bx.h
+++ b/include/bx/bx.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_H__
+#define __BX_H__
+
+#include "platform.h"
+#include "macros.h"
+
+namespace bx
+{
+}// namespace bx
+
+#ifndef BX_NAMESPACE
+#	define BX_NAMESPACE 0
+#elif BX_NAMESPACE
+using namespace bx;
+#endif // BX_NAMESPACE
+
+#endif // __BX_H__ 
+
--- a/include/bx/commandline.h
+++ b/include/bx/commandline.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_COMMANDLINE_H__
+#define __BX_COMMANDLINE_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	class CommandLine
+	{
+	public:
+		CommandLine()
+			: m_argc(__argc)
+			, m_argv(__argv)
+		{
+		}
+
+		CommandLine(int _argc, char const* const* _argv)
+			: m_argc(_argc)
+			, m_argv(_argv)
+		{
+		}
+
+		const char* findOption(const char _short, const char* _long = NULL, int _numParams = 1)
+		{
+			const char* result = _findOption(_short, _long, _numParams);
+			return result;
+		}
+
+		bool hasArg(const char _short, const char* _long = NULL)
+		{
+			const char* arg = findOption(_short, _long, 0);
+			return NULL != arg;
+		}
+
+		bool hasArg(const char* _long)
+		{
+			const char* arg = findOption('\0', _long, 0);
+			return NULL != arg;
+		}
+
+		bool hasArg(const char*& _value, const char _short, const char* _long = NULL)
+		{
+			const char* arg = findOption(_short, _long, 1);
+			_value = arg;
+			return NULL != arg;
+		}
+
+		bool hasArg(int& _value, const char _short, const char* _long = NULL)
+		{
+			const char* arg = findOption(_short, _long, 1);
+			if (NULL != arg)
+			{
+				_value = atoi(arg);
+				return true;
+			}
+
+			return false;
+		}
+
+		bool hasArg(unsigned int& _value, const char _short, const char* _long = NULL)
+		{
+			const char* arg = findOption(_short, _long, 1);
+			if (NULL != arg)
+			{
+				_value = atoi(arg);
+				return true;
+			}
+
+			return false;
+		}
+
+		bool hasArg(bool& _value, const char _short, const char* _long = NULL)
+		{
+			const char* arg = findOption(_short, _long, 1);
+			if (NULL != arg)
+			{
+				if ('0' == *arg || _stricmp(arg, "false") )
+				{
+					_value = false;
+				}
+				else if ('0' != *arg || _stricmp(arg, "true") )
+				{
+					_value = true;
+				}
+
+				return true;
+			}
+
+			return false;
+		}
+
+	private:
+		const char* _findOption(const char _short, const char* _long, int _numParams)
+		{
+			for (int ii = 0; ii < m_argc; ++ii)
+			{
+				const char* arg = m_argv[ii];
+				if ('-' == *arg)
+				{
+					++arg;
+					if (_short == *arg)
+					{
+						if (1 == strlen(arg) )
+						{
+							if (0 == _numParams)
+							{
+								return "";
+							}
+							else if (ii+_numParams < m_argc
+									&& '-' != *m_argv[ii+1] )
+							{
+								return m_argv[ii+1];
+							}
+
+							return NULL;
+						}
+					}
+					else if (NULL != _long
+							&&  '-' == *arg
+							&&  0 == _stricmp(arg+1, _long) )
+					{
+						if (0 == _numParams)
+						{
+							return "";
+						}
+						else if (ii+_numParams < m_argc
+								&&  '-' != *m_argv[ii+1] )
+						{
+							return m_argv[ii+1];
+						}
+
+						return NULL;
+					}
+				}
+			}
+
+			return NULL;
+		}
+
+		int m_argc;
+		char const* const* m_argv;
+	};
+
+} // namespace bx
+
+#endif /// __BX_COMMANDLINE_H__
--- a/include/bx/countof.h
+++ b/include/bx/countof.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_COUNTOF_H__
+#define __BX_COUNTOF_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	// http://cnicholson.net/2011/01/stupid-c-tricks-a-better-sizeof_array/
+	template<typename T, size_t N> char (&COUNTOF_REQUIRES_ARRAY_ARGUMENT(const T(&)[N]) )[N];
+#define countof(x) sizeof(bx::COUNTOF_REQUIRES_ARRAY_ARGUMENT(x) )
+
+} // namespace bx
+
+#endif // __BX_COUNTOF_H__
--- a/include/bx/cpu.h
+++ b/include/bx/cpu.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_CPU_H__
+#define __BX_CPU_H__
+
+#include "bx.h"
+
+#if BX_COMPILER_MSVC
+#	if BX_PLATFORM_XBOX360
+#		include <ppcintrinsics.h>
+#		include <xtl.h>
+#	else
+#		include <math.h> // math.h is included because VS bitches:
+						 // warning C4985: 'ceil': attributes not present on previous declaration.
+						 // must be included before intrin.h.
+#		include <intrin.h>
+#		include <windows.h>
+#	endif // !BX_PLATFORM_XBOX360
+extern "C" void _ReadBarrier();
+extern "C" void _WriteBarrier();
+extern "C" void _ReadWriteBarrier();
+#	pragma intrinsic(_ReadBarrier)
+#	pragma intrinsic(_WriteBarrier)
+#	pragma intrinsic(_ReadWriteBarrier)
+#	pragma intrinsic(_InterlockedIncrement)
+#	pragma intrinsic(_InterlockedDecrement)
+#endif // BX_COMPILER_MSVC
+
+namespace bx
+{
+#if BX_COMPILER_MSVC
+#	define BX_CACHE_LINE_ALIGN_MARKER() __declspec(align(BX_CACHE_LINE_SIZE) ) struct {}
+#else
+#	define BX_CACHE_LINE_ALIGN_MARKER() struct {} __attribute__( (__aligned__(BX_CACHE_LINE_SIZE) ) )
+#endif // BX_COMPILER_
+
+#define BX_CACHE_LINE_ALIGN(_def) BX_CACHE_LINE_ALIGN_MARKER(); _def; BX_CACHE_LINE_ALIGN_MARKER()
+
+	inline void readBarrier()
+	{
+#if BX_COMPILER_MSVC
+		_ReadBarrier();
+#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
+		asm volatile("":::"memory");
+#endif // BX_COMPILER
+	}
+
+	inline void writeBarrier()
+	{
+#if BX_COMPILER_MSVC
+		_WriteBarrier();
+#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
+		asm volatile("":::"memory");
+#endif // BX_COMPILER
+	}
+
+	inline void readWriteBarrier()
+	{
+#if BX_COMPILER_MSVC
+		_ReadWriteBarrier();
+#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
+		asm volatile("":::"memory");
+#endif // BX_COMPILER
+	}
+
+	inline void memoryBarrier()
+	{
+#if BX_PLATFORM_XBOX360
+		__lwsync();
+#elif BX_COMPILER_MSVC
+		_mm_mfence();
+#else
+		__sync_synchronize();
+//		asm volatile("mfence":::"memory");
+#endif // BX_COMPILER
+	}
+
+	inline int32_t atomicIncr(volatile void* _var)
+	{
+#if BX_COMPILER_MSVC
+		return _InterlockedIncrement( (volatile LONG*)(_var) );
+#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
+		return __sync_fetch_and_add( (volatile int32_t*)_var, 1);
+#endif // BX_COMPILER
+	}
+
+	inline int32_t atomicDecr(volatile void* _var)
+	{
+#if BX_COMPILER_MSVC
+		return _InterlockedDecrement( (volatile LONG*)(_var) );
+#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
+		return __sync_fetch_and_sub( (volatile int32_t*)_var, 1);
+#endif // BX_COMPILER
+	}
+
+	inline void* atomicExchangePtr(void** _target, void* _ptr)
+	{
+#if BX_COMPILER_MSVC
+		return InterlockedExchangePointer(_target, _ptr);
+#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
+		return __sync_lock_test_and_set(_target, _ptr);
+#endif // BX_COMPILER
+	}
+
+} // namespace bx
+
+#endif // __BX_CPU_H__
--- a/include/bx/debug.h
+++ b/include/bx/debug.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_DEBUG_H__
+#define __BX_DEBUG_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	inline void debugBreak()
+	{
+#if BX_COMPILER_MSVC
+		__debugbreak();
+#elif BX_CPU_ARM
+		asm("bkpt 0");
+#elif !BX_PLATFORM_NACL && BX_CPU_X86 && (BX_COMPILER_GCC || BX_COMPILER_CLANG)
+		// NaCl doesn't like int 3:
+		// NativeClient: NaCl module load failed: Validation failure. File violates Native Client safety rules.
+		__asm__ ("int $3");
+#else // cross platform implementation
+		int* int3 = (int*)3L;
+		*int3 = 3;
+#endif // BX
+	}
+
+} // namespace bx
+
+#endif // __BX_DEBUG_H__
--- a/include/bx/endian.h
+++ b/include/bx/endian.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_ENDIAN_H__
+#define __BX_ENDIAN_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	inline uint16_t endianSwap(uint16_t _in)
+	{
+		return (_in>>8) | (_in<<8);
+	}
+	
+	inline uint32_t endianSwap(uint32_t _in)
+	{
+		return (_in>>24) | (_in<<24)
+			 | ( (_in&0x00ff0000)>>8) | ( (_in&0x0000ff00)<<8)
+			 ;
+	}
+
+	inline uint64_t endianSwap(uint64_t _in)
+	{
+		return (_in>>56) | (_in<<56)
+			 | ( (_in&UINT64_C(0x00ff000000000000) )>>40) | ( (_in&UINT64_C(0x000000000000ff00) )<<40)
+			 | ( (_in&UINT64_C(0x0000ff0000000000) )>>24) | ( (_in&UINT64_C(0x0000000000ff0000) )<<24)
+			 | ( (_in&UINT64_C(0x000000ff00000000) )>>8)  | ( (_in&UINT64_C(0x00000000ff000000) )<<8)
+			 ;
+	}
+
+	inline int16_t endianSwap(int16_t _in)
+	{
+		return (int16_t)endianSwap( (uint16_t)_in);
+	}
+
+	inline int32_t endianSwap(int32_t _in)
+	{
+		return (int32_t)endianSwap( (uint32_t)_in);
+	}
+
+	inline int64_t endianSwap(int64_t _in)
+	{
+		return (int64_t)endianSwap( (uint64_t)_in);
+	}
+
+	template <typename Ty>
+	inline Ty littleEndian(Ty& _in)
+	{
+#if BX_CPU_ENDIAN_BIG
+		endianSwap(_in);
+#else
+		return _in;
+#endif // BX_CPU_ENDIAN_BIG
+	}
+
+	template <typename Ty>
+	inline Ty bigEndian(Ty& _in)
+	{
+#if BX_CPU_ENDIAN_LITTLE
+		return endianSwap(_in);
+#else
+		return _in;
+#endif // BX_CPU_ENDIAN_LITTLE
+	}
+
+} // namespace bx
+
+#endif // __BX_ENDIAN_H__
--- a/include/bx/float4_neon.h
+++ b/include/bx/float4_neon.h
@@ -0,0 +1,227 @@
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_FLOAT4_NEON_H__
+#define __BX_FLOAT4_NEON_H__
+
+#include <arm_neon.h>
+
+namespace bx
+{
+
+// Reference:
+// http://gcc.gnu.org/onlinedocs/gcc/ARM-NEON-Intrinsics.html
+// http://blogs.arm.com/software-enablement/161-coding-for-neon-part-1-load-and-stores/
+// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
+// http://blogs.arm.com/software-enablement/241-coding-for-neon-part-3-matrix-multiplication/
+// http://blogs.arm.com/software-enablement/277-coding-for-neon-part-4-shifting-left-and-right/
+// http://blogs.arm.com/software-enablement/684-coding-for-neon-part-5-rearranging-vectors/
+
+	typedef __builtin_neon_sf float4_t __attribute__( (__vector_size__(16) ) );
+
+#define ELEMx 0
+#define ELEMy 1
+#define ELEMz 2
+#define ELEMw 3
+#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
+			BX_FLOAT4_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \
+			{ \
+				float4_t result; \
+				result.ixyzw[0] = _a.ixyzw[ELEM##_x]; \
+				result.ixyzw[1] = _a.ixyzw[ELEM##_y]; \
+				result.ixyzw[2] = _a.ixyzw[ELEM##_z]; \
+				result.ixyzw[3] = _a.ixyzw[ELEM##_w]; \
+				return result; \
+			}
+
+#include "float4_swizzle.inl"
+
+#undef IMPLEMENT_SWIZZLE
+#undef ELEMw
+#undef ELEMz
+#undef ELEMy
+#undef ELEMx
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b)
+	{
+		return _a; //_mm_movelh_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b)
+	{
+		return _a; //_mm_movelh_ps(_b, _a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b)
+	{
+		return _a; //_mm_movehl_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b)
+	{
+		return _a; //_mm_movehl_ps(_b, _a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b)
+	{
+		return _a; //_mm_unpacklo_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b)
+	{
+		return _a; //_mm_unpacklo_ps(_b, _a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b)
+	{
+		return _a; //_mm_unpackhi_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b)
+	{
+		return _a; //_mm_unpackhi_ps(_b, _a);
+	}
+
+	BX_FLOAT4_INLINE float float4_x(float4_t _a)
+	{
+		return _a.fxyzw[0];
+	}
+
+	BX_FLOAT4_INLINE float float4_y(float4_t _a)
+	{
+		return _a.fxyzw[1];
+	}
+
+	BX_FLOAT4_INLINE float float4_z(float4_t _a)
+	{
+		return _a.fxyzw[2];
+	}
+
+	BX_FLOAT4_INLINE float float4_w(float4_t _a)
+	{
+		return _a.fxyzw[3];
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w)
+	{
+		const float32_t val[4] = {_x, _y, _z, _w};
+		return __builtin_neon_vld1v4sf(val);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
+	{
+		const uint32_t val[4] = {_x, _y, _z, _w};
+		return (float4_t)__builtin_neon_vld1v4si( (const __builtin_neon_si*)val);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_splat(float _a)
+	{
+		return __builtin_neon_vdup_nv4sf(_a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_isplat(uint32_t _a)
+	{
+		return (float4_t)__builtin_neon_vdup_nv4si( (__builtin_neon_si)_a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_zero()
+	{
+		return vdupq_n_f32(0.0f);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_add(float4_t _a, float4_t _b)
+	{
+		return vaddq_f32(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sub(float4_t _a, float4_t _b)
+	{
+		return vsubq_f32(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_mul(float4_t _a, float4_t _b)
+	{
+		return vmulq_f32(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_rcp_est(float4_t _a)
+	{
+		return vrecpeq_f32(_a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_rsqrt_est(float4_t _a)
+	{
+		return vrsqrteq_f32(_a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_and(float4_t _a, float4_t _b)
+	{
+		return (float4_t)__builtin_neon_vandv4si( (int32x4_t)_a, (int32x4_t)_b, 0);
+	}
+
+	//BX_FLOAT4_INLINE float4_t float4_andc(float4_t _a, float4_t _b)
+	//{
+	//	return _mm_andnot_ps(_b, _a);
+	//}
+
+	BX_FLOAT4_INLINE float4_t float4_or(float4_t _a, float4_t _b)
+	{
+		return (float4_t)__builtin_neon_vorrv4si( (int32x4_t)_a, (int32x4_t)_b, 0);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_iadd(float4_t _a, float4_t _b)
+	{
+		const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
+		const uint32x4_t tmp1 = vreinterpretq_u32_f32(_b);
+		const uint32x4_t add  = vaddq_u32(tmp0, tmp1);
+		const float4_t result = vreinterpretq_f32_u32(add);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_isub(float4_t _a, float4_t _b)
+	{
+		const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
+		const uint32x4_t tmp1 = vreinterpretq_u32_f32(_b);
+		const uint32x4_t sub  = vsubq_u32(tmp0, tmp1);
+		const float4_t result = vreinterpretq_f32_u32(sub);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sll(float4_t _a, int _count)
+	{
+		const uint32x4_t tmp   = vreinterpretq_u32_f32(_a);
+		const uint32x4_t shift = vshlq_n_u32(tmp, _count);
+		const float4_t result  = vreinterpretq_f32_u32(shift);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_srl(float4_t _a, int _count)
+	{
+		const uint32x4_t tmp   = vreinterpretq_i32_f32(_a);
+		const uint32x4_t shift = (uint32x4_t)__builtin_neon_vshr_nv4si( (int32x4_t)tmp, _count, 0);
+		const float4_t result  = vreinterpretq_f32_u32(shift);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sra(float4_t _a, int _count)
+	{
+		const int32x4_t a     = vreinterpretq_s32_f32(_a);
+		const int32x4_t shift = __builtin_neon_vshr_nv4si(a, _count, 1);
+		const float4_t result = vreinterpretq_f32_s32(shift);
+
+		return result;
+	}
+
+} // namespace bx
+
+#define float4_div_nr float4_div_nr_ni
+#define float4_div float4_div_nr_ni
+#include "float4_ni.h"
+
+#endif // __BX_FLOAT4_NEON_H__
--- a/include/bx/float4_ni.h
+++ b/include/bx/float4_ni.h
@@ -0,0 +1,407 @@
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_FLOAT4_NI_H__
+#define __BX_FLOAT4_NI_H__
+
+namespace bx
+{
+	BX_FLOAT4_INLINE float4_t float4_shuf_xAzC_ni(float4_t _a, float4_t _b)
+	{
+		const float4_t xAyB   = float4_shuf_xAyB(_a, _b);
+		const float4_t zCwD   = float4_shuf_zCwD(_a, _b);
+		const float4_t result = float4_shuf_xyAB(xAyB, zCwD);
+		
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_yBwD_ni(float4_t _a, float4_t _b)
+	{
+		const float4_t xAyB   = float4_shuf_xAyB(_a, _b);
+		const float4_t zCwD   = float4_shuf_zCwD(_a, _b);
+		const float4_t result = float4_shuf_zwCD(xAyB, zCwD);
+		
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_madd_ni(float4_t _a, float4_t _b, float4_t _c)
+	{
+		const float4_t mul    = float4_mul(_a, _b);
+		const float4_t result = float4_add(mul, _c);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_nmsub_ni(float4_t _a, float4_t _b, float4_t _c)
+	{
+		const float4_t mul    = float4_mul(_a, _b);
+		const float4_t result = float4_sub(_c, mul);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_div_nr_ni(float4_t _a, float4_t _b)
+	{
+		const float4_t oneish  = float4_isplat(0x3f800001);
+		const float4_t est     = float4_rcp_est(_b);
+		const float4_t iter0   = float4_mul(_a, est);
+		const float4_t tmp1    = float4_nmsub(_b, est, oneish);
+		const float4_t result  = float4_madd(tmp1, iter0, iter0);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_rcp_ni(float4_t _a)
+	{
+		const float4_t one    = float4_splat(1.0f);
+		const float4_t result = float4_div(one, _a);
+		
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_orx_ni(float4_t _a)
+	{
+		const float4_t zwxy   = float4_swiz_zwxy(_a);
+		const float4_t tmp0   = float4_or(_a, zwxy);
+		const float4_t tmp1   = float4_swiz_yyyy(_a);
+		const float4_t tmp2   = float4_or(tmp0, tmp1);
+		const float4_t mf000  = float4_ild(-1, 0, 0, 0);
+		const float4_t result = float4_and(tmp2, mf000);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_orc_ni(float4_t _a, float4_t _b)
+	{
+		const float4_t aorb   = float4_or(_a, _b);
+		const float4_t mffff  = float4_isplat(-1);
+		const float4_t result = float4_xor(aorb, mffff);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_neg_ni(float4_t _a)
+	{
+		const float4_t zero   = float4_zero();
+		const float4_t result = float4_sub(zero, _a);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_selb_ni(float4_t _mask, float4_t _a, float4_t _b)
+	{
+		const float4_t sel_a  = float4_and(_a, _mask);
+		const float4_t sel_b  = float4_andc(_b, _mask);
+		const float4_t result = float4_or(sel_a, sel_b);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sels_ni(float4_t _test, float4_t _a, float4_t _b)
+	{
+		const float4_t mask   = float4_sra(_test, 31);
+		const float4_t result = float4_selb(mask, _a, _b);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_not_ni(float4_t _a)
+	{
+		const float4_t mffff  = float4_isplat(-1);
+		const float4_t result = float4_xor(_a, mffff);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_abs_ni(float4_t _a)
+	{
+		const float4_t a_neg  = float4_neg(_a);
+		const float4_t result = float4_max(a_neg, _a);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_clamp_ni(float4_t _a, float4_t _min, float4_t _max)
+	{
+		const float4_t tmp    = float4_min(_a, _max);
+		const float4_t result = float4_max(tmp, _min);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_lerp_ni(float4_t _a, float4_t _b, float4_t _s)
+	{
+		const float4_t ba     = float4_sub(_b, _a);
+		const float4_t result = float4_madd(_s, ba, _a);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sqrt_nr_ni(float4_t _a)
+	{
+		const float4_t half   = float4_splat(0.5f);
+		const float4_t one    = float4_splat(1.0f);
+		const float4_t zero   = float4_zero();
+		const float4_t tmp0   = float4_rsqrt_est(_a);
+		const float4_t tmp1   = float4_madd(tmp0, _a, zero);
+		const float4_t tmp2   = float4_madd(tmp1, half, zero);
+		const float4_t tmp3   = float4_nmsub(tmp0, tmp1, one);
+		const float4_t result = float4_madd(tmp3, tmp2, tmp1);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_rsqrt_ni(float4_t _a)
+	{
+		const float4_t one    = float4_splat(1.0f);
+		const float4_t sqrt   = float4_sqrt(_a);
+		const float4_t result = float4_div(one, sqrt);
+		
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_rsqrt_nr_ni(float4_t _a)
+	{
+		const float4_t rsqrt           = float4_rsqrt_est(_a);
+		const float4_t iter0           = float4_mul(_a, rsqrt);
+		const float4_t iter1           = float4_mul(iter0, rsqrt);
+		const float4_t half            = float4_splat(0.5f);
+		const float4_t half_rsqrt      = float4_mul(half, rsqrt);
+		const float4_t three           = float4_splat(3.0f);
+		const float4_t three_sub_iter1 = float4_sub(three, iter1);
+		const float4_t result          = float4_mul(half_rsqrt, three_sub_iter1);
+		
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_rsqrt_carmack_ni(float4_t _a)
+	{
+		const float4_t half    = float4_splat(0.5f);
+		const float4_t ah      = float4_mul(half, _a);
+		const float4_t ashift  = float4_sra(_a, 1);
+		const float4_t magic   = float4_isplat(0x5f3759df);
+		const float4_t msuba   = float4_isub(magic, ashift);
+		const float4_t msubasq = float4_mul(msuba, msuba);
+		const float4_t tmp0    = float4_splat(1.5f);
+		const float4_t tmp1    = float4_mul(ah, msubasq);
+		const float4_t tmp2    = float4_sub(tmp0, tmp1);
+		const float4_t result  = float4_mul(msuba, tmp2);
+
+		return result;
+	}
+
+	namespace float4_logexp_detail
+	{
+		BX_FLOAT4_INLINE float4_t float4_poly0(float4_t _a, float _b)
+		{
+			return float4_splat(_b);
+		}
+
+		BX_FLOAT4_INLINE float4_t float4_poly1(float4_t _a, float _b, float _c)
+		{
+			const float4_t bbbb   = float4_splat(_b);
+			const float4_t poly0  = float4_poly0(_a, _c);
+			const float4_t result = float4_madd(poly0, _a, bbbb);
+
+			return result;
+		}
+
+		BX_FLOAT4_INLINE float4_t float4_poly2(float4_t _a, float _b, float _c, float _d)
+		{
+			const float4_t bbbb   = float4_splat(_b);
+			const float4_t poly   = float4_poly1(_a, _c, _d);
+			const float4_t result = float4_madd(poly, _a, bbbb);
+
+			return result;
+		}
+
+		BX_FLOAT4_INLINE float4_t float4_poly3(float4_t _a, float _b, float _c, float _d, float _e)
+		{
+			const float4_t bbbb   = float4_splat(_b);
+			const float4_t poly   = float4_poly2(_a, _c, _d, _e);
+			const float4_t result = float4_madd(poly, _a, bbbb);
+
+			return result;
+		}
+
+		BX_FLOAT4_INLINE float4_t float4_poly4(float4_t _a, float _b, float _c, float _d, float _e, float _f)
+		{
+			const float4_t bbbb   = float4_splat(_b);
+			const float4_t poly   = float4_poly3(_a, _c, _d, _e, _f);
+			const float4_t result = float4_madd(poly, _a, bbbb);
+
+			return result;
+		}
+
+		BX_FLOAT4_INLINE float4_t float4_poly5(float4_t _a, float _b, float _c, float _d, float _e, float _f, float _g)
+		{
+			const float4_t bbbb   = float4_splat(_b);
+			const float4_t poly   = float4_poly4(_a, _c, _d, _e, _f, _g);
+			const float4_t result = float4_madd(poly, _a, bbbb);
+
+			return result;
+		}
+
+		BX_FLOAT4_INLINE float4_t float4_logpoly(float4_t _a)
+		{
+#if 1
+			const float4_t result = float4_poly5(_a
+				, 3.11578814719469302614f, -3.32419399085241980044f
+				, 2.59883907202499966007f, -1.23152682416275988241f
+				, 0.318212422185251071475f, -0.0344359067839062357313f
+				);
+#elif 0
+			const float4_t result = float4_poly4(_a
+				, 2.8882704548164776201f, -2.52074962577807006663f
+				, 1.48116647521213171641f, -0.465725644288844778798f
+				, 0.0596515482674574969533f
+				);
+#elif 0
+			const float4_t result = float4_poly3(_a
+				, 2.61761038894603480148f, -1.75647175389045657003f
+				, 0.688243882994381274313f, -0.107254423828329604454f
+				);
+#else
+			const float4_t result = float4_poly2(_a
+				, 2.28330284476918490682f, -1.04913055217340124191f
+				, 0.204446009836232697516f
+				);
+#endif
+
+			return result;
+		}
+
+		BX_FLOAT4_INLINE float4_t float4_exppoly(float4_t _a)
+		{
+#if 1
+			const float4_t result = float4_poly5(_a
+				, 9.9999994e-1f, 6.9315308e-1f
+				, 2.4015361e-1f, 5.5826318e-2f
+				, 8.9893397e-3f, 1.8775767e-3f
+				);
+#elif 0
+			const float4_t result = float4_poly4(_a
+				, 1.0000026f, 6.9300383e-1f
+				, 2.4144275e-1f, 5.2011464e-2f
+				, 1.3534167e-2f
+				);
+#elif 0
+			const float4_t result = float4_poly3(_a
+				, 9.9992520e-1f, 6.9583356e-1f
+				, 2.2606716e-1f, 7.8024521e-2f
+				);
+#else
+			const float4_t result = float4_poly2(_a
+				, 1.0017247f, 6.5763628e-1f
+				, 3.3718944e-1f
+				);
+#endif // 0
+
+			return result;
+		}
+	} // namespace float4_internal
+
+	BX_FLOAT4_INLINE float4_t float4_log2_ni(float4_t _a)
+	{
+		const float4_t expmask  = float4_isplat(0x7f800000);
+		const float4_t mantmask = float4_isplat(0x007fffff);
+		const float4_t one      = float4_splat(1.0f);
+
+		const float4_t c127     = float4_isplat(127);
+		const float4_t aexp     = float4_and(_a, expmask);
+		const float4_t aexpsr   = float4_srl(aexp, 23);
+		const float4_t tmp0     = float4_isub(aexpsr, c127);
+		const float4_t exp      = float4_itof(tmp0);
+
+		const float4_t amask    = float4_and(_a, mantmask);
+		const float4_t mant     = float4_or(amask, one);
+
+		const float4_t poly     = float4_logexp_detail::float4_logpoly(mant);
+
+		const float4_t mandiff  = float4_sub(mant, one);
+		const float4_t result   = float4_madd(poly, mandiff, exp);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_exp2_ni(float4_t _a)
+	{
+		const float4_t min      = float4_splat( 129.0f);
+		const float4_t max      = float4_splat(-126.99999f);
+		const float4_t tmp0     = float4_min(_a, min);
+		const float4_t aaaa     = float4_max(tmp0, max);
+
+		const float4_t half     = float4_splat(0.5f);
+		const float4_t tmp2     = float4_sub(aaaa, half);
+		const float4_t ipart    = float4_ftoi(tmp2);
+		const float4_t iround   = float4_itof(ipart);
+		const float4_t fpart    = float4_sub(aaaa, iround);
+
+		const float4_t c127     = float4_isplat(127);
+		const float4_t tmp5     = float4_iadd(ipart, c127);
+		const float4_t expipart = float4_sll(tmp5, 23);
+
+		const float4_t expfpart = float4_logexp_detail::float4_exppoly(fpart);
+
+		const float4_t result   = float4_mul(expipart, expfpart);
+		
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_pow_ni(float4_t _a, float4_t _b)
+	{
+		const float4_t alog2  = float4_log2(_a);
+		const float4_t alog2b = float4_mul(alog2, _b);
+		const float4_t result = float4_exp2(alog2b);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_dot3_ni(float4_t _a, float4_t _b)
+	{
+		const float4_t xyzw   = float4_mul(_a, _b);
+		const float4_t xxxx   = float4_swiz_xxxx(xyzw);
+		const float4_t yyyy   = float4_swiz_yyyy(xyzw);
+		const float4_t zzzz   = float4_swiz_zzzz(xyzw);
+		const float4_t tmp1   = float4_add(xxxx, yyyy);
+		const float4_t result = float4_add(zzzz, tmp1);
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_cross3_ni(float4_t _a, float4_t _b)
+	{
+		const float4_t a_yzxw = float4_swiz_yzxw(_a);
+		const float4_t a_zxyw = float4_swiz_zxyw(_a);
+		const float4_t b_zxyw = float4_swiz_zxyw(_b);
+		const float4_t b_yzxw = float4_swiz_yzxw(_b);
+		const float4_t tmp    = float4_mul(a_yzxw, b_zxyw);
+		const float4_t result = float4_nmsub(a_zxyw, b_yzxw, tmp);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_normalize3_ni(float4_t _a)
+	{
+		const float4_t dot3    = float4_dot3(_a, _a);
+		const float4_t invSqrt = float4_rsqrt(dot3);
+		const float4_t result  = float4_mul(_a, invSqrt);
+		
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_dot_ni(float4_t _a, float4_t _b)
+	{
+		const float4_t xyzw   = float4_mul(_a, _b);
+		const float4_t yzwx   = float4_swiz_yzwx(xyzw);
+		const float4_t tmp0   = float4_add(xyzw, yzwx);
+		const float4_t zwxy   = float4_swiz_zwxy(tmp0);
+		const float4_t result = float4_add(tmp0, zwxy);
+
+		return result;
+	}
+
+} // namespace bx
+
+#endif // __BX_FLOAT4_NI_H__
--- a/include/bx/float4_ref.h
+++ b/include/bx/float4_ref.h
@@ -0,0 +1,522 @@
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_FLOAT4_REF_H__
+#define __BX_FLOAT4_REF_H__
+
+#include <math.h> // sqrtf
+
+namespace bx
+{
+	typedef union float4_t
+	{
+		int32_t  ixyzw[4];
+		uint32_t uxyzw[4];
+		float    fxyzw[4];
+
+	} float4_t;
+
+#define ELEMx 0
+#define ELEMy 1
+#define ELEMz 2
+#define ELEMw 3
+#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
+			BX_FLOAT4_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \
+			{ \
+				float4_t result; \
+				result.ixyzw[0] = _a.ixyzw[ELEM##_x]; \
+				result.ixyzw[1] = _a.ixyzw[ELEM##_y]; \
+				result.ixyzw[2] = _a.ixyzw[ELEM##_z]; \
+				result.ixyzw[3] = _a.ixyzw[ELEM##_w]; \
+				return result; \
+			}
+
+#include "float4_swizzle.inl"
+
+#undef IMPLEMENT_SWIZZLE
+#undef ELEMw
+#undef ELEMz
+#undef ELEMy
+#undef ELEMx
+
+#define IMPLEMENT_TEST(_xyzw, _mask) \
+			BX_FLOAT4_INLINE bool float4_test_any_##_xyzw(float4_t _test) \
+			{ \
+				uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \
+				             | ( (_test.uxyzw[2]>>31)<<2) \
+				             | ( (_test.uxyzw[1]>>31)<<1) \
+				             | (_test.uxyzw[0]>>31) \
+				             ; \
+				return 0 != (tmp&(_mask) ); \
+			} \
+			\
+			BX_FLOAT4_INLINE bool float4_test_all_##_xyzw(float4_t _test) \
+			{ \
+				uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \
+				             | ( (_test.uxyzw[2]>>31)<<2) \
+				             | ( (_test.uxyzw[1]>>31)<<1) \
+				             | (_test.uxyzw[0]>>31) \
+				             ; \
+				return (_mask) == (tmp&(_mask) ); \
+			}
+
+IMPLEMENT_TEST(x    , 0x1);
+IMPLEMENT_TEST(y    , 0x2);
+IMPLEMENT_TEST(xy   , 0x3);
+IMPLEMENT_TEST(z    , 0x4);
+IMPLEMENT_TEST(xz   , 0x5);
+IMPLEMENT_TEST(yz   , 0x6);
+IMPLEMENT_TEST(xyz  , 0x7);
+IMPLEMENT_TEST(w    , 0x8);
+IMPLEMENT_TEST(xw   , 0x9);
+IMPLEMENT_TEST(yw   , 0xa);
+IMPLEMENT_TEST(xyw  , 0xb);
+IMPLEMENT_TEST(zw   , 0xc);
+IMPLEMENT_TEST(xzw  , 0xd);
+IMPLEMENT_TEST(yzw  , 0xe);
+IMPLEMENT_TEST(xyzw , 0xf);
+
+#undef IMPLEMENT_TEST
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.uxyzw[0] = _a.uxyzw[0];
+		result.uxyzw[1] = _a.uxyzw[1];
+		result.uxyzw[2] = _b.uxyzw[0];
+		result.uxyzw[3] = _b.uxyzw[1];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.uxyzw[0] = _b.uxyzw[0];
+		result.uxyzw[1] = _b.uxyzw[1];
+		result.uxyzw[2] = _a.uxyzw[0];
+		result.uxyzw[3] = _a.uxyzw[1];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.uxyzw[0] = _b.uxyzw[2];
+		result.uxyzw[1] = _b.uxyzw[3];
+		result.uxyzw[2] = _a.uxyzw[2];
+		result.uxyzw[3] = _a.uxyzw[3];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.uxyzw[0] = _a.uxyzw[2];
+		result.uxyzw[1] = _a.uxyzw[3];
+		result.uxyzw[2] = _b.uxyzw[2];
+		result.uxyzw[3] = _b.uxyzw[3];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.uxyzw[0] = _a.uxyzw[0];
+		result.uxyzw[1] = _b.uxyzw[0];
+		result.uxyzw[2] = _a.uxyzw[1];
+		result.uxyzw[3] = _b.uxyzw[1];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.uxyzw[0] = _a.uxyzw[1];
+		result.uxyzw[1] = _b.uxyzw[1];
+		result.uxyzw[2] = _a.uxyzw[0];
+		result.uxyzw[3] = _b.uxyzw[0];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.uxyzw[0] = _a.uxyzw[2];
+		result.uxyzw[1] = _b.uxyzw[2];
+		result.uxyzw[2] = _a.uxyzw[3];
+		result.uxyzw[3] = _b.uxyzw[3];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.uxyzw[0] = _b.uxyzw[2];
+		result.uxyzw[1] = _a.uxyzw[2];
+		result.uxyzw[2] = _b.uxyzw[3];
+		result.uxyzw[3] = _a.uxyzw[3];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float float4_x(float4_t _a)
+	{
+		return _a.fxyzw[0];
+	}
+
+	BX_FLOAT4_INLINE float float4_y(float4_t _a)
+	{
+		return _a.fxyzw[1];
+	}
+
+	BX_FLOAT4_INLINE float float4_z(float4_t _a)
+	{
+		return _a.fxyzw[2];
+	}
+
+	BX_FLOAT4_INLINE float float4_w(float4_t _a)
+	{
+		return _a.fxyzw[3];
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ld(const void* _ptr)
+	{
+		return *reinterpret_cast<const float4_t*>(_ptr);
+	}
+
+	BX_FLOAT4_INLINE void float4_st(void* _ptr, float4_t _a)
+	{
+		*reinterpret_cast<float4_t*>(_ptr) = _a;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w)
+	{
+		float4_t result;
+		result.fxyzw[0] = _x;
+		result.fxyzw[1] = _y;
+		result.fxyzw[2] = _z;
+		result.fxyzw[3] = _w;
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
+	{
+		float4_t result;
+		result.uxyzw[0] = _x;
+		result.uxyzw[1] = _y;
+		result.uxyzw[2] = _z;
+		result.uxyzw[3] = _w;
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_splat(const void* _ptr)
+	{
+		float val = *reinterpret_cast<const float*>(_ptr);
+		return float4_ld(val, val, val, val);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_splat(float _a)
+	{
+		return float4_ld(_a, _a, _a, _a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_isplat(uint32_t _a)
+	{
+		return float4_ild(_a, _a, _a, _a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_zero()
+	{
+		return float4_ild(0, 0, 0, 0);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_itof(float4_t _a)
+	{
+		float4_t result;
+		result.fxyzw[0] = (float)result.ixyzw[0];
+		result.fxyzw[1] = (float)result.ixyzw[1];
+		result.fxyzw[2] = (float)result.ixyzw[2];
+		result.fxyzw[3] = (float)result.ixyzw[3];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ftoi(float4_t _a)
+	{
+		float4_t result;
+		result.ixyzw[0] = (int)result.fxyzw[0];
+		result.ixyzw[1] = (int)result.fxyzw[1];
+		result.ixyzw[2] = (int)result.fxyzw[2];
+		result.ixyzw[3] = (int)result.fxyzw[3];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_round(float4_t _a)
+	{
+		const float4_t tmp    = float4_ftoi(_a);
+		const float4_t result = float4_itof(tmp);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_add(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.fxyzw[0] = _a.fxyzw[0] + _b.fxyzw[0];
+		result.fxyzw[1] = _a.fxyzw[1] + _b.fxyzw[1];
+		result.fxyzw[2] = _a.fxyzw[2] + _b.fxyzw[2];
+		result.fxyzw[3] = _a.fxyzw[3] + _b.fxyzw[3];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sub(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.fxyzw[0] = _a.fxyzw[0] - _b.fxyzw[0];
+		result.fxyzw[1] = _a.fxyzw[1] - _b.fxyzw[1];
+		result.fxyzw[2] = _a.fxyzw[2] - _b.fxyzw[2];
+		result.fxyzw[3] = _a.fxyzw[3] - _b.fxyzw[3];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_mul(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.fxyzw[0] = _a.fxyzw[0] * _b.fxyzw[0];
+		result.fxyzw[1] = _a.fxyzw[1] * _b.fxyzw[1];
+		result.fxyzw[2] = _a.fxyzw[2] * _b.fxyzw[2];
+		result.fxyzw[3] = _a.fxyzw[3] * _b.fxyzw[3];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_div(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.fxyzw[0] = _a.fxyzw[0] * _b.fxyzw[0];
+		result.fxyzw[1] = _a.fxyzw[1] * _b.fxyzw[1];
+		result.fxyzw[2] = _a.fxyzw[2] * _b.fxyzw[2];
+		result.fxyzw[3] = _a.fxyzw[3] * _b.fxyzw[3];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_rcp_est(float4_t _a)
+	{
+		float4_t result;
+		result.fxyzw[0] = 1.0f / _a.fxyzw[0];
+		result.fxyzw[1] = 1.0f / _a.fxyzw[1];
+		result.fxyzw[2] = 1.0f / _a.fxyzw[2];
+		result.fxyzw[3] = 1.0f / _a.fxyzw[3];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sqrt(float4_t _a)
+	{
+		float4_t result;
+		result.fxyzw[0] = sqrtf(_a.fxyzw[0]);
+		result.fxyzw[1] = sqrtf(_a.fxyzw[1]);
+		result.fxyzw[2] = sqrtf(_a.fxyzw[2]);
+		result.fxyzw[3] = sqrtf(_a.fxyzw[3]);
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_rsqrt_est(float4_t _a)
+	{
+		float4_t result;
+		result.fxyzw[0] = 1.0f / sqrtf(_a.fxyzw[0]);
+		result.fxyzw[1] = 1.0f / sqrtf(_a.fxyzw[1]);
+		result.fxyzw[2] = 1.0f / sqrtf(_a.fxyzw[2]);
+		result.fxyzw[3] = 1.0f / sqrtf(_a.fxyzw[3]);
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_cmpeq(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.ixyzw[0] = _a.fxyzw[0] == _b.fxyzw[0] ? 0xffffffff : 0x0;
+		result.ixyzw[1] = _a.fxyzw[1] == _b.fxyzw[1] ? 0xffffffff : 0x0;
+		result.ixyzw[2] = _a.fxyzw[2] == _b.fxyzw[2] ? 0xffffffff : 0x0;
+		result.ixyzw[3] = _a.fxyzw[3] == _b.fxyzw[3] ? 0xffffffff : 0x0;
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_cmplt(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.ixyzw[0] = _a.fxyzw[0] < _b.fxyzw[0] ? 0xffffffff : 0x0;
+		result.ixyzw[1] = _a.fxyzw[1] < _b.fxyzw[1] ? 0xffffffff : 0x0;
+		result.ixyzw[2] = _a.fxyzw[2] < _b.fxyzw[2] ? 0xffffffff : 0x0;
+		result.ixyzw[3] = _a.fxyzw[3] < _b.fxyzw[3] ? 0xffffffff : 0x0;
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_cmple(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.ixyzw[0] = _a.fxyzw[0] <= _b.fxyzw[0] ? 0xffffffff : 0x0;
+		result.ixyzw[1] = _a.fxyzw[1] <= _b.fxyzw[1] ? 0xffffffff : 0x0;
+		result.ixyzw[2] = _a.fxyzw[2] <= _b.fxyzw[2] ? 0xffffffff : 0x0;
+		result.ixyzw[3] = _a.fxyzw[3] <= _b.fxyzw[3] ? 0xffffffff : 0x0;
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_cmpgt(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.ixyzw[0] = _a.fxyzw[0] > _b.fxyzw[0] ? 0xffffffff : 0x0;
+		result.ixyzw[1] = _a.fxyzw[1] > _b.fxyzw[1] ? 0xffffffff : 0x0;
+		result.ixyzw[2] = _a.fxyzw[2] > _b.fxyzw[2] ? 0xffffffff : 0x0;
+		result.ixyzw[3] = _a.fxyzw[3] > _b.fxyzw[3] ? 0xffffffff : 0x0;
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_cmpge(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.ixyzw[0] = _a.fxyzw[0] >= _b.fxyzw[0] ? 0xffffffff : 0x0;
+		result.ixyzw[1] = _a.fxyzw[1] >= _b.fxyzw[1] ? 0xffffffff : 0x0;
+		result.ixyzw[2] = _a.fxyzw[2] >= _b.fxyzw[2] ? 0xffffffff : 0x0;
+		result.ixyzw[3] = _a.fxyzw[3] >= _b.fxyzw[3] ? 0xffffffff : 0x0;
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_min(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.fxyzw[0] = _a.fxyzw[0] < _b.fxyzw[0] ? _a.fxyzw[0] : _b.fxyzw[0];
+		result.fxyzw[1] = _a.fxyzw[1] < _b.fxyzw[1] ? _a.fxyzw[1] : _b.fxyzw[1];
+		result.fxyzw[2] = _a.fxyzw[2] < _b.fxyzw[2] ? _a.fxyzw[2] : _b.fxyzw[2];
+		result.fxyzw[3] = _a.fxyzw[3] < _b.fxyzw[3] ? _a.fxyzw[3] : _b.fxyzw[3];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_max(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.fxyzw[0] = _a.fxyzw[0] > _b.fxyzw[0] ? _a.fxyzw[0] : _b.fxyzw[0];
+		result.fxyzw[1] = _a.fxyzw[1] > _b.fxyzw[1] ? _a.fxyzw[1] : _b.fxyzw[1];
+		result.fxyzw[2] = _a.fxyzw[2] > _b.fxyzw[2] ? _a.fxyzw[2] : _b.fxyzw[2];
+		result.fxyzw[3] = _a.fxyzw[3] > _b.fxyzw[3] ? _a.fxyzw[3] : _b.fxyzw[3];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_and(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.uxyzw[0] = _a.uxyzw[0] & _b.uxyzw[0];
+		result.uxyzw[1] = _a.uxyzw[1] & _b.uxyzw[1];
+		result.uxyzw[2] = _a.uxyzw[2] & _b.uxyzw[2];
+		result.uxyzw[3] = _a.uxyzw[3] & _b.uxyzw[3];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_andc(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.uxyzw[0] = _a.uxyzw[0] & ~_b.uxyzw[0];
+		result.uxyzw[1] = _a.uxyzw[1] & ~_b.uxyzw[1];
+		result.uxyzw[2] = _a.uxyzw[2] & ~_b.uxyzw[2];
+		result.uxyzw[3] = _a.uxyzw[3] & ~_b.uxyzw[3];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_or(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.uxyzw[0] = _a.uxyzw[0] | _b.uxyzw[0];
+		result.uxyzw[1] = _a.uxyzw[1] | _b.uxyzw[1];
+		result.uxyzw[2] = _a.uxyzw[2] | _b.uxyzw[2];
+		result.uxyzw[3] = _a.uxyzw[3] | _b.uxyzw[3];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_xor(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.uxyzw[0] = _a.uxyzw[0] ^ _b.uxyzw[0];
+		result.uxyzw[1] = _a.uxyzw[1] ^ _b.uxyzw[1];
+		result.uxyzw[2] = _a.uxyzw[2] ^ _b.uxyzw[2];
+		result.uxyzw[3] = _a.uxyzw[3] ^ _b.uxyzw[3];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sll(float4_t _a, int _count)
+	{
+		float4_t result;
+		result.uxyzw[0] = _a.uxyzw[0] << _count;
+		result.uxyzw[1] = _a.uxyzw[1] << _count;
+		result.uxyzw[2] = _a.uxyzw[2] << _count;
+		result.uxyzw[3] = _a.uxyzw[3] << _count;
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_srl(float4_t _a, int _count)
+	{
+		float4_t result;
+		result.uxyzw[0] = _a.uxyzw[0] >> _count;
+		result.uxyzw[1] = _a.uxyzw[1] >> _count;
+		result.uxyzw[2] = _a.uxyzw[2] >> _count;
+		result.uxyzw[3] = _a.uxyzw[3] >> _count;
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sra(float4_t _a, int _count)
+	{
+		float4_t result;
+		result.ixyzw[0] = _a.ixyzw[0] >> _count;
+		result.ixyzw[1] = _a.ixyzw[1] >> _count;
+		result.ixyzw[2] = _a.ixyzw[2] >> _count;
+		result.ixyzw[3] = _a.ixyzw[3] >> _count;
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_iadd(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.ixyzw[0] = _a.ixyzw[0] + _b.ixyzw[0];
+		result.ixyzw[1] = _a.ixyzw[1] + _b.ixyzw[1];
+		result.ixyzw[2] = _a.ixyzw[2] + _b.ixyzw[2];
+		result.ixyzw[3] = _a.ixyzw[3] + _b.ixyzw[3];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_isub(float4_t _a, float4_t _b)
+	{
+		float4_t result;
+		result.ixyzw[0] = _a.ixyzw[0] - _b.ixyzw[0];
+		result.ixyzw[1] = _a.ixyzw[1] - _b.ixyzw[1];
+		result.ixyzw[2] = _a.ixyzw[2] - _b.ixyzw[2];
+		result.ixyzw[3] = _a.ixyzw[3] - _b.ixyzw[3];
+		return result;
+	}
+
+} // namespace bx
+
+#define float4_shuf_xAzC float4_shuf_xAzC_ni
+#define float4_shuf_yBwD float4_shuf_yBwD_ni
+#define float4_rcp float4_rcp_ni
+#define float4_orx float4_orx_ni
+#define float4_orc float4_orc_ni
+#define float4_neg float4_neg_ni
+#define float4_madd float4_madd_ni
+#define float4_nmsub float4_nmsub_ni
+#define float4_div_nr float4_div_nr_ni
+#define float4_selb float4_selb_ni
+#define float4_sels float4_sels_ni
+#define float4_not float4_not_ni
+#define float4_abs float4_abs_ni
+#define float4_clamp float4_clamp_ni
+#define float4_lerp float4_lerp_ni
+#define float4_rsqrt float4_rsqrt_ni
+#define float4_rsqrt_nr float4_rsqrt_nr_ni
+#define float4_rsqrt_carmack float4_rsqrt_carmack_ni
+#define float4_sqrt_nr float4_sqrt_nr_ni
+#define float4_log2 float4_log2_ni
+#define float4_exp2 float4_exp2_ni
+#define float4_pow float4_pow_ni
+#define float4_cross3 float4_cross3_ni
+#define float4_normalize3 float4_normalize3_ni
+#define float4_dot3 float4_dot3_ni
+#define float4_dot float4_dot_ni
+#include "float4_ni.h"
+
+#endif // __BX_FLOAT4_REF_H__
--- a/include/bx/float4_sse.h
+++ b/include/bx/float4_sse.h
@@ -0,0 +1,400 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_FLOAT4_SSE_H__
+#define __BX_FLOAT4_SSE_H__
+
+#if !defined(__SSE2__)
+#	error "float4_t requires at least SSE2"
+#endif // !defined(__SSE2__)
+
+#include <stdint.h>
+
+#include <emmintrin.h> // __m128i
+#if defined(__SSE4_1__)
+#	include <smmintrin.h>
+#endif // defined(__SSE4_1__)
+#include <xmmintrin.h> // __m128
+
+namespace bx
+{
+
+	typedef __m128 float4_t;
+
+#define ELEMx 0
+#define ELEMy 1
+#define ELEMz 2
+#define ELEMw 3
+#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
+			BX_FLOAT4_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \
+			{ \
+				return _mm_shuffle_ps( _a, _a, _MM_SHUFFLE(ELEM##_w, ELEM##_z, ELEM##_y, ELEM##_x ) ); \
+			}
+
+#include "float4_swizzle.inl"
+
+#undef IMPLEMENT_SWIZZLE
+#undef ELEMw
+#undef ELEMz
+#undef ELEMy
+#undef ELEMx
+
+#define IMPLEMENT_TEST(_xyzw, _mask) \
+			BX_FLOAT4_INLINE bool float4_test_any_##_xyzw(float4_t _test) \
+			{ \
+				return 0x0 != (_mm_movemask_ps(_test)&(_mask) ); \
+			} \
+			\
+			BX_FLOAT4_INLINE bool float4_test_all_##_xyzw(float4_t _test) \
+			{ \
+				return (_mask) == (_mm_movemask_ps(_test)&(_mask) ); \
+			}
+
+IMPLEMENT_TEST(x    , 0x1);
+IMPLEMENT_TEST(y    , 0x2);
+IMPLEMENT_TEST(xy   , 0x3);
+IMPLEMENT_TEST(z    , 0x4);
+IMPLEMENT_TEST(xz   , 0x5);
+IMPLEMENT_TEST(yz   , 0x6);
+IMPLEMENT_TEST(xyz  , 0x7);
+IMPLEMENT_TEST(w    , 0x8);
+IMPLEMENT_TEST(xw   , 0x9);
+IMPLEMENT_TEST(yw   , 0xa);
+IMPLEMENT_TEST(xyw  , 0xb);
+IMPLEMENT_TEST(zw   , 0xc);
+IMPLEMENT_TEST(xzw  , 0xd);
+IMPLEMENT_TEST(yzw  , 0xe);
+IMPLEMENT_TEST(xyzw , 0xf);
+
+#undef IMPLEMENT_TEST
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b)
+	{
+		return _mm_movelh_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b)
+	{
+		return _mm_movelh_ps(_b, _a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b)
+	{
+		return _mm_movehl_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b)
+	{
+		return _mm_movehl_ps(_b, _a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b)
+	{
+		return _mm_unpacklo_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b)
+	{
+		return _mm_unpacklo_ps(_b, _a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b)
+	{
+		return _mm_unpackhi_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b)
+	{
+		return _mm_unpackhi_ps(_b, _a);
+	}
+
+	BX_FLOAT4_INLINE float float4_x(float4_t _a)
+	{
+		return _mm_cvtss_f32(_a);
+	}
+
+	BX_FLOAT4_INLINE float float4_y(float4_t _a)
+	{
+		const float4_t yyyy = float4_swiz_yyyy(_a);
+		const float result  = _mm_cvtss_f32(yyyy);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float float4_z(float4_t _a)
+	{
+		const float4_t zzzz = float4_swiz_zzzz(_a);
+		const float result  = _mm_cvtss_f32(zzzz);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float float4_w(float4_t _a)
+	{
+		const float4_t wwww = float4_swiz_wwww(_a);
+		const float result  = _mm_cvtss_f32(wwww);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ld(const void* _ptr)
+	{
+		return _mm_load_ps(reinterpret_cast<const float*>(_ptr) );
+	}
+
+	BX_FLOAT4_INLINE void float4_st(void* _ptr, float4_t _a)
+	{
+		_mm_store_ps(reinterpret_cast<float*>(_ptr), _a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w)
+	{
+		return _mm_set_ps(_w, _z, _y, _x);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
+	{
+		const __m128i set     = _mm_set_epi32(_w, _z, _y, _x);
+		const float4_t result = _mm_castsi128_ps(set);
+		
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_splat(const void* _ptr)
+	{
+		const float4_t x___   = _mm_load_ss(reinterpret_cast<const float*>(_ptr) );
+		const float4_t result = float4_swiz_xxxx(x___);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_splat(float _a)
+	{
+		return _mm_set1_ps(_a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_isplat(uint32_t _a)
+	{
+		const __m128i splat   = _mm_set1_epi32(_a);
+		const float4_t result = _mm_castsi128_ps(splat);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_zero()
+	{
+		return _mm_setzero_ps();
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_itof(float4_t _a)
+	{
+		const __m128i  itof   = _mm_castps_si128(_a);
+		const float4_t result = _mm_cvtepi32_ps(itof);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ftoi(float4_t _a)
+	{
+		const __m128i ftoi    = _mm_cvtps_epi32(_a);
+		const float4_t result = _mm_castsi128_ps(ftoi);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_round(float4_t _a)
+	{
+#if defined(__SSE4_1__)
+		return _mm_round_ps(_a, _MM_FROUND_NINT);
+#else
+		const __m128i round   = _mm_cvtps_epi32(_a);
+		const float4_t result = _mm_cvtepi32_ps(round);
+
+		return result;
+#endif // defined(__SSE4_1__)
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_add(float4_t _a, float4_t _b)
+	{
+		return _mm_add_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sub(float4_t _a, float4_t _b)
+	{
+		return _mm_sub_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_mul(float4_t _a, float4_t _b)
+	{
+		return _mm_mul_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_div(float4_t _a, float4_t _b)
+	{
+		return _mm_div_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_rcp_est(float4_t _a)
+	{
+		return _mm_rcp_ps(_a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sqrt(float4_t _a)
+	{
+		return _mm_sqrt_ps(_a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_rsqrt_est(float4_t _a)
+	{
+		return _mm_rsqrt_ps(_a);
+	}
+
+#if defined(__SSE4_1__)
+	BX_FLOAT4_INLINE float4_t float4_dot3(float4_t _a, float4_t _b)
+	{
+		return _mm_dp_ps(_a, _b, 0x77);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_dot(float4_t _a, float4_t _b)
+	{
+		return _mm_dp_ps(_a, _b, 0xFF);
+	}
+#endif // defined(__SSE4__)
+
+	BX_FLOAT4_INLINE float4_t float4_cmpeq(float4_t _a, float4_t _b)
+	{
+		return _mm_cmpeq_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_cmplt(float4_t _a, float4_t _b)
+	{
+		return _mm_cmplt_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_cmple(float4_t _a, float4_t _b)
+	{
+		return _mm_cmple_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_cmpgt(float4_t _a, float4_t _b)
+	{
+		return _mm_cmpgt_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_cmpge(float4_t _a, float4_t _b)
+	{
+		return _mm_cmpge_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_min(float4_t _a, float4_t _b)
+	{
+		return _mm_min_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_max(float4_t _a, float4_t _b)
+	{
+		return _mm_max_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_and(float4_t _a, float4_t _b)
+	{
+		return _mm_and_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_andc(float4_t _a, float4_t _b)
+	{
+		return _mm_andnot_ps(_b, _a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_or(float4_t _a, float4_t _b)
+	{
+		return _mm_or_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_xor(float4_t _a, float4_t _b)
+	{
+		return _mm_xor_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sll(float4_t _a, int _count)
+	{
+		const __m128i a       = _mm_castps_si128(_a);
+		const __m128i shift   = _mm_slli_epi32(a, _count);
+		const float4_t result = _mm_castsi128_ps(shift);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_srl(float4_t _a, int _count)
+	{
+		const __m128i a       = _mm_castps_si128(_a);
+		const __m128i shift   = _mm_srli_epi32(a, _count);
+		const float4_t result = _mm_castsi128_ps(shift);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sra(float4_t _a, int _count)
+	{
+		const __m128i a       = _mm_castps_si128(_a);
+		const __m128i shift   = _mm_srai_epi32(a, _count);
+		const float4_t result = _mm_castsi128_ps(shift);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_iadd(float4_t _a, float4_t _b)
+	{
+		const __m128i a       = _mm_castps_si128(_a);
+		const __m128i b       = _mm_castps_si128(_b);
+		const __m128i add     = _mm_add_epi32(a, b);
+		const float4_t result = _mm_castsi128_ps(add);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_isub(float4_t _a, float4_t _b)
+	{
+		const __m128i a       = _mm_castps_si128(_a);
+		const __m128i b       = _mm_castps_si128(_b);
+		const __m128i sub     = _mm_sub_epi32(a, b);
+		const float4_t result = _mm_castsi128_ps(sub);
+
+		return result;
+	}
+
+} // namespace bx
+
+#define float4_shuf_xAzC float4_shuf_xAzC_ni
+#define float4_shuf_yBwD float4_shuf_yBwD_ni
+#define float4_rcp float4_rcp_ni
+#define float4_orx float4_orx_ni
+#define float4_orc float4_orc_ni
+#define float4_neg float4_neg_ni
+#define float4_madd float4_madd_ni
+#define float4_nmsub float4_nmsub_ni
+#define float4_div_nr float4_div_nr_ni
+#define float4_selb float4_selb_ni
+#define float4_sels float4_sels_ni
+#define float4_not float4_not_ni
+#define float4_abs float4_abs_ni
+#define float4_clamp float4_clamp_ni
+#define float4_lerp float4_lerp_ni
+#define float4_rsqrt float4_rsqrt_ni
+#define float4_rsqrt_nr float4_rsqrt_nr_ni
+#define float4_rsqrt_carmack float4_rsqrt_carmack_ni
+#define float4_sqrt_nr float4_sqrt_nr_ni
+#define float4_log2 float4_log2_ni
+#define float4_exp2 float4_exp2_ni
+#define float4_pow float4_pow_ni
+#define float4_cross3 float4_cross3_ni
+#define float4_normalize3 float4_normalize3_ni
+#if !defined(__SSE4_1__)
+#define float4_dot3 float4_dot3_ni
+#define float4_dot float4_dot_ni
+#endif // defined(__SSE4_1__)
+#include "float4_ni.h"
+
+#endif // __FLOAT4_SSE_H__
--- a/include/bx/float4_swizzle.inl
+++ b/include/bx/float4_swizzle.inl
@@ -0,0 +1,266 @@
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_FLOAT4_T_H__
+#	error "xmacro file, must be included from float4_*.h"
+#endif // __BX_FLOAT4_T_H__
+
+// included from float4_t.h
+IMPLEMENT_SWIZZLE(x, x, x, x)
+IMPLEMENT_SWIZZLE(x, x, x, y)
+IMPLEMENT_SWIZZLE(x, x, x, z)
+IMPLEMENT_SWIZZLE(x, x, x, w)
+IMPLEMENT_SWIZZLE(x, x, y, x)
+IMPLEMENT_SWIZZLE(x, x, y, y)
+IMPLEMENT_SWIZZLE(x, x, y, z)
+IMPLEMENT_SWIZZLE(x, x, y, w)
+IMPLEMENT_SWIZZLE(x, x, z, x)
+IMPLEMENT_SWIZZLE(x, x, z, y)
+IMPLEMENT_SWIZZLE(x, x, z, z)
+IMPLEMENT_SWIZZLE(x, x, z, w)
+IMPLEMENT_SWIZZLE(x, x, w, x)
+IMPLEMENT_SWIZZLE(x, x, w, y)
+IMPLEMENT_SWIZZLE(x, x, w, z)
+IMPLEMENT_SWIZZLE(x, x, w, w)
+IMPLEMENT_SWIZZLE(x, y, x, x)
+IMPLEMENT_SWIZZLE(x, y, x, y)
+IMPLEMENT_SWIZZLE(x, y, x, z)
+IMPLEMENT_SWIZZLE(x, y, x, w)
+IMPLEMENT_SWIZZLE(x, y, y, x)
+IMPLEMENT_SWIZZLE(x, y, y, y)
+IMPLEMENT_SWIZZLE(x, y, y, z)
+IMPLEMENT_SWIZZLE(x, y, y, w)
+IMPLEMENT_SWIZZLE(x, y, z, x)
+IMPLEMENT_SWIZZLE(x, y, z, y)
+IMPLEMENT_SWIZZLE(x, y, z, z)
+// IMPLEMENT_SWIZZLE(x, y, z, w)
+IMPLEMENT_SWIZZLE(x, y, w, x)
+IMPLEMENT_SWIZZLE(x, y, w, y)
+IMPLEMENT_SWIZZLE(x, y, w, z)
+IMPLEMENT_SWIZZLE(x, y, w, w)
+IMPLEMENT_SWIZZLE(x, z, x, x)
+IMPLEMENT_SWIZZLE(x, z, x, y)
+IMPLEMENT_SWIZZLE(x, z, x, z)
+IMPLEMENT_SWIZZLE(x, z, x, w)
+IMPLEMENT_SWIZZLE(x, z, y, x)
+IMPLEMENT_SWIZZLE(x, z, y, y)
+IMPLEMENT_SWIZZLE(x, z, y, z)
+IMPLEMENT_SWIZZLE(x, z, y, w)
+IMPLEMENT_SWIZZLE(x, z, z, x)
+IMPLEMENT_SWIZZLE(x, z, z, y)
+IMPLEMENT_SWIZZLE(x, z, z, z)
+IMPLEMENT_SWIZZLE(x, z, z, w)
+IMPLEMENT_SWIZZLE(x, z, w, x)
+IMPLEMENT_SWIZZLE(x, z, w, y)
+IMPLEMENT_SWIZZLE(x, z, w, z)
+IMPLEMENT_SWIZZLE(x, z, w, w)
+IMPLEMENT_SWIZZLE(x, w, x, x)
+IMPLEMENT_SWIZZLE(x, w, x, y)
+IMPLEMENT_SWIZZLE(x, w, x, z)
+IMPLEMENT_SWIZZLE(x, w, x, w)
+IMPLEMENT_SWIZZLE(x, w, y, x)
+IMPLEMENT_SWIZZLE(x, w, y, y)
+IMPLEMENT_SWIZZLE(x, w, y, z)
+IMPLEMENT_SWIZZLE(x, w, y, w)
+IMPLEMENT_SWIZZLE(x, w, z, x)
+IMPLEMENT_SWIZZLE(x, w, z, y)
+IMPLEMENT_SWIZZLE(x, w, z, z)
+IMPLEMENT_SWIZZLE(x, w, z, w)
+IMPLEMENT_SWIZZLE(x, w, w, x)
+IMPLEMENT_SWIZZLE(x, w, w, y)
+IMPLEMENT_SWIZZLE(x, w, w, z)
+IMPLEMENT_SWIZZLE(x, w, w, w)
+IMPLEMENT_SWIZZLE(y, x, x, x)
+IMPLEMENT_SWIZZLE(y, x, x, y)
+IMPLEMENT_SWIZZLE(y, x, x, z)
+IMPLEMENT_SWIZZLE(y, x, x, w)
+IMPLEMENT_SWIZZLE(y, x, y, x)
+IMPLEMENT_SWIZZLE(y, x, y, y)
+IMPLEMENT_SWIZZLE(y, x, y, z)
+IMPLEMENT_SWIZZLE(y, x, y, w)
+IMPLEMENT_SWIZZLE(y, x, z, x)
+IMPLEMENT_SWIZZLE(y, x, z, y)
+IMPLEMENT_SWIZZLE(y, x, z, z)
+IMPLEMENT_SWIZZLE(y, x, z, w)
+IMPLEMENT_SWIZZLE(y, x, w, x)
+IMPLEMENT_SWIZZLE(y, x, w, y)
+IMPLEMENT_SWIZZLE(y, x, w, z)
+IMPLEMENT_SWIZZLE(y, x, w, w)
+IMPLEMENT_SWIZZLE(y, y, x, x)
+IMPLEMENT_SWIZZLE(y, y, x, y)
+IMPLEMENT_SWIZZLE(y, y, x, z)
+IMPLEMENT_SWIZZLE(y, y, x, w)
+IMPLEMENT_SWIZZLE(y, y, y, x)
+IMPLEMENT_SWIZZLE(y, y, y, y)
+IMPLEMENT_SWIZZLE(y, y, y, z)
+IMPLEMENT_SWIZZLE(y, y, y, w)
+IMPLEMENT_SWIZZLE(y, y, z, x)
+IMPLEMENT_SWIZZLE(y, y, z, y)
+IMPLEMENT_SWIZZLE(y, y, z, z)
+IMPLEMENT_SWIZZLE(y, y, z, w)
+IMPLEMENT_SWIZZLE(y, y, w, x)
+IMPLEMENT_SWIZZLE(y, y, w, y)
+IMPLEMENT_SWIZZLE(y, y, w, z)
+IMPLEMENT_SWIZZLE(y, y, w, w)
+IMPLEMENT_SWIZZLE(y, z, x, x)
+IMPLEMENT_SWIZZLE(y, z, x, y)
+IMPLEMENT_SWIZZLE(y, z, x, z)
+IMPLEMENT_SWIZZLE(y, z, x, w)
+IMPLEMENT_SWIZZLE(y, z, y, x)
+IMPLEMENT_SWIZZLE(y, z, y, y)
+IMPLEMENT_SWIZZLE(y, z, y, z)
+IMPLEMENT_SWIZZLE(y, z, y, w)
+IMPLEMENT_SWIZZLE(y, z, z, x)
+IMPLEMENT_SWIZZLE(y, z, z, y)
+IMPLEMENT_SWIZZLE(y, z, z, z)
+IMPLEMENT_SWIZZLE(y, z, z, w)
+IMPLEMENT_SWIZZLE(y, z, w, x)
+IMPLEMENT_SWIZZLE(y, z, w, y)
+IMPLEMENT_SWIZZLE(y, z, w, z)
+IMPLEMENT_SWIZZLE(y, z, w, w)
+IMPLEMENT_SWIZZLE(y, w, x, x)
+IMPLEMENT_SWIZZLE(y, w, x, y)
+IMPLEMENT_SWIZZLE(y, w, x, z)
+IMPLEMENT_SWIZZLE(y, w, x, w)
+IMPLEMENT_SWIZZLE(y, w, y, x)
+IMPLEMENT_SWIZZLE(y, w, y, y)
+IMPLEMENT_SWIZZLE(y, w, y, z)
+IMPLEMENT_SWIZZLE(y, w, y, w)
+IMPLEMENT_SWIZZLE(y, w, z, x)
+IMPLEMENT_SWIZZLE(y, w, z, y)
+IMPLEMENT_SWIZZLE(y, w, z, z)
+IMPLEMENT_SWIZZLE(y, w, z, w)
+IMPLEMENT_SWIZZLE(y, w, w, x)
+IMPLEMENT_SWIZZLE(y, w, w, y)
+IMPLEMENT_SWIZZLE(y, w, w, z)
+IMPLEMENT_SWIZZLE(y, w, w, w)
+IMPLEMENT_SWIZZLE(z, x, x, x)
+IMPLEMENT_SWIZZLE(z, x, x, y)
+IMPLEMENT_SWIZZLE(z, x, x, z)
+IMPLEMENT_SWIZZLE(z, x, x, w)
+IMPLEMENT_SWIZZLE(z, x, y, x)
+IMPLEMENT_SWIZZLE(z, x, y, y)
+IMPLEMENT_SWIZZLE(z, x, y, z)
+IMPLEMENT_SWIZZLE(z, x, y, w)
+IMPLEMENT_SWIZZLE(z, x, z, x)
+IMPLEMENT_SWIZZLE(z, x, z, y)
+IMPLEMENT_SWIZZLE(z, x, z, z)
+IMPLEMENT_SWIZZLE(z, x, z, w)
+IMPLEMENT_SWIZZLE(z, x, w, x)
+IMPLEMENT_SWIZZLE(z, x, w, y)
+IMPLEMENT_SWIZZLE(z, x, w, z)
+IMPLEMENT_SWIZZLE(z, x, w, w)
+IMPLEMENT_SWIZZLE(z, y, x, x)
+IMPLEMENT_SWIZZLE(z, y, x, y)
+IMPLEMENT_SWIZZLE(z, y, x, z)
+IMPLEMENT_SWIZZLE(z, y, x, w)
+IMPLEMENT_SWIZZLE(z, y, y, x)
+IMPLEMENT_SWIZZLE(z, y, y, y)
+IMPLEMENT_SWIZZLE(z, y, y, z)
+IMPLEMENT_SWIZZLE(z, y, y, w)
+IMPLEMENT_SWIZZLE(z, y, z, x)
+IMPLEMENT_SWIZZLE(z, y, z, y)
+IMPLEMENT_SWIZZLE(z, y, z, z)
+IMPLEMENT_SWIZZLE(z, y, z, w)
+IMPLEMENT_SWIZZLE(z, y, w, x)
+IMPLEMENT_SWIZZLE(z, y, w, y)
+IMPLEMENT_SWIZZLE(z, y, w, z)
+IMPLEMENT_SWIZZLE(z, y, w, w)
+IMPLEMENT_SWIZZLE(z, z, x, x)
+IMPLEMENT_SWIZZLE(z, z, x, y)
+IMPLEMENT_SWIZZLE(z, z, x, z)
+IMPLEMENT_SWIZZLE(z, z, x, w)
+IMPLEMENT_SWIZZLE(z, z, y, x)
+IMPLEMENT_SWIZZLE(z, z, y, y)
+IMPLEMENT_SWIZZLE(z, z, y, z)
+IMPLEMENT_SWIZZLE(z, z, y, w)
+IMPLEMENT_SWIZZLE(z, z, z, x)
+IMPLEMENT_SWIZZLE(z, z, z, y)
+IMPLEMENT_SWIZZLE(z, z, z, z)
+IMPLEMENT_SWIZZLE(z, z, z, w)
+IMPLEMENT_SWIZZLE(z, z, w, x)
+IMPLEMENT_SWIZZLE(z, z, w, y)
+IMPLEMENT_SWIZZLE(z, z, w, z)
+IMPLEMENT_SWIZZLE(z, z, w, w)
+IMPLEMENT_SWIZZLE(z, w, x, x)
+IMPLEMENT_SWIZZLE(z, w, x, y)
+IMPLEMENT_SWIZZLE(z, w, x, z)
+IMPLEMENT_SWIZZLE(z, w, x, w)
+IMPLEMENT_SWIZZLE(z, w, y, x)
+IMPLEMENT_SWIZZLE(z, w, y, y)
+IMPLEMENT_SWIZZLE(z, w, y, z)
+IMPLEMENT_SWIZZLE(z, w, y, w)
+IMPLEMENT_SWIZZLE(z, w, z, x)
+IMPLEMENT_SWIZZLE(z, w, z, y)
+IMPLEMENT_SWIZZLE(z, w, z, z)
+IMPLEMENT_SWIZZLE(z, w, z, w)
+IMPLEMENT_SWIZZLE(z, w, w, x)
+IMPLEMENT_SWIZZLE(z, w, w, y)
+IMPLEMENT_SWIZZLE(z, w, w, z)
+IMPLEMENT_SWIZZLE(z, w, w, w)
+IMPLEMENT_SWIZZLE(w, x, x, x)
+IMPLEMENT_SWIZZLE(w, x, x, y)
+IMPLEMENT_SWIZZLE(w, x, x, z)
+IMPLEMENT_SWIZZLE(w, x, x, w)
+IMPLEMENT_SWIZZLE(w, x, y, x)
+IMPLEMENT_SWIZZLE(w, x, y, y)
+IMPLEMENT_SWIZZLE(w, x, y, z)
+IMPLEMENT_SWIZZLE(w, x, y, w)
+IMPLEMENT_SWIZZLE(w, x, z, x)
+IMPLEMENT_SWIZZLE(w, x, z, y)
+IMPLEMENT_SWIZZLE(w, x, z, z)
+IMPLEMENT_SWIZZLE(w, x, z, w)
+IMPLEMENT_SWIZZLE(w, x, w, x)
+IMPLEMENT_SWIZZLE(w, x, w, y)
+IMPLEMENT_SWIZZLE(w, x, w, z)
+IMPLEMENT_SWIZZLE(w, x, w, w)
+IMPLEMENT_SWIZZLE(w, y, x, x)
+IMPLEMENT_SWIZZLE(w, y, x, y)
+IMPLEMENT_SWIZZLE(w, y, x, z)
+IMPLEMENT_SWIZZLE(w, y, x, w)
+IMPLEMENT_SWIZZLE(w, y, y, x)
+IMPLEMENT_SWIZZLE(w, y, y, y)
+IMPLEMENT_SWIZZLE(w, y, y, z)
+IMPLEMENT_SWIZZLE(w, y, y, w)
+IMPLEMENT_SWIZZLE(w, y, z, x)
+IMPLEMENT_SWIZZLE(w, y, z, y)
+IMPLEMENT_SWIZZLE(w, y, z, z)
+IMPLEMENT_SWIZZLE(w, y, z, w)
+IMPLEMENT_SWIZZLE(w, y, w, x)
+IMPLEMENT_SWIZZLE(w, y, w, y)
+IMPLEMENT_SWIZZLE(w, y, w, z)
+IMPLEMENT_SWIZZLE(w, y, w, w)
+IMPLEMENT_SWIZZLE(w, z, x, x)
+IMPLEMENT_SWIZZLE(w, z, x, y)
+IMPLEMENT_SWIZZLE(w, z, x, z)
+IMPLEMENT_SWIZZLE(w, z, x, w)
+IMPLEMENT_SWIZZLE(w, z, y, x)
+IMPLEMENT_SWIZZLE(w, z, y, y)
+IMPLEMENT_SWIZZLE(w, z, y, z)
+IMPLEMENT_SWIZZLE(w, z, y, w)
+IMPLEMENT_SWIZZLE(w, z, z, x)
+IMPLEMENT_SWIZZLE(w, z, z, y)
+IMPLEMENT_SWIZZLE(w, z, z, z)
+IMPLEMENT_SWIZZLE(w, z, z, w)
+IMPLEMENT_SWIZZLE(w, z, w, x)
+IMPLEMENT_SWIZZLE(w, z, w, y)
+IMPLEMENT_SWIZZLE(w, z, w, z)
+IMPLEMENT_SWIZZLE(w, z, w, w)
+IMPLEMENT_SWIZZLE(w, w, x, x)
+IMPLEMENT_SWIZZLE(w, w, x, y)
+IMPLEMENT_SWIZZLE(w, w, x, z)
+IMPLEMENT_SWIZZLE(w, w, x, w)
+IMPLEMENT_SWIZZLE(w, w, y, x)
+IMPLEMENT_SWIZZLE(w, w, y, y)
+IMPLEMENT_SWIZZLE(w, w, y, z)
+IMPLEMENT_SWIZZLE(w, w, y, w)
+IMPLEMENT_SWIZZLE(w, w, z, x)
+IMPLEMENT_SWIZZLE(w, w, z, y)
+IMPLEMENT_SWIZZLE(w, w, z, z)
+IMPLEMENT_SWIZZLE(w, w, z, w)
+IMPLEMENT_SWIZZLE(w, w, w, x)
+IMPLEMENT_SWIZZLE(w, w, w, y)
+IMPLEMENT_SWIZZLE(w, w, w, z)
+IMPLEMENT_SWIZZLE(w, w, w, w)
--- a/include/bx/float4_t.h
+++ b/include/bx/float4_t.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_FLOAT4_T_H__
+#define __BX_FLOAT4_T_H__
+
+#include <stdint.h>
+#include "bx.h"
+
+#define BX_FLOAT4_INLINE BX_FORCE_INLINE
+
+#if 0 // defined(__SSE2__)
+#	include "float4_sse.h"
+#elif 0 // __ARM_NEON__
+#	include "float4_neon.h"
+#else
+#	include "float4_ref.h"
+#endif //
+
+#endif // __BX_FLOAT4_T_H__
--- a/include/bx/float4x4_t.h
+++ b/include/bx/float4x4_t.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_FLOAT4X4_H__
+#define __BX_FLOAT4x4_H__
+
+#include "float4_t.h"
+
+namespace bx
+{
+	typedef BX_ALIGN_STRUCT_16(struct)
+	{
+		float4_t col[4];
+
+	} float4x4_t;
+
+	BX_FLOAT4_INLINE float4_t float4_mul_xyz1(float4_t _a, const float4x4_t& _b)
+	{
+		const float4_t xxxx   = float4_swiz_xxxx(_a);
+		const float4_t yyyy   = float4_swiz_yyyy(_a);
+		const float4_t zzzz   = float4_swiz_zzzz(_a);
+		const float4_t col0   = float4_mul(_b.col[0], xxxx);
+		const float4_t col1   = float4_mul(_b.col[1], yyyy);
+		const float4_t col2   = float4_madd(_b.col[2], zzzz, col0);
+		const float4_t col3   = float4_add(_b.col[3], col1);
+		const float4_t result = float4_add(col2, col3);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_mul(float4_t _a, const float4x4_t& _b)
+	{
+		const float4_t xxxx   = float4_swiz_xxxx(_a);
+		const float4_t yyyy   = float4_swiz_yyyy(_a);
+		const float4_t zzzz   = float4_swiz_zzzz(_a);
+		const float4_t wwww   = float4_swiz_wwww(_a);
+		const float4_t col0   = float4_mul(_b.col[0], xxxx);
+		const float4_t col1   = float4_mul(_b.col[1], yyyy);
+		const float4_t col2   = float4_madd(_b.col[2], zzzz, col0);
+		const float4_t col3   = float4_madd(_b.col[3], wwww, col1);
+		const float4_t result = float4_add(col2, col3);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4x4_t float4x4_mul(const float4x4_t& _a, const float4x4_t& _b)
+	{
+		float4x4_t result;
+		result.col[0] = float4_mul(_a.col[0], _b);
+		result.col[1] = float4_mul(_a.col[1], _b);
+		result.col[2] = float4_mul(_a.col[2], _b);
+		result.col[3] = float4_mul(_a.col[3], _b);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4x4_t float4x4_transpose(const float4x4_t& _mtx)
+	{
+		const float4_t aibj = float4_shuf_xAyB(_mtx.col[0], _mtx.col[2]); // aibj
+		const float4_t emfn = float4_shuf_xAyB(_mtx.col[1], _mtx.col[3]); // emfn
+		const float4_t ckdl = float4_shuf_zCwD(_mtx.col[0], _mtx.col[2]); // ckdl
+		const float4_t gohp = float4_shuf_zCwD(_mtx.col[1], _mtx.col[3]); // gohp
+		float4x4_t result;
+		result.col[0] = float4_shuf_xAyB(aibj, emfn); // aeim
+		result.col[1] = float4_shuf_zCwD(aibj, emfn); // bfjn
+		result.col[2] = float4_shuf_xAyB(ckdl, gohp); // cgko
+		result.col[3] = float4_shuf_zCwD(ckdl, gohp); // dhlp
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4x4_t float4x4_inverse(const float4x4_t& _a)
+	{
+		const float4_t tmp0 = float4_shuf_xAzC(_a.col[0], _a.col[1]);
+		const float4_t tmp1 = float4_shuf_xAzC(_a.col[2], _a.col[3]);
+		const float4_t tmp2 = float4_shuf_yBwD(_a.col[0], _a.col[1]);
+		const float4_t tmp3 = float4_shuf_yBwD(_a.col[2], _a.col[3]);
+		const float4_t t0   = float4_shuf_xyAB(tmp0, tmp1);
+		const float4_t t1   = float4_shuf_xyAB(tmp3, tmp2);
+		const float4_t t2   = float4_shuf_zwCD(tmp0, tmp1);
+		const float4_t t3   = float4_shuf_zwCD(tmp3, tmp2);
+
+		const float4_t t23 = float4_mul(t2, t3);
+		const float4_t t23_yxwz = float4_swiz_yxwz(t23);
+		const float4_t t23_wzyx = float4_swiz_wzyx(t23);
+
+		float4_t cof0, cof1, cof2, cof3;
+
+		const float4_t zero = float4_zero();
+		cof0 = float4_nmsub(t1, t23_yxwz, zero);
+		cof0 = float4_madd(t1, t23_wzyx, cof0);
+
+		cof1 = float4_nmsub(t0, t23_yxwz, zero);
+		cof1 = float4_madd(t0, t23_wzyx, cof1);
+		cof1 = float4_swiz_zwxy(cof1);
+		
+		const float4_t t12 = float4_mul(t1, t2);
+		const float4_t t12_yxwz = float4_swiz_yxwz(t12);
+		const float4_t t12_wzyx = float4_swiz_wzyx(t12);
+		
+		cof0 = float4_madd(t3, t12_yxwz, cof0);
+		cof0 = float4_nmsub(t3, t12_wzyx, cof0);
+
+		cof3 = float4_mul(t0, t12_yxwz);
+		cof3 = float4_nmsub(t0, t12_wzyx, cof3);
+		cof3 = float4_swiz_zwxy(cof3);
+
+		const float4_t t1_zwxy = float4_swiz_zwxy(t1);
+		const float4_t t2_zwxy = float4_swiz_zwxy(t2);
+
+		const float4_t t13 = float4_mul(t1_zwxy, t3);
+		const float4_t t13_yxwz = float4_swiz_yxwz(t13);
+		const float4_t t13_wzyx = float4_swiz_wzyx(t13);
+
+		cof0 = float4_madd(t2_zwxy, t13_yxwz, cof0);
+		cof0 = float4_nmsub(t2_zwxy, t13_wzyx, cof0);
+
+		cof2 = float4_mul(t0, t13_yxwz);
+		cof2 = float4_nmsub(t0, t13_wzyx, cof2);
+		cof2 = float4_swiz_zwxy(cof2);
+
+		const float4_t t01 = float4_mul(t0, t1);
+		const float4_t t01_yxwz = float4_swiz_yxwz(t01);
+		const float4_t t01_wzyx = float4_swiz_wzyx(t01);
+
+		cof2 = float4_nmsub(t3, t01_yxwz, cof2);
+		cof2 = float4_madd(t3, t01_wzyx, cof2);
+
+		cof3 = float4_madd(t2_zwxy, t01_yxwz, cof3);
+		cof3 = float4_nmsub(t2_zwxy, t01_wzyx, cof3);
+
+		const float4_t t03 = float4_mul(t0, t3);
+		const float4_t t03_yxwz = float4_swiz_yxwz(t03);
+		const float4_t t03_wzyx = float4_swiz_wzyx(t03);
+
+		cof1 = float4_nmsub(t2_zwxy, t03_yxwz, cof1);
+		cof1 = float4_madd(t2_zwxy, t03_wzyx, cof1);
+
+		cof2 = float4_madd(t1, t03_yxwz, cof2);
+		cof2 = float4_nmsub(t1, t03_wzyx, cof2);
+
+		const float4_t t02 = float4_mul(t0, t2_zwxy);
+		const float4_t t02_yxwz = float4_swiz_yxwz(t02);
+		const float4_t t02_wzyx = float4_swiz_wzyx(t02);
+
+		cof1 = float4_madd(t3, t02_yxwz, cof1);
+		cof1 = float4_nmsub(t3, t02_wzyx, cof1);
+
+		cof3 = float4_nmsub(t1, t02_yxwz, cof3);
+		cof3 = float4_madd(t1, t02_wzyx, cof3);
+
+		const float4_t det    = float4_dot(t0, cof0);
+		const float4_t invdet = float4_rcp(det);
+
+		float4x4_t result;
+		result.col[0] = float4_mul(cof0, invdet);
+		result.col[1] = float4_mul(cof1, invdet);
+		result.col[2] = float4_mul(cof2, invdet);
+		result.col[3] = float4_mul(cof3, invdet);
+
+		return result;
+	}
+
+} // namespace bx
+
+#endif // __BX_FLOAT4X4_H__
--- a/include/bx/foreach.h
+++ b/include/bx/foreach.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_FOREACH_H__
+#define __BX_FOREACH_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	namespace foreach_ns
+	{
+		struct ContainerBase
+		{
+		};
+
+		template <typename Ty>
+		class Container : public ContainerBase
+		{
+		public:
+			inline Container(const Ty& _container)
+				: m_container(_container)
+				, m_break(0)
+				, m_it( _container.begin() )
+				, m_itEnd( _container.end() )
+			{
+			}
+
+			inline bool condition() const
+			{
+				return (!m_break++ && m_it != m_itEnd);
+			}
+
+			const Ty& m_container;
+			mutable int m_break;
+			mutable typename Ty::const_iterator m_it;
+			mutable typename Ty::const_iterator m_itEnd;
+		};
+
+		template <typename Ty>
+		inline Ty* pointer(const Ty&)
+		{
+			return 0;
+		}
+
+		template <typename Ty>
+		inline Container<Ty> containerNew(const Ty& _container)
+		{
+			return Container<Ty>(_container);
+		}
+
+		template <typename Ty>
+		inline const Container<Ty>* container(const ContainerBase* _base, const Ty*)
+		{
+			return static_cast<const Container<Ty>*>(_base);
+		}
+	} // namespace foreach_ns
+
+#define foreach(_variable, _container) \
+	for (const bx::foreach_ns::ContainerBase &__temp_container__ = bx::foreach_ns::containerNew(_container); \
+			bx::foreach_ns::container(&__temp_container__, true ? 0 : bx::foreach_ns::pointer(_container))->condition(); \
+			++bx::foreach_ns::container(&__temp_container__, true ? 0 : bx::foreach_ns::pointer(_container))->m_it) \
+	for (_variable = *container(&__temp_container__, true ? 0 : bx::foreach_ns::pointer(_container))->m_it; \
+			bx::foreach_ns::container(&__temp_container__, true ? 0 : bx::foreach_ns::pointer(_container))->m_break; \
+			--bx::foreach_ns::container(&__temp_container__, true ? 0 : bx::foreach_ns::pointer(_container))->m_break)
+
+} // namespace bx
+
+#endif // __BX_FOREACH_H__
--- a/include/bx/handlealloc.h
+++ b/include/bx/handlealloc.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_HANDLE_ALLOC_H__
+#define __BX_HANDLE_ALLOC_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	class HandleAlloc
+	{
+	public:
+		static const uint16_t invalid = 0xffff;
+
+		HandleAlloc(uint16_t _maxHandles)
+			: m_dense(new uint16_t[_maxHandles*2])
+			, m_sparse(&m_dense[_maxHandles])
+			, m_numHandles(0)
+			, m_maxHandles(_maxHandles)
+		{
+			for (uint16_t ii = 0; ii < _maxHandles; ++ii)
+			{
+				m_dense[ii] = ii;
+			}
+		}
+
+		~HandleAlloc()
+		{
+			delete [] m_dense;
+		}
+
+		const uint16_t* getHandles() const
+		{
+			return m_dense;
+		}
+
+		uint16_t getNumHandles() const
+		{
+			return m_numHandles;
+		}
+
+		uint16_t getMaxHandles() const
+		{
+			return m_maxHandles;
+		}
+
+		uint16_t alloc()
+		{
+			if (m_numHandles < m_maxHandles)
+			{
+				uint16_t index = m_numHandles;
+				++m_numHandles;
+
+				uint16_t handle = m_dense[index];
+				m_sparse[handle] = index;
+				return handle;
+			}
+
+			return invalid;
+		}
+
+		void free(uint16_t _handle)
+		{
+			uint16_t index = m_sparse[_handle];
+			--m_numHandles;
+			uint16_t temp = m_dense[m_numHandles];
+			m_dense[m_numHandles] = _handle;
+			m_sparse[temp] = index;
+			m_dense[index] = temp;
+		}
+
+	private:
+		uint16_t* m_dense;
+		uint16_t* m_sparse;
+		uint16_t m_numHandles;
+		uint16_t m_maxHandles;
+	};
+} // namespace bx
+
+#endif // __HANDLE_ALLOC_H__
--- a/include/bx/hash.h
+++ b/include/bx/hash.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_HASH_H__
+#define __BX_HASH_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	// MurmurHash2 was written by Austin Appleby, and is placed in the public
+	// domain. The author hereby disclaims copyright to this source code.
+	#define MURMUR_M 0x5bd1e995
+	#define MURMUR_R 24
+
+	#define mmix(_h, _k) { _k *= MURMUR_M; _k ^= _k >> MURMUR_R; _k *= MURMUR_M; _h *= MURMUR_M; _h ^= _k; }
+
+	class HashMurmur2A
+	{
+	public:
+		void begin(uint32_t _seed = 0)
+		{
+			m_hash = _seed;
+			m_tail = 0;
+			m_count = 0;
+			m_size = 0;
+		}
+
+		void add(const void* _data, int _len)
+		{
+			const uint8_t* data = (uint8_t*)_data;
+			m_size += _len;
+
+			mixTail(data, _len);
+
+			while(_len >= 4)
+			{
+				uint32_t kk = *(uint32_t*)data;
+
+				mmix(m_hash, kk);
+
+				data += 4;
+				_len -= 4;
+			}
+
+			mixTail(data, _len);
+		}
+
+		uint32_t end()
+		{
+			mmix(m_hash, m_tail);
+			mmix(m_hash, m_size);
+
+			m_hash ^= m_hash >> 13;
+			m_hash *= MURMUR_M;
+			m_hash ^= m_hash >> 15;
+
+			return m_hash;
+		}
+
+	private:
+		void mixTail(const uint8_t*& _data, int& _len)
+		{
+			while( _len && ((_len<4) || m_count) )
+			{
+				m_tail |= (*_data++) << (m_count * 8);
+
+				m_count++;
+				_len--;
+
+				if(m_count == 4)
+				{
+					mmix(m_hash, m_tail);
+					m_tail = 0;
+					m_count = 0;
+				}
+			}
+		}
+
+		uint32_t m_hash;
+		uint32_t m_tail;
+		uint32_t m_count;
+		uint32_t m_size;
+	};
+
+} // namespace bx
+
+#endif // __BX_HASH_H__
--- a/include/bx/macros.h
+++ b/include/bx/macros.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_MACROS_H__
+#define __BX_MACROS_H__
+
+#include "bx.h"
+
+#define BX_VA_ARGS_COUNT_DETAIL(_a1, _a2, _a3, _a4, _a5, _a6, _a7, _a8, _a9, _a10, _a11, _a12, _a13, _a14, _a15, _a16, _last, ...) _last
+#define BX_VA_ARGS_COUNT(...) BX_VA_ARGS_COUNT_DETAIL(__VA_ARGS__, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define BX_MACRO_DISPATCHER_DETAIL1(_func, _argCount) _func ## _argCount
+#define BX_MACRO_DISPATCHER_DETAIL2(_func, _argCount) BX_MACRO_DISPATCHER_DETAIL1(_func, _argCount)
+#define BX_MACRO_DISPATCHER(_func, ...) BX_MACRO_DISPATCHER_DETAIL2(_func, VA_ARGS_COUNT(__VA_ARGS__) )
+
+#define BX_STRINGIZE(_x) BX_STRINGIZE_(_x)
+#define BX_STRINGIZE_(_x) #_x
+
+#define BX_FILE_LINE_LITERAL "" __FILE__ "(" BX_STRINGIZE(__LINE__) "): "
+
+#define BX_ALIGN_MASK(_value, _mask) ( ( (_value)+(_mask) ) & ( (~0)&(~(_mask) ) ) )
+#define BX_ALIGN_16(_value) BX_ALIGN_MASK(_value, 0xf)
+#define BX_ALIGN_256(_value) BX_ALIGN_MASK(_value, 0xff)
+
+#if BX_COMPILER_GCC || BX_COMPILER_CLANG
+#	define BX_ALIGN_STRUCT(_align, _struct) _struct __attribute__( (aligned(_align) ) )
+#	define BX_FUNCTION __PRETTY_FUNCTION__
+#	define BX_NO_INLINE __attribute__( (noinline) )
+#	define BX_FORCE_INLINE __extension__ static __inline __attribute__( (__always_inline__) )
+#	if BX_COMPILER_CLANG
+#		define BX_THREAD /* not supported right now */
+#	else
+#		define BX_THREAD __thread
+#	endif // BX_COMPILER_CLANG
+#elif BX_COMPILER_MSVC
+#	define BX_ALIGN_STRUCT(_align, _struct) __declspec(align(_align) ) _struct
+#	define BX_FUNCTION __FUNCTION__
+#	define BX_NO_INLINE __declspec(noinline)
+#	define BX_FORCE_INLINE __forceinline
+#	define BX_THREAD __declspec(thread)
+#else
+#	error "Unknown BX_COMPILER_?"
+#endif
+
+#define BX_ALIGN_STRUCT_16(_struct) BX_ALIGN_STRUCT(16, _struct)
+#define BX_ALIGN_STRUCT_256(_struct) BX_ALIGN_STRUCT(256, _struct)
+
+#ifndef BX_CHECK
+#	define BX_CHECK(...) do {} while(0)
+#endif // BX_CHECK
+
+#ifndef BX_TRACE
+#	define BX_TRACE(...) do {} while(0)
+#endif // BX_TRACE
+
+#ifndef  BX_CONFIG_SPSCQUEUE_USE_NAIVE
+#	define BX_CONFIG_SPSCQUEUE_USE_NAIVE 0
+#endif // BX_CONFIG_SPSCQUEUE_USE_NAIVE
+
+#endif // __BX_MACROS_H__
--- a/include/bx/maputil.h
+++ b/include/bx/maputil.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_MAPUTIL_H__
+#define __BX_MAPUTIL_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	template<typename MapType>
+	typename MapType::iterator mapInsertOrUpdate(MapType& _map, const typename MapType::key_type& _key, const typename MapType::mapped_type& _value)
+	{
+		typename MapType::iterator it = _map.lower_bound(_key);
+		if (it != _map.end()
+		&&  !_map.key_comp()(_key, it->first) )
+		{
+			it->second = _value;
+			return it;
+		}
+
+		typename MapType::value_type pair(_key, _value);
+		return _map.insert(it, pair);
+	}
+} // namespace bx
+
+#endif // __BX_MAPUTIL_H__
--- a/include/bx/mutex.h
+++ b/include/bx/mutex.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_MUTEX_H__
+#define __BX_MUTEX_H__
+
+#include "bx.h"
+#include "cpu.h"
+#include "sem.h"
+
+#if BX_PLATFORM_NACL || BX_PLATFORM_LINUX || BX_PLATFORM_ANDROID
+#	include <pthread.h>
+#elif BX_PLATFORM_WINDOWS || BX_PLATFORM_XBOX360
+#	include <errno.h>
+#endif // BX_PLATFORM_
+
+namespace bx
+{
+#if BX_PLATFORM_WINDOWS || BX_PLATFORM_XBOX360
+	typedef CRITICAL_SECTION pthread_mutex_t;
+	typedef unsigned pthread_mutexattr_t;
+
+	inline int pthread_mutex_lock(pthread_mutex_t* _mutex)
+	{
+		EnterCriticalSection(_mutex);
+		return 0;
+	}
+
+	inline int pthread_mutex_unlock(pthread_mutex_t* _mutex)
+	{
+		LeaveCriticalSection(_mutex);
+		return 0;
+	}
+
+	inline int pthread_mutex_trylock(pthread_mutex_t* _mutex)
+	{
+		return TryEnterCriticalSection(_mutex) ? 0 : EBUSY;
+	}
+
+	inline int pthread_mutex_init(pthread_mutex_t* _mutex, pthread_mutexattr_t* /*_attr*/)
+	{
+		InitializeCriticalSection(_mutex);
+		return 0;
+	}
+
+	inline int pthread_mutex_destroy(pthread_mutex_t* _mutex)
+	{
+		DeleteCriticalSection(_mutex);
+		return 0;
+	}
+#endif // BX_PLATFORM_
+
+	class Mutex
+	{
+	public:
+		Mutex()
+		{
+			pthread_mutex_init(&m_handle, NULL);
+		}
+
+		~Mutex()
+		{
+			pthread_mutex_destroy(&m_handle);
+		}
+
+		void lock()
+		{
+			pthread_mutex_lock(&m_handle);
+		}
+
+		void unlock()
+		{
+			pthread_mutex_unlock(&m_handle);
+		}
+
+	private:
+		Mutex(const Mutex& _rhs); // no copy constructor
+		Mutex& operator=(const Mutex& _rhs); // no assignment operator
+
+		pthread_mutex_t m_handle;
+	};
+
+	class MutexScope
+	{
+	public:
+		MutexScope(Mutex& _mutex)
+			: m_mutex(_mutex)
+		{
+			m_mutex.lock();
+		}
+
+		~MutexScope()
+		{
+			m_mutex.unlock();
+		}
+
+	private:
+		MutexScope(); // no default constructor
+		MutexScope(const MutexScope& _rhs); // no copy constructor
+		MutexScope& operator=(const MutexScope& _rhs); // no assignment operator
+
+		Mutex& m_mutex;
+	};
+
+#if 1
+	typedef Mutex LwMutex;
+#else
+	class LwMutex
+	{
+	public:
+		LwMutex()
+			: m_count(0)
+		{
+		}
+
+		~LwMutex()
+		{
+		}
+
+		void lock()
+		{
+			if (atomicIncr(&m_count) > 1)
+			{
+				m_sem.wait();
+			}
+		}
+
+		void unlock()
+		{
+			if (atomicDecr(&m_count) > 0)
+			{
+				m_sem.post();
+			}
+		}
+
+	private:
+		LwMutex(const LwMutex& _rhs); // no copy constructor
+		LwMutex& operator=(const LwMutex& _rhs); // no assignment operator
+
+		Semaphore m_sem;
+		volatile int32_t m_count;
+	};
+#endif // 0
+
+	class LwMutexScope
+	{
+	public:
+		LwMutexScope(LwMutex& _mutex)
+			: m_mutex(_mutex)
+		{
+			m_mutex.lock();
+		}
+
+		~LwMutexScope()
+		{
+			m_mutex.unlock();
+		}
+
+	private:
+		LwMutexScope(); // no default constructor
+		LwMutexScope(const LwMutexScope& _rhs); // no copy constructor
+		LwMutexScope& operator=(const LwMutexScope& _rhs); // no assignment operator
+
+		LwMutex& m_mutex;
+	};
+
+} // namespace bx
+
+#endif // __BX_MUTEX_H__
--- a/include/bx/os.h
+++ b/include/bx/os.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_OS_H__
+#define __BX_OS_H__
+
+#include "bx.h"
+
+#if BX_PLATFORM_NACL || BX_PLATFORM_ANDROID || BX_PLATFORM_LINUX
+#	include <sched.h> // sched_yield
+#	if BX_PLATFORM_NACL
+#		include <sys/nacl_syscalls.h> // nanosleep
+#	else
+#		include <time.h> // nanosleep
+#	endif // BX_PLATFORM_NACL
+#endif // BX_PLATFORM_
+
+namespace bx
+{
+	inline void sleep(uint32_t _ms)
+	{
+#if BX_PLATFORM_WINDOWS || BX_PLATFORM_XBOX360
+		Sleep(_ms);
+#else
+		timespec req = {_ms/1000, (_ms%1000)*1000000};
+		timespec rem = {0, 0};
+		nanosleep(&req, &rem);
+#endif // BX_PLATFORM_
+	}
+
+	inline void yield()
+	{
+#if BX_PLATFORM_WINDOWS
+		SwitchToThread();
+#elif BX_PLATFORM_XBOX360
+		Sleep(0);
+#else
+		sched_yield();
+#endif // BX_PLATFORM_
+	}
+
+} // namespace bx
+
+#endif // __BX_OS_H__
--- a/include/bx/platform.h
+++ b/include/bx/platform.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_PLATFORM_H__
+#define __BX_PLATFORM_H__
+
+#define BX_COMPILER_CLANG 0
+#define BX_COMPILER_GCC 0
+#define BX_COMPILER_MSVC 0
+
+#define BX_PLATFORM_ANDROID 0
+#define BX_PLATFORM_LINUX 0
+#define BX_PLATFORM_NACL 0
+#define BX_PLATFORM_WINDOWS 0
+#define BX_PLATFORM_XBOX360 0
+
+#define BX_CPU_ARM 0
+#define BX_CPU_PPC 0
+#define BX_CPU_X86 0
+
+#define BX_CPU_ENDIAN_BIG 0
+#define BX_CPU_ENDIAN_LITTLE 0
+
+// http://sourceforge.net/apps/mediawiki/predef/index.php?title=Compilers
+#if defined(_MSC_VER)
+#	undef BX_COMPILER_MSVC
+#	define BX_COMPILER_MSVC 1
+#elif defined(__clang__)
+// clang defines __GNUC__
+#	undef BX_COMPILER_CLANG
+#	define BX_COMPILER_CLANG 1
+#elif defined(__GNUC__)
+#	undef BX_COMPILER_GCC
+#	define BX_COMPILER_GCC 1
+#else
+#	error "BX_COMPILER_* is not defined!"
+#endif //
+
+// http://sourceforge.net/apps/mediawiki/predef/index.php?title=Operating_Systems
+#if defined(_XBOX_VER)
+#	undef BX_PLATFORM_XBOX360
+#	define BX_PLATFORM_XBOX360 1
+#elif defined(_WIN32) || defined(_WIN64)
+#	undef BX_PLATFORM_WINDOWS
+#	define BX_PLATFORM_WINDOWS 1
+#elif defined(__native_client__)
+// NaCl compiler defines __linux__
+#	undef BX_PLATFORM_NACL
+#	define BX_PLATFORM_NACL 1
+#elif defined(__ANDROID__)
+// Android compiler defines __linux__
+#	undef BX_PLATFORM_ANDROID
+#	define BX_PLATFORM_ANDROID 1
+#elif defined(__linux__)
+#	undef BX_PLATFORM_LINUX
+#	define BX_PLATFORM_LINUX 1
+#else
+#	error "BX_PLATFORM_* is not defined!"
+#endif //
+
+// http://sourceforge.net/apps/mediawiki/predef/index.php?title=Architectures
+#if defined(__arm__)
+#	undef BX_CPU_ARM
+#	define BX_CPU_ARM 1
+#	define BX_CACHE_LINE_SIZE 64
+#elif defined(_M_PPC) || defined(__powerpc__) || defined(__powerpc64__)
+#	undef BX_CPU_PPC
+#	define BX_CPU_PPC 1
+#	define BX_CACHE_LINE_SIZE 128
+#elif defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__)
+#	undef BX_CPU_X86
+#	define BX_CPU_X86 1
+#	define BX_CACHE_LINE_SIZE 64
+#endif // 
+
+#if BX_CPU_PPC
+#	undef BX_CPU_ENDIAN_BIG
+#	define BX_CPU_ENDIAN_BIG 1
+#else
+#	undef BX_CPU_ENDIAN_LITTLE
+#	define BX_CPU_ENDIAN_LITTLE 1
+#endif // BX_PLATFORM_
+
+#endif // __BX_PLATFORM_H__
--- a/include/bx/ringbuffer.h
+++ b/include/bx/ringbuffer.h
@@ -0,0 +1,313 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_RINGBUFFER_H__
+#define __BX_RINGBUFFER_H__
+
+#include "bx.h"
+#include "cpu.h"
+#include "uint32_t.h"
+
+namespace bx
+{
+	class RingBufferControl
+	{
+	public:
+		RingBufferControl(uint32_t _size)
+			: m_size(_size)
+			, m_current(0)
+			, m_write(0)
+			, m_read(0)
+		{
+		}
+
+		~RingBufferControl()
+		{
+		}
+
+		uint32_t available() const
+		{
+			return distance(m_read, m_current);
+		}
+
+		uint32_t consume(uint32_t _size) // consumer only
+		{
+			const uint32_t maxSize    = distance(m_read, m_current);
+			const uint32_t sizeNoSign = uint32_and(_size, 0x7FFFFFFF);
+			const uint32_t test       = uint32_sub(sizeNoSign, maxSize);
+			const uint32_t size       = uint32_sels(test, _size, maxSize);
+			const uint32_t advance    = uint32_add(m_read, size);
+			const uint32_t read       = uint32_mod(advance, m_size);
+			m_read = read;
+			return size;
+		}
+
+		uint32_t reserve(uint32_t _size) // producer only
+		{
+			const uint32_t dist       = distance(m_write, m_read)-1;
+			const uint32_t maxSize    = uint32_sels(dist, m_size-1, dist);
+			const uint32_t sizeNoSign = uint32_and(_size, 0x7FFFFFFF);
+			const uint32_t test       = uint32_sub(sizeNoSign, maxSize);
+			const uint32_t size       = uint32_sels(test, _size, maxSize);
+			const uint32_t advance    = uint32_add(m_write, size);
+			const uint32_t write      = uint32_mod(advance, m_size);
+			m_write = write;
+			return size;
+		}
+
+		uint32_t commit(uint32_t _size) // producer only
+		{
+			const uint32_t maxSize    = distance(m_current, m_write);
+			const uint32_t sizeNoSign = uint32_and(_size, 0x7FFFFFFF);
+			const uint32_t test       = uint32_sub(sizeNoSign, maxSize);
+			const uint32_t size       = uint32_sels(test, _size, maxSize);
+			const uint32_t advance    = uint32_add(m_current, size);
+			const uint32_t current    = uint32_mod(advance, m_size);
+			m_current = current;
+			return size;
+		}
+
+		uint32_t distance(uint32_t _from, uint32_t _to) const // both
+		{
+			const uint32_t diff   = uint32_sub(_to, _from);
+			const uint32_t le     = uint32_add(m_size, diff);
+			const uint32_t result = uint32_sels(diff, le, diff);
+
+			return result;
+		}
+
+		const uint32_t m_size;
+		uint32_t m_current;
+		uint32_t m_write;
+		uint32_t m_read;
+	};
+
+	class SpScRingBufferControl
+	{
+	public:
+		SpScRingBufferControl(uint32_t _size)
+			: m_size(_size)
+			, m_current(0)
+			, m_write(0)
+			, m_read(0)
+		{
+		}
+
+		~SpScRingBufferControl()
+		{
+		}
+
+		uint32_t available() const
+		{
+			return distance(m_read, m_current);
+		}
+
+		uint32_t consume(uint32_t _size) // consumer only
+		{
+			const uint32_t maxSize    = distance(m_read, m_current);
+			const uint32_t sizeNoSign = uint32_and(_size, 0x7FFFFFFF);
+			const uint32_t test       = uint32_sub(sizeNoSign, maxSize);
+			const uint32_t size       = uint32_sels(test, _size, maxSize);
+			const uint32_t advance    = uint32_add(m_read, size);
+			const uint32_t read       = uint32_mod(advance, m_size);
+			m_read = read;
+			return size;
+		}
+
+		uint32_t reserve(uint32_t _size) // producer only
+		{
+			const uint32_t dist       = distance(m_write, m_read)-1;
+			const uint32_t maxSize    = uint32_sels(dist, m_size-1, dist);
+			const uint32_t sizeNoSign = uint32_and(_size, 0x7FFFFFFF);
+			const uint32_t test       = uint32_sub(sizeNoSign, maxSize);
+			const uint32_t size       = uint32_sels(test, _size, maxSize);
+			const uint32_t advance    = uint32_add(m_write, size);
+			const uint32_t write      = uint32_mod(advance, m_size);
+			m_write = write;
+			return size;
+		}
+
+		uint32_t commit(uint32_t _size) // producer only
+		{
+			const uint32_t maxSize    = distance(m_current, m_write);
+			const uint32_t sizeNoSign = uint32_and(_size, 0x7FFFFFFF);
+			const uint32_t test       = uint32_sub(sizeNoSign, maxSize);
+			const uint32_t size       = uint32_sels(test, _size, maxSize);
+			const uint32_t advance    = uint32_add(m_current, size);
+			const uint32_t current    = uint32_mod(advance, m_size);
+
+			// must commit all memory writes before moving m_current pointer
+			// once m_current pointer moves data is used by consumer thread
+			memoryBarrier();
+			m_current = current;
+			return size;
+		}
+
+		uint32_t distance(uint32_t _from, uint32_t _to) const // both
+		{
+			const uint32_t diff   = uint32_sub(_to, _from);
+			const uint32_t le     = uint32_add(m_size, diff);
+			const uint32_t result = uint32_sels(diff, le, diff);
+
+			return result;
+		}
+
+		const uint32_t m_size;
+		uint32_t m_current;
+		uint32_t m_write;
+		uint32_t m_read;
+	};
+
+	template <typename Control>
+	class ReadRingBufferT
+	{
+	public:
+		ReadRingBufferT(Control& _control, const char* _buffer, uint32_t _size)
+			: m_control(_control)
+			, m_read(_control.m_read)
+			, m_end(m_read+_size)
+			, m_size(_size)
+			, m_buffer(_buffer)
+		{
+			BX_CHECK(_control.available() >= _size, "%d >= %d", _control.available(), _size);
+		}
+
+		~ReadRingBufferT()
+		{
+		}
+
+		void end()
+		{
+			m_control.consume(m_size);
+		}
+
+		void read(char* _data, uint32_t _len)
+		{
+			const uint32_t end = (m_read + _len) % m_control.m_size;
+			uint32_t wrap = 0;
+			const char* from = &m_buffer[m_read];
+
+			if (end < m_read)
+			{
+				wrap = m_control.m_size - m_read;
+				memcpy(_data, from, wrap);
+				_data += wrap;
+				from = (const char*)&m_buffer[0];
+			}
+
+			memcpy(_data, from, _len-wrap);
+
+			m_read = end;
+		}
+
+		void skip(uint32_t _len)
+		{
+			m_read += _len;
+			m_read %= m_control.m_size;
+		}
+
+	private:
+		template <typename Ty>
+		friend class WriteRingBufferT;
+
+		ReadRingBufferT();
+		ReadRingBufferT(const Control&);
+		void operator=(const Control&);
+
+		Control& m_control;
+		uint32_t m_read;
+		uint32_t m_end;
+		const uint32_t m_size;
+		const char* m_buffer;
+	};
+
+	typedef ReadRingBufferT<RingBufferControl> ReadRingBuffer;
+	typedef ReadRingBufferT<SpScRingBufferControl> SpScReadRingBuffer;
+
+	template <typename Control>
+	class WriteRingBufferT
+	{
+	public:
+		WriteRingBufferT(Control& _control, char* _buffer, uint32_t _size)
+			: m_control(_control)
+			, m_size(_size)
+			, m_buffer(_buffer)
+		{
+			uint32_t size = m_control.reserve(_size);
+			BX_CHECK(size == _size, "%d == %d", size, _size);
+			m_write = m_control.m_current;
+			m_end = m_write+_size;
+		}
+
+		~WriteRingBufferT()
+		{
+		}
+
+		void end()
+		{
+			m_control.commit(m_size);
+		}
+
+		void write(const char* _data, uint32_t _len)
+		{
+			const uint32_t end = (m_write + _len) % m_control.m_size;
+			uint32_t wrap = 0;
+			char* to = &m_buffer[m_write];
+
+			if (end < m_write)
+			{
+				wrap = m_control.m_size - m_write;
+				memcpy(to, _data, wrap);
+				_data += wrap;
+				to = (char*)&m_buffer[0];
+			}
+
+			memcpy(to, _data, _len-wrap);
+
+			m_write = end;
+		}
+
+		void write(ReadRingBufferT<Control>& _read, uint32_t _len)
+		{
+			const uint32_t end = (_read.m_read + _len) % _read.m_control.m_size;
+			uint32_t wrap = 0;
+			const char* from = &_read.m_buffer[_read.m_read];
+
+			if (end < _read.m_read)
+			{
+				wrap = _read.m_control.m_size - _read.m_read;
+				write(from, wrap);
+				from = (const char*)&_read.m_buffer[0];
+			}
+
+			write(from, _len-wrap);
+
+			_read.m_read = end;
+		}
+
+		void skip(uint32_t _len)
+		{
+			m_write += _len;
+			m_write %= m_control.m_size;
+		}
+
+	private:
+		WriteRingBufferT();
+		WriteRingBufferT(const WriteRingBufferT<Control>&);
+		void operator=(const WriteRingBufferT<Control>&);
+
+		Control& m_control;
+		uint32_t m_write;
+		uint32_t m_end;
+		const uint32_t m_size;
+		char* m_buffer;
+	};
+
+	typedef WriteRingBufferT<RingBufferControl> WriteRingBuffer;
+	typedef WriteRingBufferT<SpScRingBufferControl> SpScWriteRingBuffer;
+
+} // namespace bx
+
+#endif // __BX_RINGBUFFER_H__
--- a/include/bx/rng.h
+++ b/include/bx/rng.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_RNG_H__
+#define __BX_RNG_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	// George Marsaglia's MWC
+	class RngMwc
+	{
+		public:
+			RngMwc(uint32_t _z = 12345, uint32_t _w = 65435)
+			: m_z(_z)
+			  , m_w(_w)
+		{
+		}
+
+		void reset(uint32_t _z = 12345, uint32_t _w = 65435)
+		{
+			m_z = _z;
+			m_w = _w;
+		}
+
+		uint32_t gen()
+		{
+			m_z = 36969*(m_z&65535)+(m_z>>16);
+			m_w = 18000*(m_w&65535)+(m_w>>16);
+			return (m_z<<16)+m_w;
+		}
+
+	private:
+		uint32_t m_z;
+		uint32_t m_w;
+	};
+
+	// George Marsaglia's FIB
+	class RngFib
+	{
+	public:
+		RngFib()
+			: m_a(9983651)
+			  , m_b(95746118)
+		{
+		}
+
+		void reset()
+		{
+			m_a = 9983651;
+			m_b = 95746118;
+		}
+
+		uint32_t gen()
+		{
+			m_b = m_a+m_b;
+			m_a = m_b-m_a;
+			return m_a;
+		}
+
+	private:
+		uint32_t m_a;
+		uint32_t m_b;
+	};
+
+	// George Marsaglia's SHR3
+	class RngShr3
+	{
+	public:
+		RngShr3(uint32_t _jsr = 34221)
+			: m_jsr(_jsr)
+		{
+		}
+
+		void reset(uint32_t _jsr = 34221)
+		{
+			m_jsr = _jsr;
+		}
+
+		uint32_t gen()
+		{
+			m_jsr ^= m_jsr<<17;
+			m_jsr ^= m_jsr>>13;
+			m_jsr ^= m_jsr<<5;
+			return m_jsr;
+		}
+
+	private:
+		uint32_t m_jsr;
+	};
+
+} // namespace bx
+
+#endif // __BX_RNG_H__
--- a/include/bx/sem.h
+++ b/include/bx/sem.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_SEM_H__
+#define __BX_SEM_H__
+
+#include "bx.h"
+
+#define BX_SEM_CONFIG_POSIX (BX_PLATFORM_NACL || BX_PLATFORM_ANDROID || BX_PLATFORM_LINUX)
+
+#if BX_SEM_CONFIG_POSIX
+#	include <semaphore.h>
+#	include <time.h>
+#elif BX_PLATFORM_WINDOWS || BX_PLATFORM_XBOX360
+#	include <limits.h>
+#endif // BX_PLATFORM_
+
+namespace bx
+{
+#if BX_SEM_CONFIG_POSIX
+	class Semaphore
+	{
+	public:
+		Semaphore()
+		{
+			sem_init(&m_handle, 0, 0);
+		}
+
+		~Semaphore()
+		{
+			sem_destroy(&m_handle);
+		}
+
+		void post(uint32_t _count = 1)
+		{
+			for (uint32_t ii = 0; ii < _count; ++ii)
+			{
+				sem_post(&m_handle);
+			}
+		}
+
+		bool wait(int32_t _msecs = -1)
+		{
+#if BX_PLATFORM_NACL
+			BX_CHECK(-1 == _msecs, "NaCl doesn't support sem_timedwait at this moment.");
+			return 0 == sem_wait(&m_handle);
+#else
+			if (0 > _msecs)
+			{
+				return 0 == sem_wait(&m_handle);
+			}
+
+			timespec ts;
+			ts.tv_sec = _msecs/1000;
+			ts.tv_nsec = (_msecs%1000)*1000;
+			return 0 == sem_timedwait(&m_handle, &ts);
+#endif // BX_PLATFORM_
+		}
+
+	private:
+		Semaphore(const Semaphore& _rhs); // no copy constructor
+		Semaphore& operator=(const Semaphore& _rhs); // no assignment operator
+
+		sem_t m_handle;
+	};
+
+#elif BX_PLATFORM_WINDOWS || BX_PLATFORM_XBOX360
+
+	class Semaphore
+	{
+	public:
+		Semaphore()
+		{
+			m_handle = CreateSemaphore(NULL, 0, LONG_MAX, NULL);
+			BX_CHECK(NULL != m_handle, "Failed to create Semaphore!");
+		}
+
+		~Semaphore()
+		{
+			CloseHandle(m_handle);
+		}
+
+		void post(uint32_t _count = 1) const
+		{
+			ReleaseSemaphore(m_handle, _count, NULL);
+		}
+
+		bool wait(int32_t _msecs = -1) const
+		{
+			DWORD milliseconds = (0 > _msecs) ? INFINITE : _msecs;
+			return WAIT_OBJECT_0 == WaitForSingleObject(m_handle, milliseconds);
+		}
+
+	private:
+		Semaphore(const Semaphore& _rhs); // no copy constructor
+		Semaphore& operator=(const Semaphore& _rhs); // no assignment operator
+
+		HANDLE m_handle;
+	};
+
+#endif // BX_PLATFORM_
+
+} // namespace bx
+
+#endif // __BX_SEM_H__
--- a/include/bx/spscqueue.h
+++ b/include/bx/spscqueue.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_SPSCQUEUE_H__
+#define __BX_SPSCQUEUE_H__
+
+#include <list>
+
+#include "bx.h"
+#include "cpu.h"
+#include "mutex.h"
+#include "uint32_t.h"
+
+namespace bx
+{
+	// http://drdobbs.com/article/print?articleId=210604448&siteSectionName=
+	template <typename Ty>
+	class SpScUnboundedQueueOptimized
+	{
+	public:
+		SpScUnboundedQueueOptimized()
+			: m_first(new Node(NULL) )
+			, m_divider(m_first)
+			, m_last(m_first)
+		{
+		}
+
+		~SpScUnboundedQueueOptimized()
+		{
+			while (NULL != m_first)
+			{
+				Node* node = m_first;
+				m_first = node->m_next;
+				delete node;
+			}
+		}
+
+		void push(Ty* _ptr) // producer only
+		{
+			m_last->m_next = new Node(_ptr);
+			atomicExchangePtr((void**)&m_last, m_last->m_next);
+			while (m_first != m_divider)
+			{
+				Node* node = m_first;
+				m_first = m_first->m_next;
+				delete node;
+			}
+		}
+
+		Ty* peek() // consumer only
+		{
+			if (m_divider != m_last)
+			{
+				Ty* ptr = m_divider->m_next->m_ptr;
+				return ptr;
+			}
+
+			return NULL;
+		}
+
+		Ty* pop() // consumer only
+		{
+			if (m_divider != m_last)
+			{
+				Ty* ptr = m_divider->m_next->m_ptr;
+				atomicExchangePtr((void**)&m_divider, m_divider->m_next);
+				return ptr;
+			}
+
+			return NULL;
+		}
+
+	private:
+		SpScUnboundedQueueOptimized(const SpScUnboundedQueueOptimized& _rhs); // no copy constructor
+		SpScUnboundedQueueOptimized& operator=(const SpScUnboundedQueueOptimized& _rhs); // no assignment operator
+
+		struct Node
+		{
+			Node(Ty* _ptr)
+				: m_ptr(_ptr)
+				, m_next(NULL)
+			{
+			}
+
+			Ty* m_ptr;
+			Node* m_next;
+		};
+
+		Node* m_first;
+		Node* m_divider;
+		Node* m_last;
+	};
+
+	template<typename Ty>
+	class SpScUnboundedQueueNaive
+	{
+	public:
+		SpScUnboundedQueueNaive()
+		{
+		}
+
+		~SpScUnboundedQueueNaive()
+		{
+			BX_CHECK(m_queue.empty(), "Queue is not empty!");
+		}
+
+		void push(Ty* _item)
+		{
+			bx::LwMutexScope lock(m_mutex);
+			m_queue.push_back(_item);
+		}
+
+		Ty* peek()
+		{
+			bx::LwMutexScope lock(m_mutex);
+			if (!m_queue.empty() )
+			{
+				return m_queue.front();
+			}
+
+			return NULL;
+		}
+
+		Ty* pop()
+		{
+			bx::LwMutexScope lock(m_mutex);
+			if (!m_queue.empty() )
+			{
+				Ty* item = m_queue.front();
+				m_queue.pop_front();
+				return item;
+			}
+
+			return NULL;
+		}
+
+	private:
+		bx::LwMutex m_mutex;
+		std::list<Ty*> m_queue;
+	};
+
+#if BX_CONFIG_SPSCQUEUE_USE_NAIVE
+#	define SpScUnboundedQueue SpScUnboundedQueueNaive
+#else
+#	define SpScUnboundedQueue SpScUnboundedQueueOptimized
+#endif // BX_CONFIG_NAIVE
+
+} // namespace bx
+
+#endif // __BX_RINGBUFFER_H__
--- a/include/bx/timer.h
+++ b/include/bx/timer.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2010-2011 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_TIMER_H__
+#define __BX_TIMER_H__
+
+#include "bx.h"
+
+#if BX_PLATFORM_ANDROID
+#	include <time.h> // clock, clock_gettime
+#elif BX_PLATFORM_NACL | BX_PLATFORM_LINUX
+#	include <sys/time.h> // gettimeofday
+#endif // BX_PLATFORM_
+
+namespace bx
+{
+	inline int64_t getHPCounter()
+	{
+#if BX_PLATFORM_WINDOWS || BX_PLATFORM_XBOX360
+		LARGE_INTEGER li;
+		// Performance counter value may unexpectedly leap forward
+		// http://support.microsoft.com/kb/274323
+		QueryPerformanceCounter(&li);
+		int64_t i64 = li.QuadPart;
+#elif BX_PLATFORM_ANDROID
+		int64_t i64 = clock();
+#else
+		struct timeval now;
+		gettimeofday(&now, 0);
+		int64_t i64 = now.tv_sec*1000000 + now.tv_usec;
+#endif // BNET_PLATFORM_
+		static int64_t offset = i64;
+		return i64 - offset;
+	}
+
+	inline int64_t getHPFrequency()
+	{
+#if BX_PLATFORM_WINDOWS || BX_PLATFORM_XBOX360
+		LARGE_INTEGER li;
+		QueryPerformanceFrequency(&li);
+		return li.QuadPart;
+#elif BX_PLATFORM_ANDROID
+		return CLOCKS_PER_SEC;
+#else
+		return 1000000;
+#endif // BNET_PLATFORM_
+	}
+
+} // namespace bx
+
+#endif // __BX_TIMER_H__
--- a/include/bx/uint32_t.h
+++ b/include/bx/uint32_t.h
@@ -0,0 +1,454 @@
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+// Copyright 2006 Mike Acton <macton@gmail.com>
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE
+
+#ifndef __BX_UINT32_T_H__
+#define __BX_UINT32_T_H__
+
+#include "bx.h"
+
+#if BX_COMPILER_MSVC
+#	if BX_PLATFORM_WINDOWS
+#		include <math.h> // math.h is included because VS bitches:
+						 // warning C4985: 'ceil': attributes not present on previous declaration.
+						 // must be included before intrin.h.
+#		include <intrin.h>
+#		pragma intrinsic(_BitScanForward)
+#		pragma intrinsic(_BitScanReverse)
+#	endif // BX_PLATFORM_WINDOWS
+#endif // BX_COMPILER_MSVC
+
+namespace bx
+{
+	inline uint32_t uint32_li(uint32_t _a)
+	{
+		return _a;
+	}
+
+	inline uint32_t uint32_dec(uint32_t _a)
+	{
+		return _a - 1;
+	}
+
+	inline uint32_t uint32_inc(uint32_t _a)
+	{
+		return _a + 1;
+	}
+
+	inline uint32_t uint32_not(uint32_t _a)
+	{
+		return ~_a;
+	}
+
+	inline uint32_t uint32_neg(uint32_t _a)
+	{
+		return -(int32_t)_a;
+	}
+
+	inline uint32_t uint32_ext(uint32_t _a)
+	{
+		return ( (int32_t)_a)>>31;
+	}
+
+	inline uint32_t uint32_and(uint32_t _a, uint32_t _b)
+	{
+		return _a & _b;
+	}
+
+	inline uint32_t uint32_xor(uint32_t _a, uint32_t _b)
+	{
+		return _a ^ _b;
+	}
+
+	inline uint32_t uint32_xorl(uint32_t _a, uint32_t _b)
+	{
+		return !_a != !_b;
+	}
+
+	inline uint32_t uint32_andc(uint32_t _a, uint32_t _b)
+	{
+		return _a & ~_b;
+	}
+
+	inline uint32_t uint32_or(uint32_t _a, uint32_t _b)
+	{
+		return _a | _b;
+	}
+
+	inline uint32_t uint32_sll(uint32_t _a, int _sa)
+	{
+		return _a << _sa;
+	}
+
+	inline uint32_t uint32_srl(uint32_t _a, int _sa)
+	{
+		return _a >> _sa;
+	}
+
+	inline uint32_t uint32_sra(uint32_t _a, int _sa)
+	{
+		return ( (int32_t)_a) >> _sa;
+	}
+
+	inline uint32_t uint32_rol(uint32_t _a, int _sa)
+	{
+		return ( _a << _sa) | (_a >> (32-_sa) );
+	}
+
+	inline uint32_t uint32_ror(uint32_t _a, int _sa)
+	{
+		return ( _a >> _sa) | (_a << (32-_sa) );
+	}
+
+	inline uint32_t uint32_add(uint32_t _a, uint32_t _b)
+	{
+		return _a + _b;
+	}
+
+	inline uint32_t uint32_sub(uint32_t _a, uint32_t _b)
+	{
+		return _a - _b;
+	}
+
+	inline uint32_t uint32_mul(uint32_t _a, uint32_t _b)
+	{
+		return _a * _b;
+	}
+
+	inline uint32_t uint32_div(uint32_t _a, uint32_t _b)
+	{
+		return (_a / _b);
+	}
+
+	inline uint32_t uint32_mod(uint32_t _a, uint32_t _b)
+	{
+		return (_a % _b);
+	}
+
+	inline uint32_t uint32_cmpeq(uint32_t _a, uint32_t _b)
+	{
+		return -(_a == _b);
+	}
+
+	inline uint32_t uint32_cmpneq(uint32_t _a, uint32_t _b)
+	{
+		return -(_a != _b);
+	}
+
+	inline uint32_t uint32_cmplt(uint32_t _a, uint32_t _b)
+	{
+		return -(_a < _b);
+	}
+
+	inline uint32_t uint32_cmple(uint32_t _a, uint32_t _b)
+	{
+		return -(_a <= _b);
+	}
+
+	inline uint32_t uint32_cmpgt(uint32_t _a, uint32_t _b)
+	{
+		return -(_a > _b);
+	}
+
+	inline uint32_t uint32_cmpge(uint32_t _a, uint32_t _b)
+	{
+		return -(_a >= _b);
+	}
+
+	inline uint32_t uint32_setnz(uint32_t _a)
+	{
+		return -!!_a;
+	}
+
+	inline uint32_t uint32_satadd(uint32_t _a, uint32_t _b)
+	{
+		const uint32_t add    = uint32_add(_a, _b);
+		const uint32_t lt     = uint32_cmplt(add, _a);
+		const uint32_t result = uint32_or(add, lt);
+
+		return result;
+	}
+
+	inline uint32_t uint32_satsub(uint32_t _a, uint32_t _b)
+	{
+		const uint32_t sub    = uint32_sub(_a, _b);
+		const uint32_t le     = uint32_cmple(sub, _a);
+		const uint32_t result = uint32_and(sub, le);
+
+		return result;
+	}
+
+	inline uint32_t uint32_satmul(uint32_t _a, uint32_t _b)
+	{
+		const uint64_t mul    = (uint64_t)_a * (uint64_t)_b;
+		const uint32_t hi     = mul >> 32;
+		const uint32_t nz     = uint32_setnz(hi);
+		const uint32_t result = uint32_or(uint32_t(mul), nz);
+
+		return result;
+	}
+
+	inline uint32_t uint32_sels(uint32_t test, uint32_t _a, uint32_t _b)
+	{
+		const uint32_t mask   = uint32_ext(test);
+		const uint32_t sel_a  = uint32_and(_a, mask);
+		const uint32_t sel_b  = uint32_andc(_b, mask);
+		const uint32_t result = uint32_or(sel_a, sel_b);
+
+		return (result);
+	}
+
+	inline uint32_t uint32_selb(uint32_t _mask, uint32_t _a, uint32_t _b)
+	{
+		const uint32_t sel_a  = uint32_and(_a, _mask);
+		const uint32_t sel_b  = uint32_andc(_b, _mask);
+		const uint32_t result = uint32_or(sel_a, sel_b);
+
+		return (result);
+	}
+
+	inline uint32_t uint32_imin(uint32_t _a, uint32_t _b)
+	{
+		const uint32_t a_sub_b = uint32_sub(_a, _b);
+		const uint32_t result  = uint32_sels(a_sub_b, _a, _b);
+
+		return result;
+	}
+
+	inline uint32_t uint32_imax(uint32_t _a, uint32_t _b)
+	{
+		const uint32_t b_sub_a = uint32_sub(_b, _a);
+		const uint32_t result  = uint32_sels(b_sub_a, _a, _b);
+
+		return result;
+	}
+
+	inline uint32_t uint32_min(uint32_t _a, uint32_t _b)
+	{
+		return _a > _b ? _b : _a;
+	}
+
+	inline uint32_t uint32_max(uint32_t _a, uint32_t _b)
+	{
+		return _a > _b ? _a : _b;
+	}
+
+	inline uint32_t uint32_incwrap(uint32_t _val, uint32_t _min, uint32_t _max)
+	{
+		const uint32_t inc          = uint32_inc(_val);
+		const uint32_t max_diff     = uint32_sub(_max, _val);
+		const uint32_t neg_max_diff = uint32_neg(max_diff);
+		const uint32_t max_or       = uint32_or(max_diff, neg_max_diff);
+		const uint32_t max_diff_nz  = uint32_ext(max_or);
+		const uint32_t result       = uint32_selb(max_diff_nz, inc, _min);
+
+		return result;
+	}
+
+	inline uint32_t uint32_decwrap(uint32_t _val, uint32_t _min, uint32_t _max)
+	{
+		const uint32_t dec          = uint32_dec(_val);
+		const uint32_t min_diff     = uint32_sub(_min, _val);
+		const uint32_t neg_min_diff = uint32_neg(min_diff);
+		const uint32_t min_or       = uint32_or(min_diff, neg_min_diff);
+		const uint32_t min_diff_nz  = uint32_ext(min_or);
+		const uint32_t result       = uint32_selb(min_diff_nz, dec, _max);
+
+		return result;
+	}
+
+	inline uint32_t uint32_cntbits_ref(uint32_t _val)
+	{
+		const uint32_t tmp0   = uint32_srl(_val, 1);
+		const uint32_t tmp1   = uint32_and(tmp0, 0x55555555);
+		const uint32_t tmp2   = uint32_sub(_val, tmp1);
+		const uint32_t tmp3   = uint32_and(tmp2, 0xc30c30c3);
+		const uint32_t tmp4   = uint32_srl(tmp2, 2);
+		const uint32_t tmp5   = uint32_and(tmp4, 0xc30c30c3);
+		const uint32_t tmp6   = uint32_srl(tmp2, 4);
+		const uint32_t tmp7   = uint32_and(tmp6, 0xc30c30c3);
+		const uint32_t tmp8   = uint32_add(tmp3, tmp5);
+		const uint32_t tmp9   = uint32_add(tmp7, tmp8);
+		const uint32_t tmpA   = uint32_srl(tmp9, 6);
+		const uint32_t tmpB   = uint32_add(tmp9, tmpA);
+		const uint32_t tmpC   = uint32_srl(tmpB, 12);
+		const uint32_t tmpD   = uint32_srl(tmpB, 24);
+		const uint32_t tmpE   = uint32_add(tmpB, tmpC);
+		const uint32_t tmpF   = uint32_add(tmpD, tmpE);
+		const uint32_t result = uint32_and(tmpF, 0x3f);
+
+		return result;
+	}
+
+	/// Count number of bits set.
+	inline uint32_t uint32_cntbits(uint32_t _val)
+	{
+#if BX_COMPILER_GCC
+		return __builtin_popcount(_val);
+#elif BX_COMPILER_MSVC && BX_PLATFORM_WINDOWS
+		return __popcnt(_val);
+#else
+		return uint32_cntbits_ref(_val);
+#endif // BX_COMPILER_GCC
+	}
+
+	inline uint32_t uint32_cntlz_ref(uint32_t _val)
+	{
+		const uint32_t tmp0   = uint32_srl(_val, 1);
+		const uint32_t tmp1   = uint32_or(tmp0, _val);
+		const uint32_t tmp2   = uint32_srl(tmp1, 2);
+		const uint32_t tmp3   = uint32_or(tmp2, tmp1);
+		const uint32_t tmp4   = uint32_srl(tmp3, 4);
+		const uint32_t tmp5   = uint32_or(tmp4, tmp3);
+		const uint32_t tmp6   = uint32_srl(tmp5, 8);
+		const uint32_t tmp7   = uint32_or(tmp6, tmp5);
+		const uint32_t tmp8   = uint32_srl(tmp7, 16);
+		const uint32_t tmp9   = uint32_or(tmp8, tmp7);
+		const uint32_t tmpA   = uint32_not(tmp9);
+		const uint32_t result = uint32_cntbits(tmpA);
+
+		return result;
+	}
+
+	/// Count number of leading zeros.
+	inline uint32_t uint32_cntlz(uint32_t _val)
+	{
+#if BX_COMPILER_GCC
+		return __builtin_clz(_val);
+#elif BX_COMPILER_MSVC && BX_PLATFORM_WINDOWS
+		unsigned long index;
+		_BitScanReverse(&index, _val);
+		return 31 - index;
+#else
+		return uint32_cntlz_ref(_val);
+#endif // BX_COMPILER_
+	}
+
+	inline uint32_t uint32_cnttz_ref(uint32_t _val)
+	{
+		const uint32_t tmp0   = uint32_not(_val);
+		const uint32_t tmp1   = uint32_dec(_val);
+		const uint32_t tmp2   = uint32_and(tmp0, tmp1);
+		const uint32_t result = uint32_cntbits(tmp2);
+
+		return result;
+	}
+
+	inline uint32_t uint32_cnttz(uint32_t _val)
+	{
+#if BX_COMPILER_MSVC && BX_PLATFORM_WINDOWS
+		unsigned long index;
+		_BitScanForward(&index, _val);
+		return index;
+#else
+		return uint32_cnttz_ref(_val);
+#endif // BX_COMPILER_
+	}
+
+	// shuffle:
+	// ---- ---- ---- ---- fedc ba98 7654 3210
+	// to:
+	// -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
+	inline uint32_t uint32_part1by1(uint32_t _a)
+	{
+		const uint32_t val    = uint32_and(_a, 0xffff);
+
+		const uint32_t tmp0   = uint32_sll(val, 8);
+		const uint32_t tmp1   = uint32_xor(val, tmp0);
+		const uint32_t tmp2   = uint32_and(tmp1, 0x00ff00ff);
+
+		const uint32_t tmp3   = uint32_sll(tmp2, 4);
+		const uint32_t tmp4   = uint32_xor(tmp2, tmp3);
+		const uint32_t tmp5   = uint32_and(tmp4, 0x0f0f0f0f);
+
+		const uint32_t tmp6   = uint32_sll(tmp5, 2);
+		const uint32_t tmp7   = uint32_xor(tmp5, tmp6);
+		const uint32_t tmp8   = uint32_and(tmp7, 0x33333333);
+
+		const uint32_t tmp9   = uint32_sll(tmp8, 1);
+		const uint32_t tmpA   = uint32_xor(tmp8, tmp9);
+		const uint32_t result = uint32_and(tmpA, 0x55555555);
+
+		return result;
+	}
+
+	// shuffle:
+	// ---- ---- ---- ---- ---- --98 7654 3210
+	// to:
+	// ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
+	inline uint32_t uint32_part1by2(uint32_t _a)
+	{
+		const uint32_t val    = uint32_and(_a, 0x3ff);
+
+		const uint32_t tmp0   = uint32_sll(val, 16);
+		const uint32_t tmp1   = uint32_xor(val, tmp0);
+		const uint32_t tmp2   = uint32_and(tmp1, 0xff0000ff);
+
+		const uint32_t tmp3   = uint32_sll(tmp2, 8);
+		const uint32_t tmp4   = uint32_xor(tmp2, tmp3);
+		const uint32_t tmp5   = uint32_and(tmp4, 0x0300f00f);
+
+		const uint32_t tmp6   = uint32_sll(tmp5, 4);
+		const uint32_t tmp7   = uint32_xor(tmp5, tmp6);
+		const uint32_t tmp8   = uint32_and(tmp7, 0x030c30c3);
+
+		const uint32_t tmp9   = uint32_sll(tmp8, 2);
+		const uint32_t tmpA   = uint32_xor(tmp8, tmp9);
+		const uint32_t result = uint32_and(tmpA, 0x09249249);
+
+		return result;
+	}
+
+	inline uint32_t uint32_testpow2(uint32_t _a)
+	{
+		const uint32_t tmp0   = uint32_not(_a);
+		const uint32_t tmp1   = uint32_inc(tmp0);
+		const uint32_t tmp2   = uint32_and(_a, tmp1);
+		const uint32_t tmp3   = uint32_cmpeq(tmp2, _a);
+		const uint32_t tmp4   = uint32_cmpneq(_a, 0);
+		const uint32_t result = uint32_and(tmp3, tmp4);
+
+		return result;
+	}
+
+	inline uint32_t uint32_nextpow2(uint32_t _a)
+	{
+		const uint32_t tmp0   = uint32_dec(_a);
+		const uint32_t tmp1   = uint32_srl(tmp0, 1);
+		const uint32_t tmp2   = uint32_or(tmp0, tmp1);
+		const uint32_t tmp3   = uint32_srl(tmp2, 2);
+		const uint32_t tmp4   = uint32_or(tmp2, tmp3);
+		const uint32_t tmp5   = uint32_srl(tmp4, 4);
+		const uint32_t tmp6   = uint32_or(tmp4, tmp5);
+		const uint32_t tmp7   = uint32_srl(tmp6, 8);
+		const uint32_t tmp8   = uint32_or(tmp6, tmp7);
+		const uint32_t tmp9   = uint32_srl(tmp8, 16);
+		const uint32_t tmpA   = uint32_or(tmp8, tmp9);
+		const uint32_t result = uint32_inc(tmpA);
+
+		return result;
+	}
+} // namespace bx
+
+#endif // __BX_UINT32_T_H__