diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..ddb52c3
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,6 @@
+*.cpp eol=lf
+*.h   eol=lf
+*.sc  eol=lf
+*.sh  eol=lf
+*.md  eol=lf
+*.lua eol=lf
diff --git a/README.md b/README.md
index 08cc33b..075a723 100644
--- a/README.md
+++ b/README.md
@@ -1,39 +1,39 @@
-bx
-==
-
-Base library.
-
-Contact
--------
-
-[@bkaradzic](https://twitter.com/bkaradzic)  
-http://www.stuckingeometry.com
-
-Project page  
-https://github.com/bkaradzic/bx
-
-License
--------
-
-Copyright 2010-2012 Branimir Karadzic. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-   1. Redistributions of source code must retain the above copyright notice, this
-      list of conditions and the following disclaimer.
-
-   2. Redistributions in binary form must reproduce the above copyright notice,
-      this list of conditions and the following disclaimer in the documentation
-      and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR
-IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
-SHALL COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
-OF THE POSSIBILITY OF SUCH DAMAGE.
+bx
+==
+
+Base library.
+
+Contact
+-------
+
+[@bkaradzic](https://twitter.com/bkaradzic)  
+http://www.stuckingeometry.com
+
+Project page  
+https://github.com/bkaradzic/bx
+
+License
+-------
+
+Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice, this
+      list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimer in the documentation
+      and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+SHALL COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/include/bx/bx.h b/include/bx/bx.h
index 3429a42..4543e50 100644
--- a/include/bx/bx.h
+++ b/include/bx/bx.h
@@ -1,23 +1,23 @@
-/*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
-
-#ifndef __BX_H__
-#define __BX_H__
-
-#include <stdint.h>
-#include "platform.h"
-#include "macros.h"
-
-namespace bx
-{
-}// namespace bx
-
-#ifndef BX_NAMESPACE
-#	define BX_NAMESPACE 0
-#elif BX_NAMESPACE
-using namespace bx;
-#endif // BX_NAMESPACE
-
-#endif // __BX_H__ 
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_H__
+#define __BX_H__
+
+#include <stdint.h>
+#include "platform.h"
+#include "macros.h"
+
+namespace bx
+{
+}// namespace bx
+
+#ifndef BX_NAMESPACE
+#	define BX_NAMESPACE 0
+#elif BX_NAMESPACE
+using namespace bx;
+#endif // BX_NAMESPACE
+
+#endif // __BX_H__ 
diff --git a/include/bx/commandline.h b/include/bx/commandline.h
index 522c86f..c472baa 100644
--- a/include/bx/commandline.h
+++ b/include/bx/commandline.h
@@ -1,164 +1,164 @@
-/*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
-
-#ifndef __BX_COMMANDLINE_H__
-#define __BX_COMMANDLINE_H__
-
-#include "bx.h"
-#include "string.h"
-
-namespace bx
-{
-	class CommandLine
-	{
-	public:
-		CommandLine(int _argc, char const* const* _argv)
-			: m_argc(_argc)
-			, m_argv(_argv)
-		{
-		}
-
-		const char* findOption(const char* _long, const char* _default) const
-		{
-			const char* result = find('\0', _long, 1);
-			return result == NULL ? _default : result;
-		}
-
-		const char* findOption(const char _short, const char* _long, const char* _default) const
-		{
-			const char* result = find(_short, _long, 1);
-			return result == NULL ? _default : result;
-		}
-
-		const char* findOption(const char* _long, int _numParams = 1) const
-		{
-			const char* result = find('\0', _long, _numParams);
-			return result;
-		}
-
-		const char* findOption(const char _short, const char* _long = NULL, int _numParams = 1) const
-		{
-			const char* result = find(_short, _long, _numParams);
-			return result;
-		}
-
-		bool hasArg(const char _short, const char* _long = NULL) const
-		{
-			const char* arg = findOption(_short, _long, 0);
-			return NULL != arg;
-		}
-
-		bool hasArg(const char* _long) const
-		{
-			const char* arg = findOption('\0', _long, 0);
-			return NULL != arg;
-		}
-
-		bool hasArg(const char*& _value, const char _short, const char* _long = NULL) const
-		{
-			const char* arg = findOption(_short, _long, 1);
-			_value = arg;
-			return NULL != arg;
-		}
-
-		bool hasArg(int& _value, const char _short, const char* _long = NULL) const
-		{
-			const char* arg = findOption(_short, _long, 1);
-			if (NULL != arg)
-			{
-				_value = atoi(arg);
-				return true;
-			}
-
-			return false;
-		}
-
-		bool hasArg(unsigned int& _value, const char _short, const char* _long = NULL) const
-		{
-			const char* arg = findOption(_short, _long, 1);
-			if (NULL != arg)
-			{
-				_value = atoi(arg);
-				return true;
-			}
-
-			return false;
-		}
-
-		bool hasArg(bool& _value, const char _short, const char* _long = NULL) const
-		{
-			const char* arg = findOption(_short, _long, 1);
-			if (NULL != arg)
-			{
-				if ('0' == *arg || stricmp(arg, "false") )
-				{
-					_value = false;
-				}
-				else if ('0' != *arg || stricmp(arg, "true") )
-				{
-					_value = true;
-				}
-
-				return true;
-			}
-
-			return false;
-		}
-
-	private:
-		const char* find(const char _short, const char* _long, int _numParams) const
-		{
-			for (int ii = 0; ii < m_argc; ++ii)
-			{
-				const char* arg = m_argv[ii];
-				if ('-' == *arg)
-				{
-					++arg;
-					if (_short == *arg)
-					{
-						if (1 == strlen(arg) )
-						{
-							if (0 == _numParams)
-							{
-								return "";
-							}
-							else if (ii+_numParams < m_argc
-								 && '-' != *m_argv[ii+1] )
-							{
-								return m_argv[ii+1];
-							}
-
-							return NULL;
-						}
-					}
-					else if (NULL != _long
-						 &&  '-' == *arg
-						 &&  0 == stricmp(arg+1, _long) )
-					{
-						if (0 == _numParams)
-						{
-							return "";
-						}
-						else if (ii+_numParams < m_argc
-								&&  '-' != *m_argv[ii+1] )
-						{
-							return m_argv[ii+1];
-						}
-
-						return NULL;
-					}
-				}
-			}
-
-			return NULL;
-		}
-
-		int m_argc;
-		char const* const* m_argv;
-	};
-
-} // namespace bx
-
-#endif /// __BX_COMMANDLINE_H__
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_COMMANDLINE_H__
+#define __BX_COMMANDLINE_H__
+
+#include "bx.h"
+#include "string.h"
+
+namespace bx
+{
+	class CommandLine
+	{
+	public:
+		CommandLine(int _argc, char const* const* _argv) // Stores the argument count and the argv pointer as given; the strings are not copied.
+			: m_argc(_argc)
+			, m_argv(_argv)
+		{
+		}
+
+		const char* findOption(const char* _long, const char* _default) const
+		{
+			const char* value = find('\0', _long, 1); // long-name-only lookup expecting one parameter
+			return NULL != value ? value : _default;  // NULL from find() selects the caller's default
+		}
+
+		const char* findOption(const char _short, const char* _long, const char* _default) const
+		{
+			const char* value = find(_short, _long, 1); // one parameter is expected after the option
+			return NULL != value ? value : _default;    // NULL from find() selects the caller's default
+		}
+
+		// Long-name-only lookup: forwards to find() with '\0' as the short name.
+		const char* findOption(const char* _long, int _numParams = 1) const
+		{
+			return find('\0', _long, _numParams);
+		}
+
+		// Thin wrapper over find(); its result is returned unchanged.
+		const char* findOption(const char _short, const char* _long = NULL, int _numParams = 1) const
+		{
+			return find(_short, _long, _numParams);
+		}
+
+		// Flag test: true when findOption() reports the option with zero parameters.
+		bool hasArg(const char _short, const char* _long = NULL) const
+		{
+			return findOption(_short, _long, 0) != NULL;
+		}
+
+		// Long-name-only flag test: no short name ('\0') and zero parameters.
+		bool hasArg(const char* _long) const
+		{
+			return findOption('\0', _long, 0) != NULL;
+		}
+
+		// Writes the option's parameter (NULL when not found) to _value; true when found.
+		bool hasArg(const char*& _value, const char _short, const char* _long = NULL) const
+		{
+			_value = findOption(_short, _long, 1);
+			return _value != NULL;
+		}
+
+		// Looks up the option and, when it has a parameter, stores atoi() of it in _value.
+		bool hasArg(int& _value, const char _short, const char* _long = NULL) const
+		{
+			const char* param = findOption(_short, _long, 1);
+			if (NULL == param)
+			{
+				return false;
+			}
+			_value = atoi(param);
+			return true;
+		}
+
+		// Unsigned variant: the parameter is parsed with atoi() and the int result is assigned to _value.
+		bool hasArg(unsigned int& _value, const char _short, const char* _long = NULL) const
+		{
+			const char* param = findOption(_short, _long, 1);
+			const bool found = NULL != param;
+			if (found)
+			{
+				_value = atoi(param);
+			}
+			return found;
+		}
+
+		// Reads the option as a boolean: a parameter starting with '0' or equal to "false" is false, anything else is true.
+		bool hasArg(bool& _value, const char _short, const char* _long = NULL) const
+		{
+			const char* arg = findOption(_short, _long, 1);
+			if (NULL != arg)
+			{
+				// stricmp() returns 0 on a match, so it must be compared against 0 explicitly.
+				if ('0' == *arg || 0 == stricmp(arg, "false") )
+				{
+					_value = false;
+				}
+				else
+				{
+					_value = true;
+				}
+				return true;
+			}
+			return false; // option not found (or its parameter is missing); _value is left untouched
+		}
+
+	private:
+		const char* find(const char _short, const char* _long, int _numParams) const // Scans m_argv for "-<_short>" or "--<_long>".
+		{
+			for (int ii = 0; ii < m_argc; ++ii)
+			{
+				const char* arg = m_argv[ii];
+				if ('-' == *arg)
+				{
+					++arg; // step past the leading '-'
+					if (_short == *arg)
+					{
+						if (1 == strlen(arg) ) // exactly one character after '-': short option matched
+						{
+							if (0 == _numParams)
+							{
+								return ""; // flag without parameters: a non-NULL empty string signals "present"
+							}
+							else if (ii+_numParams < m_argc
+								 && '-' != *m_argv[ii+1] ) // enough arguments remain and the next one does not start with '-'
+							{
+								return m_argv[ii+1]; // the argument right after the option
+							}
+
+							return NULL; // option matched but the required parameter is missing
+						}
+					}
+					else if (NULL != _long
+						 &&  '-' == *arg
+						 &&  0 == stricmp(arg+1, _long) ) // second '-' followed by text that stricmp() reports equal to _long
+					{
+						if (0 == _numParams)
+						{
+							return ""; // flag without parameters: a non-NULL empty string signals "present"
+						}
+						else if (ii+_numParams < m_argc
+								&&  '-' != *m_argv[ii+1] ) // same parameter check as for the short form
+						{
+							return m_argv[ii+1]; // the argument right after the option
+						}
+
+						return NULL; // option matched but the required parameter is missing
+					}
+				}
+			}
+
+			return NULL; // no argument matched either name
+		}
+
+		int m_argc;
+		char const* const* m_argv;
+	};
+
+} // namespace bx
+
+#endif /// __BX_COMMANDLINE_H__
diff --git a/include/bx/countof.h b/include/bx/countof.h
index 128e427..86d6207 100644
--- a/include/bx/countof.h
+++ b/include/bx/countof.h
@@ -1,19 +1,19 @@
-/*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
-
-#ifndef __BX_COUNTOF_H__
-#define __BX_COUNTOF_H__
-
-#include "bx.h"
-
-namespace bx
-{
-	// http://cnicholson.net/2011/01/stupid-c-tricks-a-better-sizeof_array/
-	template<typename T, size_t N> char (&COUNTOF_REQUIRES_ARRAY_ARGUMENT(const T(&)[N]) )[N];
-#define countof(x) sizeof(bx::COUNTOF_REQUIRES_ARRAY_ARGUMENT(x) )
-
-} // namespace bx
-
-#endif // __BX_COUNTOF_H__
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_COUNTOF_H__
+#define __BX_COUNTOF_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	// http://cnicholson.net/2011/01/stupid-c-tricks-a-better-sizeof_array/
+	template<typename T, size_t N> char (&COUNTOF_REQUIRES_ARRAY_ARGUMENT(const T(&)[N]) )[N]; // declaration only: takes an array reference and returns a reference to char[N]
+#define countof(x) sizeof(bx::COUNTOF_REQUIRES_ARRAY_ARGUMENT(x) ) // sizeof(char[N]) == N, i.e. the element count of array x
+
+} // namespace bx
+
+#endif // __BX_COUNTOF_H__
diff --git a/include/bx/cpu.h b/include/bx/cpu.h
index 90d3204..be70b35 100644
--- a/include/bx/cpu.h
+++ b/include/bx/cpu.h
@@ -1,102 +1,102 @@
-/*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
-
-#ifndef __BX_CPU_H__
-#define __BX_CPU_H__
-
-#include "bx.h"
-
-#if BX_COMPILER_MSVC
-#	if BX_PLATFORM_XBOX360
-#		include <ppcintrinsics.h>
-#		include <xtl.h>
-#	else
-#		include <math.h> // math.h is included because VS bitches:
-						 // warning C4985: 'ceil': attributes not present on previous declaration.
-						 // must be included before intrin.h.
-#		include <intrin.h>
-#		include <windows.h>
-#	endif // !BX_PLATFORM_XBOX360
-extern "C" void _ReadBarrier();
-extern "C" void _WriteBarrier();
-extern "C" void _ReadWriteBarrier();
-#	pragma intrinsic(_ReadBarrier)
-#	pragma intrinsic(_WriteBarrier)
-#	pragma intrinsic(_ReadWriteBarrier)
-#	pragma intrinsic(_InterlockedIncrement)
-#	pragma intrinsic(_InterlockedDecrement)
-#endif // BX_COMPILER_MSVC
-
-namespace bx
-{
-	inline void readBarrier()
-	{
-#if BX_COMPILER_MSVC
-		_ReadBarrier();
-#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
-		asm volatile("":::"memory");
-#endif // BX_COMPILER
-	}
-
-	inline void writeBarrier()
-	{
-#if BX_COMPILER_MSVC
-		_WriteBarrier();
-#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
-		asm volatile("":::"memory");
-#endif // BX_COMPILER
-	}
-
-	inline void readWriteBarrier()
-	{
-#if BX_COMPILER_MSVC
-		_ReadWriteBarrier();
-#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
-		asm volatile("":::"memory");
-#endif // BX_COMPILER
-	}
-
-	inline void memoryBarrier()
-	{
-#if BX_PLATFORM_XBOX360
-		__lwsync();
-#elif BX_COMPILER_MSVC
-		_mm_mfence();
-#else
-		__sync_synchronize();
-//		asm volatile("mfence":::"memory");
-#endif // BX_COMPILER
-	}
-
-	inline int32_t atomicIncr(volatile void* _var)
-	{
-#if BX_COMPILER_MSVC
-		return _InterlockedIncrement( (volatile LONG*)(_var) );
-#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
-		return __sync_fetch_and_add( (volatile int32_t*)_var, 1);
-#endif // BX_COMPILER
-	}
-
-	inline int32_t atomicDecr(volatile void* _var)
-	{
-#if BX_COMPILER_MSVC
-		return _InterlockedDecrement( (volatile LONG*)(_var) );
-#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
-		return __sync_fetch_and_sub( (volatile int32_t*)_var, 1);
-#endif // BX_COMPILER
-	}
-
-	inline void* atomicExchangePtr(void** _target, void* _ptr)
-	{
-#if BX_COMPILER_MSVC
-		return InterlockedExchangePointer(_target, _ptr);
-#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
-		return __sync_lock_test_and_set(_target, _ptr);
-#endif // BX_COMPILER
-	}
-
-} // namespace bx
-
-#endif // __BX_CPU_H__
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_CPU_H__
+#define __BX_CPU_H__
+
+#include "bx.h"
+
+#if BX_COMPILER_MSVC
+#	if BX_PLATFORM_XBOX360
+#		include <ppcintrinsics.h>
+#		include <xtl.h>
+#	else
+#		include <math.h> // math.h is included because VS bitches:
+						 // warning C4985: 'ceil': attributes not present on previous declaration.
+						 // must be included before intrin.h.
+#		include <intrin.h>
+#		include <windows.h>
+#	endif // !BX_PLATFORM_XBOX360
+extern "C" void _ReadBarrier();
+extern "C" void _WriteBarrier();
+extern "C" void _ReadWriteBarrier();
+#	pragma intrinsic(_ReadBarrier)
+#	pragma intrinsic(_WriteBarrier)
+#	pragma intrinsic(_ReadWriteBarrier)
+#	pragma intrinsic(_InterlockedIncrement)
+#	pragma intrinsic(_InterlockedDecrement)
+#endif // BX_COMPILER_MSVC
+
+namespace bx
+{
+	inline void readBarrier() // Expands to nothing when neither compiler branch below is selected.
+	{
+#if BX_COMPILER_MSVC
+		_ReadBarrier();
+#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
+		asm volatile("":::"memory"); // empty asm statement with a "memory" clobber
+#endif // BX_COMPILER
+	}
+
+	inline void writeBarrier() // Expands to nothing when neither compiler branch below is selected.
+	{
+#if BX_COMPILER_MSVC
+		_WriteBarrier();
+#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
+		asm volatile("":::"memory"); // same empty asm + "memory" clobber as readBarrier()
+#endif // BX_COMPILER
+	}
+
+	inline void readWriteBarrier() // Expands to nothing when neither compiler branch below is selected.
+	{
+#if BX_COMPILER_MSVC
+		_ReadWriteBarrier();
+#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
+		asm volatile("":::"memory"); // GCC/Clang use the identical statement for all three barrier helpers
+#endif // BX_COMPILER
+	}
+
+	inline void memoryBarrier() // Issues a fence instruction/builtin chosen per platform and compiler.
+	{
+#if BX_PLATFORM_XBOX360
+		__lwsync(); // checked first, so Xbox 360 never reaches the generic MSVC branch
+#elif BX_COMPILER_MSVC
+		_mm_mfence();
+#else
+		__sync_synchronize(); // used for every remaining compiler, not only GCC/Clang
+//		asm volatile("mfence":::"memory");
+#endif // BX_COMPILER
+	}
+
+	inline int32_t atomicIncr(volatile void* _var) // Atomically adds 1 to the 32-bit integer at _var; no value is returned if neither branch is selected.
+	{
+#if BX_COMPILER_MSVC
+		return _InterlockedIncrement( (volatile LONG*)(_var) ); // NOTE(review): documented to return the incremented value - confirm
+#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
+		return __sync_fetch_and_add( (volatile int32_t*)_var, 1); // NOTE(review): documented to return the value before the add, so the result would differ by one from the MSVC branch - confirm what callers expect
+#endif // BX_COMPILER
+	}
+
+	inline int32_t atomicDecr(volatile void* _var) // Atomically subtracts 1 from the 32-bit integer at _var; no value is returned if neither branch is selected.
+	{
+#if BX_COMPILER_MSVC
+		return _InterlockedDecrement( (volatile LONG*)(_var) ); // NOTE(review): documented to return the decremented value - confirm
+#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
+		return __sync_fetch_and_sub( (volatile int32_t*)_var, 1); // NOTE(review): documented to return the value before the subtract, so the result would differ by one from the MSVC branch - confirm what callers expect
+#endif // BX_COMPILER
+	}
+
+	inline void* atomicExchangePtr(void** _target, void* _ptr) // Swaps _ptr into *_target via the compiler's exchange primitive and returns that primitive's result.
+	{
+#if BX_COMPILER_MSVC
+		return InterlockedExchangePointer(_target, _ptr);
+#elif BX_COMPILER_GCC || BX_COMPILER_CLANG
+		return __sync_lock_test_and_set(_target, _ptr); // NOTE(review): GCC documents this builtin as an acquire barrier only - confirm that is sufficient here
+#endif // BX_COMPILER
+	}
+
+} // namespace bx
+
+#endif // __BX_CPU_H__
diff --git a/include/bx/debug.h b/include/bx/debug.h
index 1ff7a1e..8464c0c 100644
--- a/include/bx/debug.h
+++ b/include/bx/debug.h
@@ -1,31 +1,31 @@
-/*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
-
-#ifndef __BX_DEBUG_H__
-#define __BX_DEBUG_H__
-
-#include "bx.h"
-
-namespace bx
-{
-	inline void debugBreak()
-	{
-#if BX_COMPILER_MSVC
-		__debugbreak();
-#elif BX_CPU_ARM
-		asm("bkpt 0");
-#elif !BX_PLATFORM_NACL && BX_CPU_X86 && (BX_COMPILER_GCC || BX_COMPILER_CLANG)
-		// NaCl doesn't like int 3:
-		// NativeClient: NaCl module load failed: Validation failure. File violates Native Client safety rules.
-		__asm__ ("int $3");
-#else // cross platform implementation
-		int* int3 = (int*)3L;
-		*int3 = 3;
-#endif // BX
-	}
-
-} // namespace bx
-
-#endif // __BX_DEBUG_H__
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_DEBUG_H__
+#define __BX_DEBUG_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	inline void debugBreak() // Stops in the debugger using whichever mechanism the first matching branch provides.
+	{
+#if BX_COMPILER_MSVC
+		__debugbreak();
+#elif BX_CPU_ARM
+		asm("bkpt 0");
+#elif !BX_PLATFORM_NACL && BX_CPU_X86 && (BX_COMPILER_GCC || BX_COMPILER_CLANG)
+		// NaCl doesn't like int 3:
+		// NativeClient: NaCl module load failed: Validation failure. File violates Native Client safety rules.
+		__asm__ ("int $3");
+#else // cross platform implementation
+		int* int3 = (int*)3L; // pointer built from the literal address 3
+		*int3 = 3; // store through that pointer; NOTE(review): presumably meant to fault, but it is not volatile so an optimizer may drop it - confirm
+#endif // BX
+	}
+
+} // namespace bx
+
+#endif // __BX_DEBUG_H__
diff --git a/include/bx/endian.h b/include/bx/endian.h
index 4056dce..5e34841 100644
--- a/include/bx/endian.h
+++ b/include/bx/endian.h
@@ -1,71 +1,71 @@
-/*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
-
-#ifndef __BX_ENDIAN_H__
-#define __BX_ENDIAN_H__
-
-#include "bx.h"
-
-namespace bx
-{
-	inline uint16_t endianSwap(uint16_t _in)
-	{
-		return (_in>>8) | (_in<<8);
-	}
-	
-	inline uint32_t endianSwap(uint32_t _in)
-	{
-		return (_in>>24) | (_in<<24)
-			 | ( (_in&0x00ff0000)>>8) | ( (_in&0x0000ff00)<<8)
-			 ;
-	}
-
-	inline uint64_t endianSwap(uint64_t _in)
-	{
-		return (_in>>56) | (_in<<56)
-			 | ( (_in&UINT64_C(0x00ff000000000000) )>>40) | ( (_in&UINT64_C(0x000000000000ff00) )<<40)
-			 | ( (_in&UINT64_C(0x0000ff0000000000) )>>24) | ( (_in&UINT64_C(0x0000000000ff0000) )<<24)
-			 | ( (_in&UINT64_C(0x000000ff00000000) )>>8)  | ( (_in&UINT64_C(0x00000000ff000000) )<<8)
-			 ;
-	}
-
-	inline int16_t endianSwap(int16_t _in)
-	{
-		return (int16_t)endianSwap( (uint16_t)_in);
-	}
-
-	inline int32_t endianSwap(int32_t _in)
-	{
-		return (int32_t)endianSwap( (uint32_t)_in);
-	}
-
-	inline int64_t endianSwap(int64_t _in)
-	{
-		return (int64_t)endianSwap( (uint64_t)_in);
-	}
-
-	template <typename Ty>
-	inline Ty littleEndian(Ty& _in)
-	{
-#if BX_CPU_ENDIAN_BIG
-		endianSwap(_in);
-#else
-		return _in;
-#endif // BX_CPU_ENDIAN_BIG
-	}
-
-	template <typename Ty>
-	inline Ty bigEndian(Ty& _in)
-	{
-#if BX_CPU_ENDIAN_LITTLE
-		return endianSwap(_in);
-#else
-		return _in;
-#endif // BX_CPU_ENDIAN_LITTLE
-	}
-
-} // namespace bx
-
-#endif // __BX_ENDIAN_H__
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_ENDIAN_H__
+#define __BX_ENDIAN_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	inline uint16_t endianSwap(uint16_t _in)
+	{
+		return (_in<<8) | (_in>>8); // operands promote to int; the uint16_t return type keeps only the low 16 bits
+	}
+	
+	inline uint32_t endianSwap(uint32_t _in)
+	{
+		const uint32_t outer = (_in<<24) | (_in>>24);                           // bytes 0 and 3 trade places
+		const uint32_t inner = ( (_in&0x0000ff00)<<8) | ( (_in&0x00ff0000)>>8); // bytes 1 and 2 trade places
+		return outer | inner;
+	}
+
+	inline uint64_t endianSwap(uint64_t _in)
+	{
+		const uint64_t b07 = (_in<<56) | (_in>>56); // bytes 0 and 7
+		const uint64_t b16 = ( (_in&UINT64_C(0x000000000000ff00) )<<40) | ( (_in&UINT64_C(0x00ff000000000000) )>>40); // bytes 1 and 6
+		const uint64_t b25 = ( (_in&UINT64_C(0x0000000000ff0000) )<<24) | ( (_in&UINT64_C(0x0000ff0000000000) )>>24); // bytes 2 and 5
+		const uint64_t b34 = ( (_in&UINT64_C(0x00000000ff000000) )<<8)  | ( (_in&UINT64_C(0x000000ff00000000) )>>8);  // bytes 3 and 4
+		return b07 | b16 | b25 | b34;
+	}
+
+	inline int16_t endianSwap(int16_t _in)
+	{
+		return static_cast<int16_t>(endianSwap(static_cast<uint16_t>(_in) ) ); // delegate to the uint16_t overload
+	}
+
+	inline int32_t endianSwap(int32_t _in)
+	{
+		return static_cast<int32_t>(endianSwap(static_cast<uint32_t>(_in) ) ); // delegate to the uint32_t overload
+	}
+
+	inline int64_t endianSwap(int64_t _in)
+	{
+		return static_cast<int64_t>(endianSwap(static_cast<uint64_t>(_in) ) ); // delegate to the uint64_t overload
+	}
+
+	template <typename Ty> // Returns _in byte-swapped on big-endian builds and unchanged otherwise.
+	inline Ty littleEndian(Ty& _in)
+	{
+#if BX_CPU_ENDIAN_BIG
+		return endianSwap(_in); // the swapped value was previously discarded and the function fell off its end
+#else
+		return _in; // BX_CPU_ENDIAN_BIG is not set: no swap is performed
+#endif // BX_CPU_ENDIAN_BIG
+	}
+
+	template <typename Ty> // Returns _in byte-swapped on little-endian builds and unchanged otherwise.
+	inline Ty bigEndian(Ty& _in)
+	{
+#if BX_CPU_ENDIAN_LITTLE
+		return endianSwap(_in); // resolved to the endianSwap() overload matching Ty
+#else
+		return _in; // BX_CPU_ENDIAN_LITTLE is not set: no swap is performed
+#endif // BX_CPU_ENDIAN_LITTLE
+	}
+
+} // namespace bx
+
+#endif // __BX_ENDIAN_H__
diff --git a/include/bx/float4_neon.h b/include/bx/float4_neon.h
index d22e76a..f2668a3 100644
--- a/include/bx/float4_neon.h
+++ b/include/bx/float4_neon.h
@@ -1,244 +1,244 @@
-/*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
-
-#ifndef __BX_FLOAT4_NEON_H__
-#define __BX_FLOAT4_NEON_H__
-
-#include <arm_neon.h>
-
-namespace bx
-{
-
-// Reference:
-// http://gcc.gnu.org/onlinedocs/gcc/ARM-NEON-Intrinsics.html
-// http://blogs.arm.com/software-enablement/161-coding-for-neon-part-1-load-and-stores/
-// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
-// http://blogs.arm.com/software-enablement/241-coding-for-neon-part-3-matrix-multiplication/
-// http://blogs.arm.com/software-enablement/277-coding-for-neon-part-4-shifting-left-and-right/
-// http://blogs.arm.com/software-enablement/684-coding-for-neon-part-5-rearranging-vectors/
-
-	typedef __builtin_neon_sf float4_t __attribute__( (__vector_size__(16) ) );
-
-#define ELEMx 0
-#define ELEMy 1
-#define ELEMz 2
-#define ELEMw 3
-#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
-			BX_FLOAT4_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \
-			{ \
-				float4_t result; \
-				result.ixyzw[0] = _a.ixyzw[ELEM##_x]; \
-				result.ixyzw[1] = _a.ixyzw[ELEM##_y]; \
-				result.ixyzw[2] = _a.ixyzw[ELEM##_z]; \
-				result.ixyzw[3] = _a.ixyzw[ELEM##_w]; \
-				return result; \
-			}
-
-#include "float4_swizzle.inl"
-
-#undef IMPLEMENT_SWIZZLE
-#undef ELEMw
-#undef ELEMz
-#undef ELEMy
-#undef ELEMx
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b)
-	{
-		return _a; //_mm_movelh_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b)
-	{
-		return _a; //_mm_movelh_ps(_b, _a);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b)
-	{
-		return _a; //_mm_movehl_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b)
-	{
-		return _a; //_mm_movehl_ps(_b, _a);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b)
-	{
-		return _a; //_mm_unpacklo_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b)
-	{
-		return _a; //_mm_unpacklo_ps(_b, _a);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b)
-	{
-		return _a; //_mm_unpackhi_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b)
-	{
-		return _a; //_mm_unpackhi_ps(_b, _a);
-	}
-
-	BX_FLOAT4_INLINE float float4_x(float4_t _a)
-	{
-		return _a.fxyzw[0];
-	}
-
-	BX_FLOAT4_INLINE float float4_y(float4_t _a)
-	{
-		return _a.fxyzw[1];
-	}
-
-	BX_FLOAT4_INLINE float float4_z(float4_t _a)
-	{
-		return _a.fxyzw[2];
-	}
-
-	BX_FLOAT4_INLINE float float4_w(float4_t _a)
-	{
-		return _a.fxyzw[3];
-	}
-
-//	BX_FLOAT4_INLINE float4_t float4_ld(const void* _ptr)
-//	{
-//		return _mm_load_ps(reinterpret_cast<const float*>(_ptr) );
-//	}
-
-//	BX_FLOAT4_INLINE void float4_st(void* _ptr, float4_t _a)
-//	{
-//		_mm_store_ps(reinterpret_cast<float*>(_ptr), _a);
-//	}
-
-//	BX_FLOAT4_INLINE void float4_stream(void* _ptr, float4_t _a)
-//	{
-//		_mm_stream_ps(reinterpret_cast<float*>(_ptr), _a);
-//	}
-
-	BX_FLOAT4_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w)
-	{
-		const float32_t val[4] = {_x, _y, _z, _w};
-		return __builtin_neon_vld1v4sf(val);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
-	{
-		const uint32_t val[4] = {_x, _y, _z, _w};
-		return (float4_t)__builtin_neon_vld1v4si( (const __builtin_neon_si*)val);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_splat(float _a)
-	{
-		return __builtin_neon_vdup_nv4sf(_a);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_isplat(uint32_t _a)
-	{
-		return (float4_t)__builtin_neon_vdup_nv4si( (__builtin_neon_si)_a);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_zero()
-	{
-		return vdupq_n_f32(0.0f);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_add(float4_t _a, float4_t _b)
-	{
-		return vaddq_f32(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_sub(float4_t _a, float4_t _b)
-	{
-		return vsubq_f32(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_mul(float4_t _a, float4_t _b)
-	{
-		return vmulq_f32(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_rcp_est(float4_t _a)
-	{
-		return vrecpeq_f32(_a);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_rsqrt_est(float4_t _a)
-	{
-		return vrsqrteq_f32(_a);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_and(float4_t _a, float4_t _b)
-	{
-		return (float4_t)__builtin_neon_vandv4si( (int32x4_t)_a, (int32x4_t)_b, 0);
-	}
-
-	//BX_FLOAT4_INLINE float4_t float4_andc(float4_t _a, float4_t _b)
-	//{
-	//	return _mm_andnot_ps(_b, _a);
-	//}
-
-	BX_FLOAT4_INLINE float4_t float4_or(float4_t _a, float4_t _b)
-	{
-		return (float4_t)__builtin_neon_vorrv4si( (int32x4_t)_a, (int32x4_t)_b, 0);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_iadd(float4_t _a, float4_t _b)
-	{
-		const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
-		const uint32x4_t tmp1 = vreinterpretq_u32_f32(_b);
-		const uint32x4_t add  = vaddq_u32(tmp0, tmp1);
-		const float4_t result = vreinterpretq_f32_u32(add);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_isub(float4_t _a, float4_t _b)
-	{
-		const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
-		const uint32x4_t tmp1 = vreinterpretq_u32_f32(_b);
-		const uint32x4_t sub  = vsubq_u32(tmp0, tmp1);
-		const float4_t result = vreinterpretq_f32_u32(sub);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_sll(float4_t _a, int _count)
-	{
-		const uint32x4_t tmp   = vreinterpretq_u32_f32(_a);
-		const uint32x4_t shift = vshlq_n_u32(tmp, _count);
-		const float4_t result  = vreinterpretq_f32_u32(shift);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_srl(float4_t _a, int _count)
-	{
-		const uint32x4_t tmp   = vreinterpretq_i32_f32(_a);
-		const uint32x4_t shift = (uint32x4_t)__builtin_neon_vshr_nv4si( (int32x4_t)tmp, _count, 0);
-		const float4_t result  = vreinterpretq_f32_u32(shift);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_sra(float4_t _a, int _count)
-	{
-		const int32x4_t a     = vreinterpretq_s32_f32(_a);
-		const int32x4_t shift = __builtin_neon_vshr_nv4si(a, _count, 1);
-		const float4_t result = vreinterpretq_f32_s32(shift);
-
-		return result;
-	}
-
-} // namespace bx
-
-#define float4_div_nr float4_div_nr_ni
-#define float4_div float4_div_nr_ni
-#define float4_ceil float4_ceil_ni
-#define float4_floor float4_floor_ni
-#include "float4_ni.h"
-
-#endif // __BX_FLOAT4_NEON_H__
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_FLOAT4_NEON_H__
+#define __BX_FLOAT4_NEON_H__
+
+#include <arm_neon.h>
+
+namespace bx
+{
+
+// Reference:
+// http://gcc.gnu.org/onlinedocs/gcc/ARM-NEON-Intrinsics.html
+// http://blogs.arm.com/software-enablement/161-coding-for-neon-part-1-load-and-stores/
+// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
+// http://blogs.arm.com/software-enablement/241-coding-for-neon-part-3-matrix-multiplication/
+// http://blogs.arm.com/software-enablement/277-coding-for-neon-part-4-shifting-left-and-right/
+// http://blogs.arm.com/software-enablement/684-coding-for-neon-part-5-rearranging-vectors/
+
+	typedef __builtin_neon_sf float4_t __attribute__( (__vector_size__(16) ) );
+
+#define ELEMx 0
+#define ELEMy 1
+#define ELEMz 2
+#define ELEMw 3
+#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
+			BX_FLOAT4_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \
+			{ \
+				float4_t result; \
+				result.ixyzw[0] = _a.ixyzw[ELEM##_x]; \
+				result.ixyzw[1] = _a.ixyzw[ELEM##_y]; \
+				result.ixyzw[2] = _a.ixyzw[ELEM##_z]; \
+				result.ixyzw[3] = _a.ixyzw[ELEM##_w]; \
+				return result; \
+			}
+
+#include "float4_swizzle.inl"
+
+#undef IMPLEMENT_SWIZZLE
+#undef ELEMw
+#undef ELEMz
+#undef ELEMy
+#undef ELEMx
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b)
+	{
+		return _a; //_mm_movelh_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b)
+	{
+		return _a; //_mm_movelh_ps(_b, _a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b)
+	{
+		return _a; //_mm_movehl_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b)
+	{
+		return _a; //_mm_movehl_ps(_b, _a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b)
+	{
+		return _a; //_mm_unpacklo_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b) // Placeholder shuffle: currently returns _a unchanged and never reads _b.
+	{
+		return _a; //_mm_unpacklo_ps(_b, _a); NOTE(review): the name reads as (a.y, b.y, a.x, b.x) but this SSE call would give (b.x, a.x, b.y, a.y) - confirm the intended lane order before porting.
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b) // Returns (_a.z, _b.z, _a.w, _b.w).
+	{
+		return vzipq_f32(_a, _b).val[1]; // High half of the interleave; NEON counterpart of _mm_unpackhi_ps(_a, _b). The previous stub returned _a and ignored _b.
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b) // Returns (_b.z, _a.z, _b.w, _a.w).
+	{
+		return vzipq_f32(_b, _a).val[1]; // High half of the interleave with operands swapped; NEON counterpart of _mm_unpackhi_ps(_b, _a). The previous stub returned _a and ignored _b.
+	}
+
+	BX_FLOAT4_INLINE float float4_x(float4_t _a) // Returns lane 0 (x) of _a as a scalar.
+	{
+		return vgetq_lane_f32(_a, 0); // float4_t is a NEON vector type with no .fxyzw member, so the lane is read with the intrinsic.
+	}
+
+	BX_FLOAT4_INLINE float float4_y(float4_t _a) // Returns lane 1 (y) of _a as a scalar.
+	{
+		return vgetq_lane_f32(_a, 1); // float4_t is a NEON vector type with no .fxyzw member, so the lane is read with the intrinsic.
+	}
+
+	BX_FLOAT4_INLINE float float4_z(float4_t _a) // Returns lane 2 (z) of _a as a scalar.
+	{
+		return vgetq_lane_f32(_a, 2); // float4_t is a NEON vector type with no .fxyzw member, so the lane is read with the intrinsic.
+	}
+
+	BX_FLOAT4_INLINE float float4_w(float4_t _a) // Returns lane 3 (w) of _a as a scalar.
+	{
+		return vgetq_lane_f32(_a, 3); // float4_t is a NEON vector type with no .fxyzw member, so the lane is read with the intrinsic.
+	}
+
+//	BX_FLOAT4_INLINE float4_t float4_ld(const void* _ptr)
+//	{
+//		return _mm_load_ps(reinterpret_cast<const float*>(_ptr) );
+//	}
+
+//	BX_FLOAT4_INLINE void float4_st(void* _ptr, float4_t _a)
+//	{
+//		_mm_store_ps(reinterpret_cast<float*>(_ptr), _a);
+//	}
+
+//	BX_FLOAT4_INLINE void float4_stream(void* _ptr, float4_t _a)
+//	{
+//		_mm_stream_ps(reinterpret_cast<float*>(_ptr), _a);
+//	}
+
+	BX_FLOAT4_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w) // Builds a vector whose lanes are (_x, _y, _z, _w).
+	{
+		const float32_t val[4] = {_x, _y, _z, _w}; // Stage the scalars in memory in lane order.
+		return vld1q_f32(val); // Public load intrinsic instead of calling __builtin_neon_vld1v4sf directly.
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) // Builds a vector whose lanes hold the raw 32-bit patterns (_x, _y, _z, _w).
+	{
+		const uint32_t val[4] = {_x, _y, _z, _w}; // Stage the bit patterns in memory in lane order.
+		return vreinterpretq_f32_u32(vld1q_u32(val) ); // Integer load, then a bit-preserving reinterpret (no int-to-float conversion).
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_splat(float _a) // Broadcasts _a into all four lanes.
+	{
+		return vdupq_n_f32(_a); // Same public intrinsic float4_zero uses, instead of the raw __builtin_neon_vdup_nv4sf.
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_isplat(uint32_t _a) // Broadcasts the 32-bit pattern _a into all four lanes.
+	{
+		return vreinterpretq_f32_u32(vdupq_n_u32(_a) ); // Integer broadcast, then a bit-preserving reinterpret (no int-to-float conversion).
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_zero() // Returns a vector with 0.0f in every lane.
+	{
+		return vdupq_n_f32(0.0f);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_add(float4_t _a, float4_t _b) // Lane-wise float addition _a + _b (vaddq_f32).
+	{
+		return vaddq_f32(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sub(float4_t _a, float4_t _b) // Lane-wise float subtraction _a - _b (vsubq_f32).
+	{
+		return vsubq_f32(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_mul(float4_t _a, float4_t _b) // Lane-wise float multiplication _a * _b (vmulq_f32).
+	{
+		return vmulq_f32(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_rcp_est(float4_t _a) // Lane-wise reciprocal estimate of _a (vrecpeq_f32); an approximation, not an exact 1/_a.
+	{
+		return vrecpeq_f32(_a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_rsqrt_est(float4_t _a) // Lane-wise reciprocal square-root estimate of _a (vrsqrteq_f32); an approximation.
+	{
+		return vrsqrteq_f32(_a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_and(float4_t _a, float4_t _b) // Bitwise AND of the raw lane bits of _a and _b.
+	{
+		return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(_a), vreinterpretq_u32_f32(_b) ) ); // Public intrinsics replace __builtin_neon_vandv4si and its magic third argument.
+	}
+
+	//BX_FLOAT4_INLINE float4_t float4_andc(float4_t _a, float4_t _b)
+	//{
+	//	return _mm_andnot_ps(_b, _a);
+	//}
+
+	BX_FLOAT4_INLINE float4_t float4_or(float4_t _a, float4_t _b) // Bitwise OR of the raw lane bits of _a and _b.
+	{
+		return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(_a), vreinterpretq_u32_f32(_b) ) ); // Public intrinsics replace __builtin_neon_vorrv4si and its magic third argument.
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_iadd(float4_t _a, float4_t _b) // Adds the lanes of _a and _b as 32-bit unsigned integers and returns the bits as float4_t.
+	{
+		const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); // Reinterpret only; no float-to-int conversion.
+		const uint32x4_t tmp1 = vreinterpretq_u32_f32(_b);
+		const uint32x4_t add  = vaddq_u32(tmp0, tmp1);
+		const float4_t result = vreinterpretq_f32_u32(add);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_isub(float4_t _a, float4_t _b) // Subtracts the lanes of _b from _a as 32-bit unsigned integers and returns the bits as float4_t.
+	{
+		const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); // Reinterpret only; no float-to-int conversion.
+		const uint32x4_t tmp1 = vreinterpretq_u32_f32(_b);
+		const uint32x4_t sub  = vsubq_u32(tmp0, tmp1);
+		const float4_t result = vreinterpretq_f32_u32(sub);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sll(float4_t _a, int _count) // Shifts the 32-bit pattern of each lane left by _count bits.
+	{
+		const uint32x4_t tmp   = vreinterpretq_u32_f32(_a); // Reinterpret only; no float-to-int conversion.
+		const uint32x4_t shift = vshlq_n_u32(tmp, _count); // NOTE(review): the _n_ shift intrinsics take an immediate; presumably _count must be a constant at each call site - confirm.
+		const float4_t result  = vreinterpretq_f32_u32(shift);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_srl(float4_t _a, int _count) // Logical (zero-filling) right shift of the 32-bit pattern of each lane by _count bits.
+	{
+		const uint32x4_t tmp   = vreinterpretq_u32_f32(_a); // Was vreinterpretq_i32_f32, which is not a NEON intrinsic name.
+		const uint32x4_t shift = vshrq_n_u32(tmp, _count); // Unsigned shift intrinsic, matching float4_sll. NOTE(review): takes an immediate; presumably _count must be a constant at each call site - confirm.
+		const float4_t result  = vreinterpretq_f32_u32(shift);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sra(float4_t _a, int _count) // Arithmetic (sign-extending) right shift of the 32-bit pattern of each lane by _count bits.
+	{
+		const int32x4_t a     = vreinterpretq_s32_f32(_a); // Reinterpret only; no float-to-int conversion.
+		const int32x4_t shift = vshrq_n_s32(a, _count); // Signed shift intrinsic instead of the raw __builtin_neon_vshr_nv4si. NOTE(review): takes an immediate; presumably _count must be a constant at each call site - confirm.
+		const float4_t result = vreinterpretq_f32_s32(shift);
+
+		return result;
+	}
+
+} // namespace bx
+
+#define float4_div_nr float4_div_nr_ni // No NEON-specific version is defined above, so these names alias the generic *_ni implementations.
+#define float4_div float4_div_nr_ni // float4_div shares the float4_div_nr_ni implementation.
+#define float4_ceil float4_ceil_ni
+#define float4_floor float4_floor_ni
+#include "float4_ni.h" // Pulled in after the aliases; it defines the *_ni functions named above.
+
+#endif // __BX_FLOAT4_NEON_H__
diff --git a/include/bx/float4_ni.h b/include/bx/float4_ni.h
index 7d998de..d423545 100644
--- a/include/bx/float4_ni.h
+++ b/include/bx/float4_ni.h
@@ -1,431 +1,431 @@
-/*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
-
-#ifndef __BX_FLOAT4_NI_H__
-#define __BX_FLOAT4_NI_H__
-
-namespace bx
-{
-	BX_FLOAT4_INLINE float4_t float4_shuf_xAzC_ni(float4_t _a, float4_t _b)
-	{
-		const float4_t xAyB   = float4_shuf_xAyB(_a, _b);
-		const float4_t zCwD   = float4_shuf_zCwD(_a, _b);
-		const float4_t result = float4_shuf_xyAB(xAyB, zCwD);
-		
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_yBwD_ni(float4_t _a, float4_t _b)
-	{
-		const float4_t xAyB   = float4_shuf_xAyB(_a, _b);
-		const float4_t zCwD   = float4_shuf_zCwD(_a, _b);
-		const float4_t result = float4_shuf_zwCD(xAyB, zCwD);
-		
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_madd_ni(float4_t _a, float4_t _b, float4_t _c)
-	{
-		const float4_t mul    = float4_mul(_a, _b);
-		const float4_t result = float4_add(mul, _c);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_nmsub_ni(float4_t _a, float4_t _b, float4_t _c)
-	{
-		const float4_t mul    = float4_mul(_a, _b);
-		const float4_t result = float4_sub(_c, mul);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_div_nr_ni(float4_t _a, float4_t _b)
-	{
-		const float4_t oneish  = float4_isplat(0x3f800001);
-		const float4_t est     = float4_rcp_est(_b);
-		const float4_t iter0   = float4_mul(_a, est);
-		const float4_t tmp1    = float4_nmsub(_b, est, oneish);
-		const float4_t result  = float4_madd(tmp1, iter0, iter0);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_rcp_ni(float4_t _a)
-	{
-		const float4_t one    = float4_splat(1.0f);
-		const float4_t result = float4_div(one, _a);
-		
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_orx_ni(float4_t _a)
-	{
-		const float4_t zwxy   = float4_swiz_zwxy(_a);
-		const float4_t tmp0   = float4_or(_a, zwxy);
-		const float4_t tmp1   = float4_swiz_yyyy(_a);
-		const float4_t tmp2   = float4_or(tmp0, tmp1);
-		const float4_t mf000  = float4_ild(-1, 0, 0, 0);
-		const float4_t result = float4_and(tmp2, mf000);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_orc_ni(float4_t _a, float4_t _b)
-	{
-		const float4_t aorb   = float4_or(_a, _b);
-		const float4_t mffff  = float4_isplat(-1);
-		const float4_t result = float4_xor(aorb, mffff);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_neg_ni(float4_t _a)
-	{
-		const float4_t zero   = float4_zero();
-		const float4_t result = float4_sub(zero, _a);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_selb_ni(float4_t _mask, float4_t _a, float4_t _b)
-	{
-		const float4_t sel_a  = float4_and(_a, _mask);
-		const float4_t sel_b  = float4_andc(_b, _mask);
-		const float4_t result = float4_or(sel_a, sel_b);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_sels_ni(float4_t _test, float4_t _a, float4_t _b)
-	{
-		const float4_t mask   = float4_sra(_test, 31);
-		const float4_t result = float4_selb(mask, _a, _b);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_not_ni(float4_t _a)
-	{
-		const float4_t mffff  = float4_isplat(-1);
-		const float4_t result = float4_xor(_a, mffff);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_abs_ni(float4_t _a)
-	{
-		const float4_t a_neg  = float4_neg(_a);
-		const float4_t result = float4_max(a_neg, _a);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_clamp_ni(float4_t _a, float4_t _min, float4_t _max)
-	{
-		const float4_t tmp    = float4_min(_a, _max);
-		const float4_t result = float4_max(tmp, _min);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_lerp_ni(float4_t _a, float4_t _b, float4_t _s)
-	{
-		const float4_t ba     = float4_sub(_b, _a);
-		const float4_t result = float4_madd(_s, ba, _a);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_sqrt_nr_ni(float4_t _a)
-	{
-		const float4_t half   = float4_splat(0.5f);
-		const float4_t one    = float4_splat(1.0f);
-		const float4_t zero   = float4_zero();
-		const float4_t tmp0   = float4_rsqrt_est(_a);
-		const float4_t tmp1   = float4_madd(tmp0, _a, zero);
-		const float4_t tmp2   = float4_madd(tmp1, half, zero);
-		const float4_t tmp3   = float4_nmsub(tmp0, tmp1, one);
-		const float4_t result = float4_madd(tmp3, tmp2, tmp1);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_rsqrt_ni(float4_t _a)
-	{
-		const float4_t one    = float4_splat(1.0f);
-		const float4_t sqrt   = float4_sqrt(_a);
-		const float4_t result = float4_div(one, sqrt);
-		
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_rsqrt_nr_ni(float4_t _a)
-	{
-		const float4_t rsqrt           = float4_rsqrt_est(_a);
-		const float4_t iter0           = float4_mul(_a, rsqrt);
-		const float4_t iter1           = float4_mul(iter0, rsqrt);
-		const float4_t half            = float4_splat(0.5f);
-		const float4_t half_rsqrt      = float4_mul(half, rsqrt);
-		const float4_t three           = float4_splat(3.0f);
-		const float4_t three_sub_iter1 = float4_sub(three, iter1);
-		const float4_t result          = float4_mul(half_rsqrt, three_sub_iter1);
-		
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_rsqrt_carmack_ni(float4_t _a)
-	{
-		const float4_t half    = float4_splat(0.5f);
-		const float4_t ah      = float4_mul(half, _a);
-		const float4_t ashift  = float4_sra(_a, 1);
-		const float4_t magic   = float4_isplat(0x5f3759df);
-		const float4_t msuba   = float4_isub(magic, ashift);
-		const float4_t msubasq = float4_mul(msuba, msuba);
-		const float4_t tmp0    = float4_splat(1.5f);
-		const float4_t tmp1    = float4_mul(ah, msubasq);
-		const float4_t tmp2    = float4_sub(tmp0, tmp1);
-		const float4_t result  = float4_mul(msuba, tmp2);
-
-		return result;
-	}
-
-	namespace float4_logexp_detail
-	{
-		BX_FLOAT4_INLINE float4_t float4_poly0(float4_t _a, float _b)
-		{
-			return float4_splat(_b);
-		}
-
-		BX_FLOAT4_INLINE float4_t float4_poly1(float4_t _a, float _b, float _c)
-		{
-			const float4_t bbbb   = float4_splat(_b);
-			const float4_t poly0  = float4_poly0(_a, _c);
-			const float4_t result = float4_madd(poly0, _a, bbbb);
-
-			return result;
-		}
-
-		BX_FLOAT4_INLINE float4_t float4_poly2(float4_t _a, float _b, float _c, float _d)
-		{
-			const float4_t bbbb   = float4_splat(_b);
-			const float4_t poly   = float4_poly1(_a, _c, _d);
-			const float4_t result = float4_madd(poly, _a, bbbb);
-
-			return result;
-		}
-
-		BX_FLOAT4_INLINE float4_t float4_poly3(float4_t _a, float _b, float _c, float _d, float _e)
-		{
-			const float4_t bbbb   = float4_splat(_b);
-			const float4_t poly   = float4_poly2(_a, _c, _d, _e);
-			const float4_t result = float4_madd(poly, _a, bbbb);
-
-			return result;
-		}
-
-		BX_FLOAT4_INLINE float4_t float4_poly4(float4_t _a, float _b, float _c, float _d, float _e, float _f)
-		{
-			const float4_t bbbb   = float4_splat(_b);
-			const float4_t poly   = float4_poly3(_a, _c, _d, _e, _f);
-			const float4_t result = float4_madd(poly, _a, bbbb);
-
-			return result;
-		}
-
-		BX_FLOAT4_INLINE float4_t float4_poly5(float4_t _a, float _b, float _c, float _d, float _e, float _f, float _g)
-		{
-			const float4_t bbbb   = float4_splat(_b);
-			const float4_t poly   = float4_poly4(_a, _c, _d, _e, _f, _g);
-			const float4_t result = float4_madd(poly, _a, bbbb);
-
-			return result;
-		}
-
-		BX_FLOAT4_INLINE float4_t float4_logpoly(float4_t _a)
-		{
-#if 1
-			const float4_t result = float4_poly5(_a
-				, 3.11578814719469302614f, -3.32419399085241980044f
-				, 2.59883907202499966007f, -1.23152682416275988241f
-				, 0.318212422185251071475f, -0.0344359067839062357313f
-				);
-#elif 0
-			const float4_t result = float4_poly4(_a
-				, 2.8882704548164776201f, -2.52074962577807006663f
-				, 1.48116647521213171641f, -0.465725644288844778798f
-				, 0.0596515482674574969533f
-				);
-#elif 0
-			const float4_t result = float4_poly3(_a
-				, 2.61761038894603480148f, -1.75647175389045657003f
-				, 0.688243882994381274313f, -0.107254423828329604454f
-				);
-#else
-			const float4_t result = float4_poly2(_a
-				, 2.28330284476918490682f, -1.04913055217340124191f
-				, 0.204446009836232697516f
-				);
-#endif
-
-			return result;
-		}
-
-		BX_FLOAT4_INLINE float4_t float4_exppoly(float4_t _a)
-		{
-#if 1
-			const float4_t result = float4_poly5(_a
-				, 9.9999994e-1f, 6.9315308e-1f
-				, 2.4015361e-1f, 5.5826318e-2f
-				, 8.9893397e-3f, 1.8775767e-3f
-				);
-#elif 0
-			const float4_t result = float4_poly4(_a
-				, 1.0000026f, 6.9300383e-1f
-				, 2.4144275e-1f, 5.2011464e-2f
-				, 1.3534167e-2f
-				);
-#elif 0
-			const float4_t result = float4_poly3(_a
-				, 9.9992520e-1f, 6.9583356e-1f
-				, 2.2606716e-1f, 7.8024521e-2f
-				);
-#else
-			const float4_t result = float4_poly2(_a
-				, 1.0017247f, 6.5763628e-1f
-				, 3.3718944e-1f
-				);
-#endif // 0
-
-			return result;
-		}
-	} // namespace float4_internal
-
-	BX_FLOAT4_INLINE float4_t float4_log2_ni(float4_t _a)
-	{
-		const float4_t expmask  = float4_isplat(0x7f800000);
-		const float4_t mantmask = float4_isplat(0x007fffff);
-		const float4_t one      = float4_splat(1.0f);
-
-		const float4_t c127     = float4_isplat(127);
-		const float4_t aexp     = float4_and(_a, expmask);
-		const float4_t aexpsr   = float4_srl(aexp, 23);
-		const float4_t tmp0     = float4_isub(aexpsr, c127);
-		const float4_t exp      = float4_itof(tmp0);
-
-		const float4_t amask    = float4_and(_a, mantmask);
-		const float4_t mant     = float4_or(amask, one);
-
-		const float4_t poly     = float4_logexp_detail::float4_logpoly(mant);
-
-		const float4_t mandiff  = float4_sub(mant, one);
-		const float4_t result   = float4_madd(poly, mandiff, exp);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_exp2_ni(float4_t _a)
-	{
-		const float4_t min      = float4_splat( 129.0f);
-		const float4_t max      = float4_splat(-126.99999f);
-		const float4_t tmp0     = float4_min(_a, min);
-		const float4_t aaaa     = float4_max(tmp0, max);
-
-		const float4_t half     = float4_splat(0.5f);
-		const float4_t tmp2     = float4_sub(aaaa, half);
-		const float4_t ipart    = float4_ftoi(tmp2);
-		const float4_t iround   = float4_itof(ipart);
-		const float4_t fpart    = float4_sub(aaaa, iround);
-
-		const float4_t c127     = float4_isplat(127);
-		const float4_t tmp5     = float4_iadd(ipart, c127);
-		const float4_t expipart = float4_sll(tmp5, 23);
-
-		const float4_t expfpart = float4_logexp_detail::float4_exppoly(fpart);
-
-		const float4_t result   = float4_mul(expipart, expfpart);
-		
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_pow_ni(float4_t _a, float4_t _b)
-	{
-		const float4_t alog2  = float4_log2(_a);
-		const float4_t alog2b = float4_mul(alog2, _b);
-		const float4_t result = float4_exp2(alog2b);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_dot3_ni(float4_t _a, float4_t _b)
-	{
-		const float4_t xyzw   = float4_mul(_a, _b);
-		const float4_t xxxx   = float4_swiz_xxxx(xyzw);
-		const float4_t yyyy   = float4_swiz_yyyy(xyzw);
-		const float4_t zzzz   = float4_swiz_zzzz(xyzw);
-		const float4_t tmp1   = float4_add(xxxx, yyyy);
-		const float4_t result = float4_add(zzzz, tmp1);
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_cross3_ni(float4_t _a, float4_t _b)
-	{
-		const float4_t a_yzxw = float4_swiz_yzxw(_a);
-		const float4_t a_zxyw = float4_swiz_zxyw(_a);
-		const float4_t b_zxyw = float4_swiz_zxyw(_b);
-		const float4_t b_yzxw = float4_swiz_yzxw(_b);
-		const float4_t tmp    = float4_mul(a_yzxw, b_zxyw);
-		const float4_t result = float4_nmsub(a_zxyw, b_yzxw, tmp);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_normalize3_ni(float4_t _a)
-	{
-		const float4_t dot3    = float4_dot3(_a, _a);
-		const float4_t invSqrt = float4_rsqrt(dot3);
-		const float4_t result  = float4_mul(_a, invSqrt);
-		
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_dot_ni(float4_t _a, float4_t _b)
-	{
-		const float4_t xyzw   = float4_mul(_a, _b);
-		const float4_t yzwx   = float4_swiz_yzwx(xyzw);
-		const float4_t tmp0   = float4_add(xyzw, yzwx);
-		const float4_t zwxy   = float4_swiz_zwxy(tmp0);
-		const float4_t result = float4_add(tmp0, zwxy);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_ceil_ni(float4_t _a)
-	{
-		const float4_t tmp0   = float4_ftoi(_a);
-		const float4_t tmp1   = float4_itof(tmp0);
-		const float4_t mask   = float4_cmplt(tmp1, _a);
-		const float4_t one    = float4_splat(1.0f);
-		const float4_t tmp2   = float4_and(one, mask);
-		const float4_t result = float4_add(tmp1, tmp2);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_floor_ni(float4_t _a)
-	{
-		const float4_t tmp0   = float4_ftoi(_a);
-		const float4_t tmp1   = float4_itof(tmp0);
-		const float4_t mask   = float4_cmpgt(tmp1, _a);
-		const float4_t one    = float4_splat(1.0f);
-		const float4_t tmp2   = float4_and(one, mask);
-		const float4_t result = float4_sub(tmp1, tmp2);
-
-		return result;
-	}
-
-} // namespace bx
-
-#endif // __BX_FLOAT4_NI_H__
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_FLOAT4_NI_H__
+#define __BX_FLOAT4_NI_H__
+
+namespace bx
+{
+	BX_FLOAT4_INLINE float4_t float4_shuf_xAzC_ni(float4_t _a, float4_t _b) // Generic xAzC shuffle composed from three float4_shuf_* primitives.
+	{
+		const float4_t xAyB   = float4_shuf_xAyB(_a, _b);
+		const float4_t zCwD   = float4_shuf_zCwD(_a, _b);
+		const float4_t result = float4_shuf_xyAB(xAyB, zCwD); // Going by the helper names: the first two lanes of each intermediate, i.e. x, A, z, C.
+		
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_yBwD_ni(float4_t _a, float4_t _b) // Generic yBwD shuffle composed from three float4_shuf_* primitives.
+	{
+		const float4_t xAyB   = float4_shuf_xAyB(_a, _b);
+		const float4_t zCwD   = float4_shuf_zCwD(_a, _b);
+		const float4_t result = float4_shuf_zwCD(xAyB, zCwD); // Going by the helper names: the last two lanes of each intermediate, i.e. y, B, w, D.
+		
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_madd_ni(float4_t _a, float4_t _b, float4_t _c) // Returns _a * _b + _c as a separate float4_mul followed by float4_add.
+	{
+		const float4_t mul    = float4_mul(_a, _b);
+		const float4_t result = float4_add(mul, _c);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_nmsub_ni(float4_t _a, float4_t _b, float4_t _c) // Returns _c - _a * _b as a separate float4_mul followed by float4_sub.
+	{
+		const float4_t mul    = float4_mul(_a, _b);
+		const float4_t result = float4_sub(_c, mul); // Note the operand order: the product is subtracted from _c.
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_div_nr_ni(float4_t _a, float4_t _b) // Approximates _a / _b from a reciprocal estimate of _b plus one correction step.
+	{
+		const float4_t oneish  = float4_isplat(0x3f800001); // Bit pattern one ulp above 1.0f (0x3f800000).
+		const float4_t est     = float4_rcp_est(_b);
+		const float4_t iter0   = float4_mul(_a, est); // First approximation of the quotient.
+		const float4_t tmp1    = float4_nmsub(_b, est, oneish); // Presumably oneish - _b * est (float4_nmsub is resolved outside this function - confirm).
+		const float4_t result  = float4_madd(tmp1, iter0, iter0); // Presumably iter0 + tmp1 * iter0, i.e. the estimate plus its correction.
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_rcp_ni(float4_t _a) // Returns float4_div(1.0f, _a), i.e. the reciprocal through whichever float4_div is selected.
+	{
+		const float4_t one    = float4_splat(1.0f);
+		const float4_t result = float4_div(one, _a);
+		
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_orx_ni(float4_t _a) // OR across: lane x receives x|y|z|w of _a; lanes y, z, w are cleared to zero bits.
+	{
+		const float4_t zwxy   = float4_swiz_zwxy(_a);
+		const float4_t tmp0   = float4_or(_a, zwxy); // (x|z, y|w, z|x, w|y)
+		const float4_t tmp1   = float4_swiz_yyyy(tmp0); // Broadcast y|w. Taking yyyy of _a here (as before) left lane w out of the result.
+		const float4_t tmp2   = float4_or(tmp0, tmp1); // Lane x now holds x|z|y|w.
+		const float4_t mf000  = float4_ild(-1, 0, 0, 0); // All bits set in lane x only.
+		const float4_t result = float4_and(tmp2, mf000);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_orc_ni(float4_t _a, float4_t _b) // OR with complement: returns _a | ~_b on the raw lane bits.
+	{
+		const float4_t mffff  = float4_isplat(-1); // All bits set in every lane.
+		const float4_t notb   = float4_xor(_b, mffff); // ~_b. The previous code inverted (_a | _b) instead, which is NOR rather than or-with-complement.
+		const float4_t result = float4_or(_a, notb);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_neg_ni(float4_t _a) // Negates each lane by computing 0 - _a.
+	{
+		const float4_t zero   = float4_zero();
+		const float4_t result = float4_sub(zero, _a);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_selb_ni(float4_t _mask, float4_t _a, float4_t _b) // Bit select: combines the bits of _a kept by _mask with the bits of _b passed through float4_andc.
+	{
+		const float4_t sel_a  = float4_and(_a, _mask); // Bits of _a where _mask is set.
+		const float4_t sel_b  = float4_andc(_b, _mask); // Presumably _b & ~_mask (float4_andc is defined elsewhere - confirm).
+		const float4_t result = float4_or(sel_a, sel_b);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sels_ni(float4_t _test, float4_t _a, float4_t _b) // Selects between _a and _b per lane using a mask derived from _test.
+	{
+		const float4_t mask   = float4_sra(_test, 31); // Shift right by 31; presumably this replicates the sign bit of _test across the lane - confirm float4_sra is arithmetic on every backend.
+		const float4_t result = float4_selb(mask, _a, _b);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_not_ni(float4_t _a) // Bitwise NOT of _a, done as XOR with an all-ones mask.
+	{
+		const float4_t mffff  = float4_isplat(-1); // -1 converted to the uint32_t parameter, i.e. every bit set.
+		const float4_t result = float4_xor(_a, mffff);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_abs_ni(float4_t _a) // Absolute value computed as float4_max(-_a, _a).
+	{
+		const float4_t a_neg  = float4_neg(_a);
+		const float4_t result = float4_max(a_neg, _a);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_clamp_ni(float4_t _a, float4_t _min, float4_t _max) // Clamps _a: float4_min against _max first, then float4_max against _min.
+	{
+		const float4_t tmp    = float4_min(_a, _max); // Apply the upper bound.
+		const float4_t result = float4_max(tmp, _min); // Apply the lower bound last, so _min wins if the bounds cross.
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_lerp_ni(float4_t _a, float4_t _b, float4_t _s) // Linear interpolation: float4_madd(_s, _b - _a, _a).
+	{
+		const float4_t ba     = float4_sub(_b, _a);
+		const float4_t result = float4_madd(_s, ba, _a); // Presumably _a + _s * (_b - _a), given float4_madd_ni's definition.
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sqrt_nr_ni(float4_t _a) // Approximates sqrt(_a) from float4_rsqrt_est plus one refinement step.
+	{
+		const float4_t half   = float4_splat(0.5f);
+		const float4_t one    = float4_splat(1.0f);
+		const float4_t zero   = float4_zero();
+		const float4_t tmp0   = float4_rsqrt_est(_a); // r ~ 1/sqrt(_a)
+		const float4_t tmp1   = float4_madd(tmp0, _a, zero); // s = r * _a, the initial sqrt approximation
+		const float4_t tmp2   = float4_madd(tmp1, half, zero); // s / 2
+		const float4_t tmp3   = float4_nmsub(tmp0, tmp1, one); // Presumably 1 - r * s, the residual of the estimate.
+		const float4_t result = float4_madd(tmp3, tmp2, tmp1); // s + residual * s / 2
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_rsqrt_ni(float4_t _a) // Reciprocal square root computed as float4_div(1.0f, float4_sqrt(_a)).
+	{
+		const float4_t one    = float4_splat(1.0f);
+		const float4_t sqrt   = float4_sqrt(_a);
+		const float4_t result = float4_div(one, sqrt);
+		
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_rsqrt_nr_ni(float4_t _a) // Refines float4_rsqrt_est(_a) with one step: 0.5 * r * (3 - _a * r * r).
+	{
+		const float4_t rsqrt           = float4_rsqrt_est(_a); // r
+		const float4_t iter0           = float4_mul(_a, rsqrt); // _a * r
+		const float4_t iter1           = float4_mul(iter0, rsqrt); // _a * r * r
+		const float4_t half            = float4_splat(0.5f);
+		const float4_t half_rsqrt      = float4_mul(half, rsqrt); // 0.5 * r
+		const float4_t three           = float4_splat(3.0f);
+		const float4_t three_sub_iter1 = float4_sub(three, iter1); // 3 - _a * r * r
+		const float4_t result          = float4_mul(half_rsqrt, three_sub_iter1);
+		
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_rsqrt_carmack_ni(float4_t _a) // Reciprocal square root via an integer bit trick (magic constant minus shifted bits) plus one refinement step.
+	{
+		const float4_t half    = float4_splat(0.5f);
+		const float4_t ah      = float4_mul(half, _a); // 0.5 * _a
+		const float4_t ashift  = float4_sra(_a, 1); // Bit pattern of _a shifted right by one.
+		const float4_t magic   = float4_isplat(0x5f3759df);
+		const float4_t msuba   = float4_isub(magic, ashift); // Integer subtraction on the raw bits gives the initial guess y.
+		const float4_t msubasq = float4_mul(msuba, msuba); // y * y
+		const float4_t tmp0    = float4_splat(1.5f);
+		const float4_t tmp1    = float4_mul(ah, msubasq); // 0.5 * _a * y * y
+		const float4_t tmp2    = float4_sub(tmp0, tmp1); // 1.5 - 0.5 * _a * y * y
+		const float4_t result  = float4_mul(msuba, tmp2); // y * (1.5 - 0.5 * _a * y * y)
+
+		return result;
+	}
+
+	namespace float4_logexp_detail
+	{
+		BX_FLOAT4_INLINE float4_t float4_poly0(float4_t _a, float _b) // Degree-0 polynomial: broadcasts the constant _b; _a is not used.
+		{
+			return float4_splat(_b);
+		}
+
+		BX_FLOAT4_INLINE float4_t float4_poly1(float4_t _a, float _b, float _c) // Degree-1 polynomial in _a; _b is the constant term and _c the coefficient of _a.
+		{
+			const float4_t bbbb   = float4_splat(_b);
+			const float4_t poly0  = float4_poly0(_a, _c);
+			const float4_t result = float4_madd(poly0, _a, bbbb); // Horner step: presumably poly0 * _a + _b.
+
+			return result;
+		}
+
+		BX_FLOAT4_INLINE float4_t float4_poly2(float4_t _a, float _b, float _c, float _d) // Degree-2 polynomial in _a; coefficients _b, _c, _d run from the constant term upward.
+		{
+			const float4_t bbbb   = float4_splat(_b);
+			const float4_t poly   = float4_poly1(_a, _c, _d);
+			const float4_t result = float4_madd(poly, _a, bbbb); // Horner step: presumably poly * _a + _b.
+
+			return result;
+		}
+
+		BX_FLOAT4_INLINE float4_t float4_poly3(float4_t _a, float _b, float _c, float _d, float _e) // Degree-3 polynomial in _a; coefficients _b.._e run from the constant term upward.
+		{
+			const float4_t bbbb   = float4_splat(_b);
+			const float4_t poly   = float4_poly2(_a, _c, _d, _e);
+			const float4_t result = float4_madd(poly, _a, bbbb); // Horner step: presumably poly * _a + _b.
+
+			return result;
+		}
+
+		BX_FLOAT4_INLINE float4_t float4_poly4(float4_t _a, float _b, float _c, float _d, float _e, float _f) // Degree-4 polynomial in _a; coefficients _b.._f run from the constant term upward.
+		{
+			const float4_t bbbb   = float4_splat(_b);
+			const float4_t poly   = float4_poly3(_a, _c, _d, _e, _f);
+			const float4_t result = float4_madd(poly, _a, bbbb); // Horner step: presumably poly * _a + _b.
+
+			return result;
+		}
+
+		BX_FLOAT4_INLINE float4_t float4_poly5(float4_t _a, float _b, float _c, float _d, float _e, float _f, float _g) // Degree-5 polynomial in _a; coefficients _b.._g run from the constant term upward.
+		{
+			const float4_t bbbb   = float4_splat(_b);
+			const float4_t poly   = float4_poly4(_a, _c, _d, _e, _f, _g);
+			const float4_t result = float4_madd(poly, _a, bbbb); // Horner step: presumably poly * _a + _b.
+
+			return result;
+		}
+
+		BX_FLOAT4_INLINE float4_t float4_logpoly(float4_t _a) // Evaluates the fixed-coefficient polynomial that float4_log2_ni applies to the mantissa.
+		{
+#if 1 // Compile-time choice of coefficient set; the degree-5 variant is the active one.
+			const float4_t result = float4_poly5(_a
+				, 3.11578814719469302614f, -3.32419399085241980044f
+				, 2.59883907202499966007f, -1.23152682416275988241f
+				, 0.318212422185251071475f, -0.0344359067839062357313f
+				);
+#elif 0 // Disabled degree-4 variant.
+			const float4_t result = float4_poly4(_a
+				, 2.8882704548164776201f, -2.52074962577807006663f
+				, 1.48116647521213171641f, -0.465725644288844778798f
+				, 0.0596515482674574969533f
+				);
+#elif 0 // Disabled degree-3 variant.
+			const float4_t result = float4_poly3(_a
+				, 2.61761038894603480148f, -1.75647175389045657003f
+				, 0.688243882994381274313f, -0.107254423828329604454f
+				);
+#else // Unreachable while the first branch is "#if 1": degree-2 variant.
+			const float4_t result = float4_poly2(_a
+				, 2.28330284476918490682f, -1.04913055217340124191f
+				, 0.204446009836232697516f
+				);
+#endif
+
+			return result;
+		}
+
+		BX_FLOAT4_INLINE float4_t float4_exppoly(float4_t _a) // Evaluates the fixed-coefficient polynomial that float4_exp2_ni applies to the fractional part.
+		{
+#if 1 // Compile-time choice of coefficient set; the degree-5 variant is the active one.
+			const float4_t result = float4_poly5(_a
+				, 9.9999994e-1f, 6.9315308e-1f
+				, 2.4015361e-1f, 5.5826318e-2f
+				, 8.9893397e-3f, 1.8775767e-3f
+				);
+#elif 0 // Disabled degree-4 variant.
+			const float4_t result = float4_poly4(_a
+				, 1.0000026f, 6.9300383e-1f
+				, 2.4144275e-1f, 5.2011464e-2f
+				, 1.3534167e-2f
+				);
+#elif 0 // Disabled degree-3 variant.
+			const float4_t result = float4_poly3(_a
+				, 9.9992520e-1f, 6.9583356e-1f
+				, 2.2606716e-1f, 7.8024521e-2f
+				);
+#else // Unreachable while the first branch is "#if 1": degree-2 variant.
+			const float4_t result = float4_poly2(_a
+				, 1.0017247f, 6.5763628e-1f
+				, 3.3718944e-1f
+				);
+#endif // 0
+
+			return result;
+		}
+	} // namespace float4_logexp_detail
+
+	BX_FLOAT4_INLINE float4_t float4_log2_ni(float4_t _a) // Approximates log2(_a) from the float's exponent bits plus a polynomial in its mantissa.
+	{
+		const float4_t expmask  = float4_isplat(0x7f800000); // Exponent field of an IEEE-754 single.
+		const float4_t mantmask = float4_isplat(0x007fffff); // Mantissa field of an IEEE-754 single.
+		const float4_t one      = float4_splat(1.0f);
+
+		const float4_t c127     = float4_isplat(127); // Exponent bias.
+		const float4_t aexp     = float4_and(_a, expmask);
+		const float4_t aexpsr   = float4_srl(aexp, 23); // Move the exponent field down to bit 0.
+		const float4_t tmp0     = float4_isub(aexpsr, c127); // Unbiased exponent as an integer.
+		const float4_t exp      = float4_itof(tmp0); // Presumably an int-to-float conversion (float4_itof is defined elsewhere - confirm).
+
+		const float4_t amask    = float4_and(_a, mantmask);
+		const float4_t mant     = float4_or(amask, one); // Mantissa bits merged with the bits of 1.0f.
+
+		const float4_t poly     = float4_logexp_detail::float4_logpoly(mant);
+
+		const float4_t mandiff  = float4_sub(mant, one);
+		const float4_t result   = float4_madd(poly, mandiff, exp); // poly * (mant - 1) + exp
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_exp2_ni(float4_t _a) // Approximates 2^_a by splitting _a into an integer part (built into the exponent bits) and a fractional part (polynomial).
+	{
+		const float4_t min      = float4_splat( 129.0f); // Despite the name, this is the upper clamp bound (it is the operand of float4_min).
+		const float4_t max      = float4_splat(-126.99999f); // Despite the name, this is the lower clamp bound (it is the operand of float4_max).
+		const float4_t tmp0     = float4_min(_a, min);
+		const float4_t aaaa     = float4_max(tmp0, max); // _a limited to [-126.99999, 129].
+
+		const float4_t half     = float4_splat(0.5f);
+		const float4_t tmp2     = float4_sub(aaaa, half);
+		const float4_t ipart    = float4_ftoi(tmp2); // Presumably a float-to-int conversion (float4_ftoi is defined elsewhere - confirm its rounding mode).
+		const float4_t iround   = float4_itof(ipart);
+		const float4_t fpart    = float4_sub(aaaa, iround); // Remainder handed to the polynomial.
+
+		const float4_t c127     = float4_isplat(127); // Exponent bias.
+		const float4_t tmp5     = float4_iadd(ipart, c127);
+		const float4_t expipart = float4_sll(tmp5, 23); // Biased exponent moved into the exponent field, i.e. 2^ipart as a float.
+
+		const float4_t expfpart = float4_logexp_detail::float4_exppoly(fpart);
+
+		const float4_t result   = float4_mul(expipart, expfpart);
+		
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_pow_ni(float4_t _a, float4_t _b) // Computes _a^_b as float4_exp2(float4_log2(_a) * _b).
+	{
+		const float4_t alog2  = float4_log2(_a);
+		const float4_t alog2b = float4_mul(alog2, _b);
+		const float4_t result = float4_exp2(alog2b);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_dot3_ni(float4_t _a, float4_t _b) // 3-component dot product of _a and _b; the w lanes are not read.
+	{
+		const float4_t xyzw   = float4_mul(_a, _b); // Per-lane products.
+		const float4_t xxxx   = float4_swiz_xxxx(xyzw);
+		const float4_t yyyy   = float4_swiz_yyyy(xyzw);
+		const float4_t zzzz   = float4_swiz_zzzz(xyzw);
+		const float4_t tmp1   = float4_add(xxxx, yyyy);
+		const float4_t result = float4_add(zzzz, tmp1); // Going by the swizzle names, every lane holds x + y + z of the products.
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_cross3_ni(float4_t _a, float4_t _b) // 3-component cross product of _a and _b built from two swizzled multiplies.
+	{
+		const float4_t a_yzxw = float4_swiz_yzxw(_a);
+		const float4_t a_zxyw = float4_swiz_zxyw(_a);
+		const float4_t b_zxyw = float4_swiz_zxyw(_b);
+		const float4_t b_yzxw = float4_swiz_yzxw(_b);
+		const float4_t tmp    = float4_mul(a_yzxw, b_zxyw);
+		const float4_t result = float4_nmsub(a_zxyw, b_yzxw, tmp); // Presumably a_yzxw * b_zxyw - a_zxyw * b_yzxw, given float4_nmsub_ni's definition.
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_normalize3_ni(float4_t _a) // Scales _a by float4_rsqrt(float4_dot3(_a, _a)).
+	{
+		const float4_t dot3    = float4_dot3(_a, _a);
+		const float4_t invSqrt = float4_rsqrt(dot3);
+		const float4_t result  = float4_mul(_a, invSqrt); // Every lane of _a, including w, is multiplied by the same factor.
+		
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_dot_ni(float4_t _a, float4_t _b) // 4-component dot product of _a and _b, reduced with two swizzle-and-add steps.
+	{
+		const float4_t xyzw   = float4_mul(_a, _b); // Per-lane products.
+		const float4_t yzwx   = float4_swiz_yzwx(xyzw);
+		const float4_t tmp0   = float4_add(xyzw, yzwx); // (x+y, y+z, z+w, w+x)
+		const float4_t zwxy   = float4_swiz_zwxy(tmp0);
+		const float4_t result = float4_add(tmp0, zwxy); // Every lane holds x + y + z + w of the products.
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ceil_ni(float4_t _a) // Ceiling via an int round trip: converts, then adds 1.0f where the converted value is below _a.
+	{
+		const float4_t tmp0   = float4_ftoi(_a); // Presumably a float-to-int conversion (defined elsewhere - confirm rounding mode and range).
+		const float4_t tmp1   = float4_itof(tmp0);
+		const float4_t mask   = float4_cmplt(tmp1, _a); // Lanes where the round-tripped value is less than _a.
+		const float4_t one    = float4_splat(1.0f);
+		const float4_t tmp2   = float4_and(one, mask); // 1.0f in the masked lanes, zero bits elsewhere (assuming float4_cmplt yields all-ones/all-zeros lanes).
+		const float4_t result = float4_add(tmp1, tmp2);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_floor_ni(float4_t _a) // Floor via an int round trip: converts, then subtracts 1.0f where the converted value is above _a.
+	{
+		const float4_t tmp0   = float4_ftoi(_a); // Presumably a float-to-int conversion (defined elsewhere - confirm rounding mode and range).
+		const float4_t tmp1   = float4_itof(tmp0);
+		const float4_t mask   = float4_cmpgt(tmp1, _a); // Lanes where the round-tripped value is greater than _a.
+		const float4_t one    = float4_splat(1.0f);
+		const float4_t tmp2   = float4_and(one, mask); // 1.0f in the masked lanes, zero bits elsewhere (assuming float4_cmpgt yields all-ones/all-zeros lanes).
+		const float4_t result = float4_sub(tmp1, tmp2);
+
+		return result;
+	}
+
+} // namespace bx
+
+#endif // __BX_FLOAT4_NI_H__
diff --git a/include/bx/float4_ref.h b/include/bx/float4_ref.h
index e9dde1a..4a9cf04 100644
--- a/include/bx/float4_ref.h
+++ b/include/bx/float4_ref.h
@@ -1,529 +1,529 @@
-/*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
-
-#ifndef __BX_FLOAT4_REF_H__
-#define __BX_FLOAT4_REF_H__
-
-#include <math.h> // sqrtf
-
-namespace bx
-{
-	typedef union float4_t
-	{
-		int32_t  ixyzw[4];
-		uint32_t uxyzw[4];
-		float    fxyzw[4];
-
-	} float4_t;
-
-#define ELEMx 0
-#define ELEMy 1
-#define ELEMz 2
-#define ELEMw 3
-#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
-			BX_FLOAT4_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \
-			{ \
-				float4_t result; \
-				result.ixyzw[0] = _a.ixyzw[ELEM##_x]; \
-				result.ixyzw[1] = _a.ixyzw[ELEM##_y]; \
-				result.ixyzw[2] = _a.ixyzw[ELEM##_z]; \
-				result.ixyzw[3] = _a.ixyzw[ELEM##_w]; \
-				return result; \
-			}
-
-#include "float4_swizzle.inl"
-
-#undef IMPLEMENT_SWIZZLE
-#undef ELEMw
-#undef ELEMz
-#undef ELEMy
-#undef ELEMx
-
-#define IMPLEMENT_TEST(_xyzw, _mask) \
-			BX_FLOAT4_INLINE bool float4_test_any_##_xyzw(float4_t _test) \
-			{ \
-				uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \
-				             | ( (_test.uxyzw[2]>>31)<<2) \
-				             | ( (_test.uxyzw[1]>>31)<<1) \
-				             | (_test.uxyzw[0]>>31) \
-				             ; \
-				return 0 != (tmp&(_mask) ); \
-			} \
-			\
-			BX_FLOAT4_INLINE bool float4_test_all_##_xyzw(float4_t _test) \
-			{ \
-				uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \
-				             | ( (_test.uxyzw[2]>>31)<<2) \
-				             | ( (_test.uxyzw[1]>>31)<<1) \
-				             | (_test.uxyzw[0]>>31) \
-				             ; \
-				return (_mask) == (tmp&(_mask) ); \
-			}
-
-IMPLEMENT_TEST(x    , 0x1);
-IMPLEMENT_TEST(y    , 0x2);
-IMPLEMENT_TEST(xy   , 0x3);
-IMPLEMENT_TEST(z    , 0x4);
-IMPLEMENT_TEST(xz   , 0x5);
-IMPLEMENT_TEST(yz   , 0x6);
-IMPLEMENT_TEST(xyz  , 0x7);
-IMPLEMENT_TEST(w    , 0x8);
-IMPLEMENT_TEST(xw   , 0x9);
-IMPLEMENT_TEST(yw   , 0xa);
-IMPLEMENT_TEST(xyw  , 0xb);
-IMPLEMENT_TEST(zw   , 0xc);
-IMPLEMENT_TEST(xzw  , 0xd);
-IMPLEMENT_TEST(yzw  , 0xe);
-IMPLEMENT_TEST(xyzw , 0xf);
-
-#undef IMPLEMENT_TEST
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.uxyzw[0] = _a.uxyzw[0];
-		result.uxyzw[1] = _a.uxyzw[1];
-		result.uxyzw[2] = _b.uxyzw[0];
-		result.uxyzw[3] = _b.uxyzw[1];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.uxyzw[0] = _b.uxyzw[0];
-		result.uxyzw[1] = _b.uxyzw[1];
-		result.uxyzw[2] = _a.uxyzw[0];
-		result.uxyzw[3] = _a.uxyzw[1];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.uxyzw[0] = _b.uxyzw[2];
-		result.uxyzw[1] = _b.uxyzw[3];
-		result.uxyzw[2] = _a.uxyzw[2];
-		result.uxyzw[3] = _a.uxyzw[3];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.uxyzw[0] = _a.uxyzw[2];
-		result.uxyzw[1] = _a.uxyzw[3];
-		result.uxyzw[2] = _b.uxyzw[2];
-		result.uxyzw[3] = _b.uxyzw[3];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.uxyzw[0] = _a.uxyzw[0];
-		result.uxyzw[1] = _b.uxyzw[0];
-		result.uxyzw[2] = _a.uxyzw[1];
-		result.uxyzw[3] = _b.uxyzw[1];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.uxyzw[0] = _a.uxyzw[1];
-		result.uxyzw[1] = _b.uxyzw[1];
-		result.uxyzw[2] = _a.uxyzw[0];
-		result.uxyzw[3] = _b.uxyzw[0];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.uxyzw[0] = _a.uxyzw[2];
-		result.uxyzw[1] = _b.uxyzw[2];
-		result.uxyzw[2] = _a.uxyzw[3];
-		result.uxyzw[3] = _b.uxyzw[3];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.uxyzw[0] = _b.uxyzw[2];
-		result.uxyzw[1] = _a.uxyzw[2];
-		result.uxyzw[2] = _b.uxyzw[3];
-		result.uxyzw[3] = _a.uxyzw[3];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float float4_x(float4_t _a)
-	{
-		return _a.fxyzw[0];
-	}
-
-	BX_FLOAT4_INLINE float float4_y(float4_t _a)
-	{
-		return _a.fxyzw[1];
-	}
-
-	BX_FLOAT4_INLINE float float4_z(float4_t _a)
-	{
-		return _a.fxyzw[2];
-	}
-
-	BX_FLOAT4_INLINE float float4_w(float4_t _a)
-	{
-		return _a.fxyzw[3];
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_ld(const void* _ptr)
-	{
-		return *reinterpret_cast<const float4_t*>(_ptr);
-	}
-
-	BX_FLOAT4_INLINE void float4_st(void* _ptr, float4_t _a)
-	{
-		*reinterpret_cast<float4_t*>(_ptr) = _a;
-	}
-
-	BX_FLOAT4_INLINE void float4_stream(void* _ptr, float4_t _a)
-	{
-		*reinterpret_cast<float4_t*>(_ptr) = _a;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w)
-	{
-		float4_t result;
-		result.fxyzw[0] = _x;
-		result.fxyzw[1] = _y;
-		result.fxyzw[2] = _z;
-		result.fxyzw[3] = _w;
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
-	{
-		float4_t result;
-		result.uxyzw[0] = _x;
-		result.uxyzw[1] = _y;
-		result.uxyzw[2] = _z;
-		result.uxyzw[3] = _w;
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_splat(const void* _ptr)
-	{
-		float val = *reinterpret_cast<const float*>(_ptr);
-		return float4_ld(val, val, val, val);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_splat(float _a)
-	{
-		return float4_ld(_a, _a, _a, _a);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_isplat(uint32_t _a)
-	{
-		return float4_ild(_a, _a, _a, _a);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_zero()
-	{
-		return float4_ild(0, 0, 0, 0);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_itof(float4_t _a)
-	{
-		float4_t result;
-		result.fxyzw[0] = (float)result.ixyzw[0];
-		result.fxyzw[1] = (float)result.ixyzw[1];
-		result.fxyzw[2] = (float)result.ixyzw[2];
-		result.fxyzw[3] = (float)result.ixyzw[3];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_ftoi(float4_t _a)
-	{
-		float4_t result;
-		result.ixyzw[0] = (int)result.fxyzw[0];
-		result.ixyzw[1] = (int)result.fxyzw[1];
-		result.ixyzw[2] = (int)result.fxyzw[2];
-		result.ixyzw[3] = (int)result.fxyzw[3];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_round(float4_t _a)
-	{
-		const float4_t tmp    = float4_ftoi(_a);
-		const float4_t result = float4_itof(tmp);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_add(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.fxyzw[0] = _a.fxyzw[0] + _b.fxyzw[0];
-		result.fxyzw[1] = _a.fxyzw[1] + _b.fxyzw[1];
-		result.fxyzw[2] = _a.fxyzw[2] + _b.fxyzw[2];
-		result.fxyzw[3] = _a.fxyzw[3] + _b.fxyzw[3];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_sub(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.fxyzw[0] = _a.fxyzw[0] - _b.fxyzw[0];
-		result.fxyzw[1] = _a.fxyzw[1] - _b.fxyzw[1];
-		result.fxyzw[2] = _a.fxyzw[2] - _b.fxyzw[2];
-		result.fxyzw[3] = _a.fxyzw[3] - _b.fxyzw[3];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_mul(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.fxyzw[0] = _a.fxyzw[0] * _b.fxyzw[0];
-		result.fxyzw[1] = _a.fxyzw[1] * _b.fxyzw[1];
-		result.fxyzw[2] = _a.fxyzw[2] * _b.fxyzw[2];
-		result.fxyzw[3] = _a.fxyzw[3] * _b.fxyzw[3];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_div(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.fxyzw[0] = _a.fxyzw[0] * _b.fxyzw[0];
-		result.fxyzw[1] = _a.fxyzw[1] * _b.fxyzw[1];
-		result.fxyzw[2] = _a.fxyzw[2] * _b.fxyzw[2];
-		result.fxyzw[3] = _a.fxyzw[3] * _b.fxyzw[3];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_rcp_est(float4_t _a)
-	{
-		float4_t result;
-		result.fxyzw[0] = 1.0f / _a.fxyzw[0];
-		result.fxyzw[1] = 1.0f / _a.fxyzw[1];
-		result.fxyzw[2] = 1.0f / _a.fxyzw[2];
-		result.fxyzw[3] = 1.0f / _a.fxyzw[3];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_sqrt(float4_t _a)
-	{
-		float4_t result;
-		result.fxyzw[0] = sqrtf(_a.fxyzw[0]);
-		result.fxyzw[1] = sqrtf(_a.fxyzw[1]);
-		result.fxyzw[2] = sqrtf(_a.fxyzw[2]);
-		result.fxyzw[3] = sqrtf(_a.fxyzw[3]);
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_rsqrt_est(float4_t _a)
-	{
-		float4_t result;
-		result.fxyzw[0] = 1.0f / sqrtf(_a.fxyzw[0]);
-		result.fxyzw[1] = 1.0f / sqrtf(_a.fxyzw[1]);
-		result.fxyzw[2] = 1.0f / sqrtf(_a.fxyzw[2]);
-		result.fxyzw[3] = 1.0f / sqrtf(_a.fxyzw[3]);
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_cmpeq(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.ixyzw[0] = _a.fxyzw[0] == _b.fxyzw[0] ? 0xffffffff : 0x0;
-		result.ixyzw[1] = _a.fxyzw[1] == _b.fxyzw[1] ? 0xffffffff : 0x0;
-		result.ixyzw[2] = _a.fxyzw[2] == _b.fxyzw[2] ? 0xffffffff : 0x0;
-		result.ixyzw[3] = _a.fxyzw[3] == _b.fxyzw[3] ? 0xffffffff : 0x0;
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_cmplt(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.ixyzw[0] = _a.fxyzw[0] < _b.fxyzw[0] ? 0xffffffff : 0x0;
-		result.ixyzw[1] = _a.fxyzw[1] < _b.fxyzw[1] ? 0xffffffff : 0x0;
-		result.ixyzw[2] = _a.fxyzw[2] < _b.fxyzw[2] ? 0xffffffff : 0x0;
-		result.ixyzw[3] = _a.fxyzw[3] < _b.fxyzw[3] ? 0xffffffff : 0x0;
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_cmple(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.ixyzw[0] = _a.fxyzw[0] <= _b.fxyzw[0] ? 0xffffffff : 0x0;
-		result.ixyzw[1] = _a.fxyzw[1] <= _b.fxyzw[1] ? 0xffffffff : 0x0;
-		result.ixyzw[2] = _a.fxyzw[2] <= _b.fxyzw[2] ? 0xffffffff : 0x0;
-		result.ixyzw[3] = _a.fxyzw[3] <= _b.fxyzw[3] ? 0xffffffff : 0x0;
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_cmpgt(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.ixyzw[0] = _a.fxyzw[0] > _b.fxyzw[0] ? 0xffffffff : 0x0;
-		result.ixyzw[1] = _a.fxyzw[1] > _b.fxyzw[1] ? 0xffffffff : 0x0;
-		result.ixyzw[2] = _a.fxyzw[2] > _b.fxyzw[2] ? 0xffffffff : 0x0;
-		result.ixyzw[3] = _a.fxyzw[3] > _b.fxyzw[3] ? 0xffffffff : 0x0;
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_cmpge(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.ixyzw[0] = _a.fxyzw[0] >= _b.fxyzw[0] ? 0xffffffff : 0x0;
-		result.ixyzw[1] = _a.fxyzw[1] >= _b.fxyzw[1] ? 0xffffffff : 0x0;
-		result.ixyzw[2] = _a.fxyzw[2] >= _b.fxyzw[2] ? 0xffffffff : 0x0;
-		result.ixyzw[3] = _a.fxyzw[3] >= _b.fxyzw[3] ? 0xffffffff : 0x0;
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_min(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.fxyzw[0] = _a.fxyzw[0] < _b.fxyzw[0] ? _a.fxyzw[0] : _b.fxyzw[0];
-		result.fxyzw[1] = _a.fxyzw[1] < _b.fxyzw[1] ? _a.fxyzw[1] : _b.fxyzw[1];
-		result.fxyzw[2] = _a.fxyzw[2] < _b.fxyzw[2] ? _a.fxyzw[2] : _b.fxyzw[2];
-		result.fxyzw[3] = _a.fxyzw[3] < _b.fxyzw[3] ? _a.fxyzw[3] : _b.fxyzw[3];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_max(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.fxyzw[0] = _a.fxyzw[0] > _b.fxyzw[0] ? _a.fxyzw[0] : _b.fxyzw[0];
-		result.fxyzw[1] = _a.fxyzw[1] > _b.fxyzw[1] ? _a.fxyzw[1] : _b.fxyzw[1];
-		result.fxyzw[2] = _a.fxyzw[2] > _b.fxyzw[2] ? _a.fxyzw[2] : _b.fxyzw[2];
-		result.fxyzw[3] = _a.fxyzw[3] > _b.fxyzw[3] ? _a.fxyzw[3] : _b.fxyzw[3];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_and(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.uxyzw[0] = _a.uxyzw[0] & _b.uxyzw[0];
-		result.uxyzw[1] = _a.uxyzw[1] & _b.uxyzw[1];
-		result.uxyzw[2] = _a.uxyzw[2] & _b.uxyzw[2];
-		result.uxyzw[3] = _a.uxyzw[3] & _b.uxyzw[3];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_andc(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.uxyzw[0] = _a.uxyzw[0] & ~_b.uxyzw[0];
-		result.uxyzw[1] = _a.uxyzw[1] & ~_b.uxyzw[1];
-		result.uxyzw[2] = _a.uxyzw[2] & ~_b.uxyzw[2];
-		result.uxyzw[3] = _a.uxyzw[3] & ~_b.uxyzw[3];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_or(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.uxyzw[0] = _a.uxyzw[0] | _b.uxyzw[0];
-		result.uxyzw[1] = _a.uxyzw[1] | _b.uxyzw[1];
-		result.uxyzw[2] = _a.uxyzw[2] | _b.uxyzw[2];
-		result.uxyzw[3] = _a.uxyzw[3] | _b.uxyzw[3];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_xor(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.uxyzw[0] = _a.uxyzw[0] ^ _b.uxyzw[0];
-		result.uxyzw[1] = _a.uxyzw[1] ^ _b.uxyzw[1];
-		result.uxyzw[2] = _a.uxyzw[2] ^ _b.uxyzw[2];
-		result.uxyzw[3] = _a.uxyzw[3] ^ _b.uxyzw[3];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_sll(float4_t _a, int _count)
-	{
-		float4_t result;
-		result.uxyzw[0] = _a.uxyzw[0] << _count;
-		result.uxyzw[1] = _a.uxyzw[1] << _count;
-		result.uxyzw[2] = _a.uxyzw[2] << _count;
-		result.uxyzw[3] = _a.uxyzw[3] << _count;
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_srl(float4_t _a, int _count)
-	{
-		float4_t result;
-		result.uxyzw[0] = _a.uxyzw[0] >> _count;
-		result.uxyzw[1] = _a.uxyzw[1] >> _count;
-		result.uxyzw[2] = _a.uxyzw[2] >> _count;
-		result.uxyzw[3] = _a.uxyzw[3] >> _count;
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_sra(float4_t _a, int _count)
-	{
-		float4_t result;
-		result.ixyzw[0] = _a.ixyzw[0] >> _count;
-		result.ixyzw[1] = _a.ixyzw[1] >> _count;
-		result.ixyzw[2] = _a.ixyzw[2] >> _count;
-		result.ixyzw[3] = _a.ixyzw[3] >> _count;
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_iadd(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.ixyzw[0] = _a.ixyzw[0] + _b.ixyzw[0];
-		result.ixyzw[1] = _a.ixyzw[1] + _b.ixyzw[1];
-		result.ixyzw[2] = _a.ixyzw[2] + _b.ixyzw[2];
-		result.ixyzw[3] = _a.ixyzw[3] + _b.ixyzw[3];
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_isub(float4_t _a, float4_t _b)
-	{
-		float4_t result;
-		result.ixyzw[0] = _a.ixyzw[0] - _b.ixyzw[0];
-		result.ixyzw[1] = _a.ixyzw[1] - _b.ixyzw[1];
-		result.ixyzw[2] = _a.ixyzw[2] - _b.ixyzw[2];
-		result.ixyzw[3] = _a.ixyzw[3] - _b.ixyzw[3];
-		return result;
-	}
-
-} // namespace bx
-
-#define float4_shuf_xAzC float4_shuf_xAzC_ni
-#define float4_shuf_yBwD float4_shuf_yBwD_ni
-#define float4_rcp float4_rcp_ni
-#define float4_orx float4_orx_ni
-#define float4_orc float4_orc_ni
-#define float4_neg float4_neg_ni
-#define float4_madd float4_madd_ni
-#define float4_nmsub float4_nmsub_ni
-#define float4_div_nr float4_div_nr_ni
-#define float4_selb float4_selb_ni
-#define float4_sels float4_sels_ni
-#define float4_not float4_not_ni
-#define float4_abs float4_abs_ni
-#define float4_clamp float4_clamp_ni
-#define float4_lerp float4_lerp_ni
-#define float4_rsqrt float4_rsqrt_ni
-#define float4_rsqrt_nr float4_rsqrt_nr_ni
-#define float4_rsqrt_carmack float4_rsqrt_carmack_ni
-#define float4_sqrt_nr float4_sqrt_nr_ni
-#define float4_log2 float4_log2_ni
-#define float4_exp2 float4_exp2_ni
-#define float4_pow float4_pow_ni
-#define float4_cross3 float4_cross3_ni
-#define float4_normalize3 float4_normalize3_ni
-#define float4_dot3 float4_dot3_ni
-#define float4_dot float4_dot_ni
-#define float4_ceil float4_ceil_ni
-#define float4_floor float4_floor_ni
-#include "float4_ni.h"
-
-#endif // __BX_FLOAT4_REF_H__
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_FLOAT4_REF_H__
+#define __BX_FLOAT4_REF_H__
+
+#include <math.h> // sqrtf
+
+namespace bx
+{
+	typedef union float4_t // 4-lane vector; the three arrays below alias the same storage
+	{
+		int32_t  ixyzw[4]; // lanes viewed as signed 32-bit integers
+		uint32_t uxyzw[4]; // lanes viewed as unsigned 32-bit integers
+		float    fxyzw[4]; // lanes viewed as floats
+
+	} float4_t;
+
+#define ELEMx 0 // ELEMx..ELEMw: lane index of each swizzle letter, reached below via ELEM##_x pasting
+#define ELEMy 1
+#define ELEMz 2
+#define ELEMw 3
+#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
+			BX_FLOAT4_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \
+			{ \
+				float4_t result; \
+				result.ixyzw[0] = _a.ixyzw[ELEM##_x]; \
+				result.ixyzw[1] = _a.ixyzw[ELEM##_y]; \
+				result.ixyzw[2] = _a.ixyzw[ELEM##_z]; \
+				result.ixyzw[3] = _a.ixyzw[ELEM##_w]; \
+				return result; \
+			}
+// IMPLEMENT_SWIZZLE(a, b, c, d) defines float4_swiz_abcd: result lanes 0..3 = input lanes a, b, c, d.
+#include "float4_swizzle.inl" // presumably invokes IMPLEMENT_SWIZZLE once per swizzle; contents not visible here
+
+#undef IMPLEMENT_SWIZZLE
+#undef ELEMw
+#undef ELEMz
+#undef ELEMy
+#undef ELEMx
+
+#define IMPLEMENT_TEST(_xyzw, _mask) \
+			BX_FLOAT4_INLINE bool float4_test_any_##_xyzw(float4_t _test) \
+			{ \
+				uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \
+				             | ( (_test.uxyzw[2]>>31)<<2) \
+				             | ( (_test.uxyzw[1]>>31)<<1) \
+				             | (_test.uxyzw[0]>>31) \
+				             ; \
+				return 0 != (tmp&(_mask) ); \
+			} \
+			\
+			BX_FLOAT4_INLINE bool float4_test_all_##_xyzw(float4_t _test) \
+			{ \
+				uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \
+				             | ( (_test.uxyzw[2]>>31)<<2) \
+				             | ( (_test.uxyzw[1]>>31)<<1) \
+				             | (_test.uxyzw[0]>>31) \
+				             ; \
+				return (_mask) == (tmp&(_mask) ); \
+			}
+// Packs bit 31 of lanes x,y,z,w into bits 0..3; test_any: some selected bit set, test_all: every selected bit set.
+IMPLEMENT_TEST(x    , 0x1);
+IMPLEMENT_TEST(y    , 0x2);
+IMPLEMENT_TEST(xy   , 0x3);
+IMPLEMENT_TEST(z    , 0x4);
+IMPLEMENT_TEST(xz   , 0x5);
+IMPLEMENT_TEST(yz   , 0x6);
+IMPLEMENT_TEST(xyz  , 0x7);
+IMPLEMENT_TEST(w    , 0x8);
+IMPLEMENT_TEST(xw   , 0x9);
+IMPLEMENT_TEST(yw   , 0xa);
+IMPLEMENT_TEST(xyw  , 0xb);
+IMPLEMENT_TEST(zw   , 0xc);
+IMPLEMENT_TEST(xzw  , 0xd);
+IMPLEMENT_TEST(yzw  , 0xe);
+IMPLEMENT_TEST(xyzw , 0xf);
+
+#undef IMPLEMENT_TEST
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b)
+	{
+		float4_t mixed; // lanes: _a[0], _a[1], _b[0], _b[1]
+		mixed.uxyzw[3] = _b.uxyzw[1];
+		mixed.uxyzw[2] = _b.uxyzw[0];
+		mixed.uxyzw[1] = _a.uxyzw[1];
+		mixed.uxyzw[0] = _a.uxyzw[0];
+		return mixed;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b)
+	{
+		float4_t mixed; // lanes: _b[0], _b[1], _a[0], _a[1]
+		mixed.uxyzw[3] = _a.uxyzw[1];
+		mixed.uxyzw[2] = _a.uxyzw[0];
+		mixed.uxyzw[1] = _b.uxyzw[1];
+		mixed.uxyzw[0] = _b.uxyzw[0];
+		return mixed;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b)
+	{
+		float4_t mixed; // lanes: _b[2], _b[3], _a[2], _a[3]
+		mixed.uxyzw[3] = _a.uxyzw[3];
+		mixed.uxyzw[2] = _a.uxyzw[2];
+		mixed.uxyzw[1] = _b.uxyzw[3];
+		mixed.uxyzw[0] = _b.uxyzw[2];
+		return mixed;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b)
+	{
+		float4_t mixed; // lanes: _a[2], _a[3], _b[2], _b[3]
+		mixed.uxyzw[3] = _b.uxyzw[3];
+		mixed.uxyzw[2] = _b.uxyzw[2];
+		mixed.uxyzw[1] = _a.uxyzw[3];
+		mixed.uxyzw[0] = _a.uxyzw[2];
+		return mixed;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b)
+	{
+		float4_t mixed; // lanes: _a[0], _b[0], _a[1], _b[1]
+		mixed.uxyzw[3] = _b.uxyzw[1];
+		mixed.uxyzw[2] = _a.uxyzw[1];
+		mixed.uxyzw[1] = _b.uxyzw[0];
+		mixed.uxyzw[0] = _a.uxyzw[0];
+		return mixed;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b)
+	{
+		float4_t mixed; // lanes: _a[1], _b[1], _a[0], _b[0]
+		mixed.uxyzw[3] = _b.uxyzw[0];
+		mixed.uxyzw[2] = _a.uxyzw[0];
+		mixed.uxyzw[1] = _b.uxyzw[1];
+		mixed.uxyzw[0] = _a.uxyzw[1];
+		return mixed;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b)
+	{
+		float4_t mixed; // lanes: _a[2], _b[2], _a[3], _b[3]
+		mixed.uxyzw[3] = _b.uxyzw[3];
+		mixed.uxyzw[2] = _a.uxyzw[3];
+		mixed.uxyzw[1] = _b.uxyzw[2];
+		mixed.uxyzw[0] = _a.uxyzw[2];
+		return mixed;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b)
+	{
+		float4_t mixed; // lanes: _b[2], _a[2], _b[3], _a[3]
+		mixed.uxyzw[3] = _a.uxyzw[3];
+		mixed.uxyzw[2] = _b.uxyzw[3];
+		mixed.uxyzw[1] = _a.uxyzw[2];
+		mixed.uxyzw[0] = _b.uxyzw[2];
+		return mixed;
+	}
+
+	BX_FLOAT4_INLINE float float4_x(float4_t _a)
+	{
+		return _a.fxyzw[0]; // x is lane 0, read as float
+	}
+
+	BX_FLOAT4_INLINE float float4_y(float4_t _a)
+	{
+		return _a.fxyzw[1]; // y is lane 1, read as float
+	}
+
+	BX_FLOAT4_INLINE float float4_z(float4_t _a)
+	{
+		return _a.fxyzw[2]; // z is lane 2, read as float
+	}
+
+	BX_FLOAT4_INLINE float float4_w(float4_t _a)
+	{
+		return _a.fxyzw[3]; // w is lane 3, read as float
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ld(const void* _ptr)
+	{
+		return *reinterpret_cast<const float4_t*>(_ptr); // reads one whole float4_t starting at _ptr
+	}
+
+	BX_FLOAT4_INLINE void float4_st(void* _ptr, float4_t _a)
+	{
+		*reinterpret_cast<float4_t*>(_ptr) = _a; // writes one whole float4_t starting at _ptr
+	}
+
+	BX_FLOAT4_INLINE void float4_stream(void* _ptr, float4_t _a)
+	{
+		*reinterpret_cast<float4_t*>(_ptr) = _a; // plain assignment through the cast pointer, same statement as float4_st
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w)
+	{
+		float4_t packed; // float lanes 0..3 = _x, _y, _z, _w
+		packed.fxyzw[3] = _w;
+		packed.fxyzw[2] = _z;
+		packed.fxyzw[1] = _y;
+		packed.fxyzw[0] = _x;
+		return packed;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
+	{
+		float4_t packed; // unsigned integer lanes 0..3 = _x, _y, _z, _w (raw bits, no conversion)
+		packed.uxyzw[3] = _w;
+		packed.uxyzw[2] = _z;
+		packed.uxyzw[1] = _y;
+		packed.uxyzw[0] = _x;
+		return packed;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_splat(const void* _ptr)
+	{
+		const float lane = *static_cast<const float*>(_ptr); // one float read from _ptr
+		return float4_ld(lane, lane, lane, lane);            // replicated into all four lanes
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_splat(float _a)
+	{
+		return float4_ld(_a, _a, _a, _a); // same float in all four lanes
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_isplat(uint32_t _a)
+	{
+		return float4_ild(_a, _a, _a, _a); // same 32-bit pattern in all four lanes
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_zero()
+	{
+		return float4_isplat(0); // all four lanes hold the all-zero bit pattern
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_itof(float4_t _a)
+	{
+		float4_t result; // each lane: the signed 32-bit integer lane of _a converted to float
+		result.fxyzw[0] = static_cast<float>(_a.ixyzw[0]); // source is _a; reading result here used an uninitialized value
+		result.fxyzw[1] = static_cast<float>(_a.ixyzw[1]);
+		result.fxyzw[2] = static_cast<float>(_a.ixyzw[2]);
+		result.fxyzw[3] = static_cast<float>(_a.ixyzw[3]);
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ftoi(float4_t _a)
+	{
+		float4_t result; // each lane: the float lane of _a converted to int (the cast drops the fraction)
+		result.ixyzw[0] = static_cast<int>(_a.fxyzw[0]); // source is _a; reading result here used an uninitialized value
+		result.ixyzw[1] = static_cast<int>(_a.fxyzw[1]);
+		result.ixyzw[2] = static_cast<int>(_a.fxyzw[2]);
+		result.ixyzw[3] = static_cast<int>(_a.fxyzw[3]);
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_round(float4_t _a)
+	{
+		// Lane-wise round trip: float4_ftoi followed by float4_itof.
+		const float4_t asInt   = float4_ftoi(_a);
+		const float4_t asFloat = float4_itof(asInt);
+		return asFloat;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_add(float4_t _a, float4_t _b)
+	{
+		float4_t sum; // per-lane float _a + _b
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			sum.fxyzw[ii] = _a.fxyzw[ii] + _b.fxyzw[ii];
+		}
+		return sum;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sub(float4_t _a, float4_t _b)
+	{
+		float4_t diff; // per-lane float _a - _b
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			diff.fxyzw[ii] = _a.fxyzw[ii] - _b.fxyzw[ii];
+		}
+		return diff;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_mul(float4_t _a, float4_t _b)
+	{
+		float4_t prod; // per-lane float _a * _b
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			prod.fxyzw[ii] = _a.fxyzw[ii] * _b.fxyzw[ii];
+		}
+		return prod;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_div(float4_t _a, float4_t _b)
+	{
+		float4_t result; // each lane: float lane of _a divided by the matching lane of _b
+		result.fxyzw[0] = _a.fxyzw[0] / _b.fxyzw[0]; // '/' here; the body previously multiplied like float4_mul
+		result.fxyzw[1] = _a.fxyzw[1] / _b.fxyzw[1];
+		result.fxyzw[2] = _a.fxyzw[2] / _b.fxyzw[2];
+		result.fxyzw[3] = _a.fxyzw[3] / _b.fxyzw[3];
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_rcp_est(float4_t _a)
+	{
+		float4_t recip; // per-lane 1.0f / _a, computed with a full division here
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			recip.fxyzw[ii] = 1.0f / _a.fxyzw[ii];
+		}
+		return recip;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sqrt(float4_t _a)
+	{
+		float4_t root; // per-lane sqrtf(_a)
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			root.fxyzw[ii] = sqrtf(_a.fxyzw[ii]);
+		}
+		return root;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_rsqrt_est(float4_t _a)
+	{
+		float4_t invRoot; // per-lane 1.0f / sqrtf(_a)
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			invRoot.fxyzw[ii] = 1.0f / sqrtf(_a.fxyzw[ii]);
+		}
+		return invRoot;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_cmpeq(float4_t _a, float4_t _b)
+	{
+		float4_t mask; // lane = 0xffffffff where _a == _b (as floats), else 0
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			mask.ixyzw[ii] = _a.fxyzw[ii] == _b.fxyzw[ii] ? 0xffffffff : 0x0;
+		}
+		return mask;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_cmplt(float4_t _a, float4_t _b)
+	{
+		float4_t mask; // lane = 0xffffffff where _a < _b (as floats), else 0
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			mask.ixyzw[ii] = _a.fxyzw[ii] < _b.fxyzw[ii] ? 0xffffffff : 0x0;
+		}
+		return mask;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_cmple(float4_t _a, float4_t _b)
+	{
+		float4_t mask; // lane = 0xffffffff where _a <= _b (as floats), else 0
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			mask.ixyzw[ii] = _a.fxyzw[ii] <= _b.fxyzw[ii] ? 0xffffffff : 0x0;
+		}
+		return mask;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_cmpgt(float4_t _a, float4_t _b)
+	{
+		float4_t mask; // lane = 0xffffffff where _a > _b (as floats), else 0
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			mask.ixyzw[ii] = _a.fxyzw[ii] > _b.fxyzw[ii] ? 0xffffffff : 0x0;
+		}
+		return mask;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_cmpge(float4_t _a, float4_t _b)
+	{
+		float4_t mask; // lane = 0xffffffff where _a >= _b (as floats), else 0
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			mask.ixyzw[ii] = _a.fxyzw[ii] >= _b.fxyzw[ii] ? 0xffffffff : 0x0;
+		}
+		return mask;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_min(float4_t _a, float4_t _b)
+	{
+		float4_t lowest; // per lane: _a when _a < _b, otherwise _b
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			lowest.fxyzw[ii] = _a.fxyzw[ii] < _b.fxyzw[ii] ? _a.fxyzw[ii] : _b.fxyzw[ii];
+		}
+		return lowest;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_max(float4_t _a, float4_t _b)
+	{
+		float4_t highest; // per lane: _a when _a > _b, otherwise _b
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			highest.fxyzw[ii] = _a.fxyzw[ii] > _b.fxyzw[ii] ? _a.fxyzw[ii] : _b.fxyzw[ii];
+		}
+		return highest;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_and(float4_t _a, float4_t _b)
+	{
+		float4_t bits; // per-lane bitwise _a & _b
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			bits.uxyzw[ii] = _a.uxyzw[ii] & _b.uxyzw[ii];
+		}
+		return bits;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_andc(float4_t _a, float4_t _b)
+	{
+		float4_t bits; // per-lane bitwise _a & ~_b
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			bits.uxyzw[ii] = _a.uxyzw[ii] & ~_b.uxyzw[ii];
+		}
+		return bits;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_or(float4_t _a, float4_t _b)
+	{
+		float4_t bits; // per-lane bitwise _a | _b
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			bits.uxyzw[ii] = _a.uxyzw[ii] | _b.uxyzw[ii];
+		}
+		return bits;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_xor(float4_t _a, float4_t _b)
+	{
+		float4_t bits; // per-lane bitwise _a ^ _b
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			bits.uxyzw[ii] = _a.uxyzw[ii] ^ _b.uxyzw[ii];
+		}
+		return bits;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sll(float4_t _a, int _count)
+	{
+		float4_t shifted; // per-lane unsigned left shift by _count
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			shifted.uxyzw[ii] = _a.uxyzw[ii] << _count;
+		}
+		return shifted;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_srl(float4_t _a, int _count)
+	{
+		float4_t shifted; // per-lane right shift of the unsigned view by _count
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			shifted.uxyzw[ii] = _a.uxyzw[ii] >> _count;
+		}
+		return shifted;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sra(float4_t _a, int _count)
+	{
+		float4_t shifted; // per-lane right shift of the signed view by _count
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			shifted.ixyzw[ii] = _a.ixyzw[ii] >> _count;
+		}
+		return shifted;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_iadd(float4_t _a, float4_t _b)
+	{
+		float4_t sum; // per-lane signed 32-bit integer _a + _b
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			sum.ixyzw[ii] = _a.ixyzw[ii] + _b.ixyzw[ii];
+		}
+		return sum;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_isub(float4_t _a, float4_t _b)
+	{
+		float4_t diff; // per-lane signed 32-bit integer _a - _b
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			diff.ixyzw[ii] = _a.ixyzw[ii] - _b.ixyzw[ii];
+		}
+		return diff;
+	}
+
+} // namespace bx
+
+#define float4_shuf_xAzC float4_shuf_xAzC_ni // from here down: each public name not defined above is aliased to its *_ni counterpart
+#define float4_shuf_yBwD float4_shuf_yBwD_ni
+#define float4_rcp float4_rcp_ni
+#define float4_orx float4_orx_ni
+#define float4_orc float4_orc_ni
+#define float4_neg float4_neg_ni
+#define float4_madd float4_madd_ni
+#define float4_nmsub float4_nmsub_ni
+#define float4_div_nr float4_div_nr_ni
+#define float4_selb float4_selb_ni
+#define float4_sels float4_sels_ni
+#define float4_not float4_not_ni
+#define float4_abs float4_abs_ni
+#define float4_clamp float4_clamp_ni
+#define float4_lerp float4_lerp_ni
+#define float4_rsqrt float4_rsqrt_ni
+#define float4_rsqrt_nr float4_rsqrt_nr_ni
+#define float4_rsqrt_carmack float4_rsqrt_carmack_ni
+#define float4_sqrt_nr float4_sqrt_nr_ni
+#define float4_log2 float4_log2_ni
+#define float4_exp2 float4_exp2_ni
+#define float4_pow float4_pow_ni
+#define float4_cross3 float4_cross3_ni
+#define float4_normalize3 float4_normalize3_ni
+#define float4_dot3 float4_dot3_ni
+#define float4_dot float4_dot_ni
+#define float4_ceil float4_ceil_ni
+#define float4_floor float4_floor_ni
+#include "float4_ni.h" // supplies the *_ni definitions; included after the aliases so they are already in effect
+
+#endif // __BX_FLOAT4_REF_H__
diff --git a/include/bx/float4_sse.h b/include/bx/float4_sse.h
index 7936298..4e33781 100644
--- a/include/bx/float4_sse.h
+++ b/include/bx/float4_sse.h
@@ -1,401 +1,401 @@
-/*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
-
-#ifndef __BX_FLOAT4_SSE_H__
-#define __BX_FLOAT4_SSE_H__
-
-#include <emmintrin.h> // __m128i
-#if defined(__SSE4_1__)
-#	include <smmintrin.h>
-#endif // defined(__SSE4_1__)
-#include <xmmintrin.h> // __m128
-
-namespace bx
-{
-
-	typedef __m128 float4_t;
-
-#define ELEMx 0
-#define ELEMy 1
-#define ELEMz 2
-#define ELEMw 3
-#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
-			BX_FLOAT4_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \
-			{ \
-				return _mm_shuffle_ps( _a, _a, _MM_SHUFFLE(ELEM##_w, ELEM##_z, ELEM##_y, ELEM##_x ) ); \
-			}
-
-#include "float4_swizzle.inl"
-
-#undef IMPLEMENT_SWIZZLE
-#undef ELEMw
-#undef ELEMz
-#undef ELEMy
-#undef ELEMx
-
-#define IMPLEMENT_TEST(_xyzw, _mask) \
-			BX_FLOAT4_INLINE bool float4_test_any_##_xyzw(float4_t _test) \
-			{ \
-				return 0x0 != (_mm_movemask_ps(_test)&(_mask) ); \
-			} \
-			\
-			BX_FLOAT4_INLINE bool float4_test_all_##_xyzw(float4_t _test) \
-			{ \
-				return (_mask) == (_mm_movemask_ps(_test)&(_mask) ); \
-			}
-
-IMPLEMENT_TEST(x    , 0x1);
-IMPLEMENT_TEST(y    , 0x2);
-IMPLEMENT_TEST(xy   , 0x3);
-IMPLEMENT_TEST(z    , 0x4);
-IMPLEMENT_TEST(xz   , 0x5);
-IMPLEMENT_TEST(yz   , 0x6);
-IMPLEMENT_TEST(xyz  , 0x7);
-IMPLEMENT_TEST(w    , 0x8);
-IMPLEMENT_TEST(xw   , 0x9);
-IMPLEMENT_TEST(yw   , 0xa);
-IMPLEMENT_TEST(xyw  , 0xb);
-IMPLEMENT_TEST(zw   , 0xc);
-IMPLEMENT_TEST(xzw  , 0xd);
-IMPLEMENT_TEST(yzw  , 0xe);
-IMPLEMENT_TEST(xyzw , 0xf);
-
-#undef IMPLEMENT_TEST
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b)
-	{
-		return _mm_movelh_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b)
-	{
-		return _mm_movelh_ps(_b, _a);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b)
-	{
-		return _mm_movehl_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b)
-	{
-		return _mm_movehl_ps(_b, _a);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b)
-	{
-		return _mm_unpacklo_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b)
-	{
-		return _mm_unpacklo_ps(_b, _a);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b)
-	{
-		return _mm_unpackhi_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b)
-	{
-		return _mm_unpackhi_ps(_b, _a);
-	}
-
-	BX_FLOAT4_INLINE float float4_x(float4_t _a)
-	{
-		return _mm_cvtss_f32(_a);
-	}
-
-	BX_FLOAT4_INLINE float float4_y(float4_t _a)
-	{
-		const float4_t yyyy = float4_swiz_yyyy(_a);
-		const float result  = _mm_cvtss_f32(yyyy);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float float4_z(float4_t _a)
-	{
-		const float4_t zzzz = float4_swiz_zzzz(_a);
-		const float result  = _mm_cvtss_f32(zzzz);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float float4_w(float4_t _a)
-	{
-		const float4_t wwww = float4_swiz_wwww(_a);
-		const float result  = _mm_cvtss_f32(wwww);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_ld(const void* _ptr)
-	{
-		return _mm_load_ps(reinterpret_cast<const float*>(_ptr) );
-	}
-
-	BX_FLOAT4_INLINE void float4_st(void* _ptr, float4_t _a)
-	{
-		_mm_store_ps(reinterpret_cast<float*>(_ptr), _a);
-	}
-
-	BX_FLOAT4_INLINE void float4_stream(void* _ptr, float4_t _a)
-	{
-		_mm_stream_ps(reinterpret_cast<float*>(_ptr), _a);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w)
-	{
-		return _mm_set_ps(_w, _z, _y, _x);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
-	{
-		const __m128i set     = _mm_set_epi32(_w, _z, _y, _x);
-		const float4_t result = _mm_castsi128_ps(set);
-		
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_splat(const void* _ptr)
-	{
-		const float4_t x___   = _mm_load_ss(reinterpret_cast<const float*>(_ptr) );
-		const float4_t result = float4_swiz_xxxx(x___);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_splat(float _a)
-	{
-		return _mm_set1_ps(_a);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_isplat(uint32_t _a)
-	{
-		const __m128i splat   = _mm_set1_epi32(_a);
-		const float4_t result = _mm_castsi128_ps(splat);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_zero()
-	{
-		return _mm_setzero_ps();
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_itof(float4_t _a)
-	{
-		const __m128i  itof   = _mm_castps_si128(_a);
-		const float4_t result = _mm_cvtepi32_ps(itof);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_ftoi(float4_t _a)
-	{
-		const __m128i ftoi    = _mm_cvtps_epi32(_a);
-		const float4_t result = _mm_castsi128_ps(ftoi);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_round(float4_t _a)
-	{
-#if defined(__SSE4_1__)
-		return _mm_round_ps(_a, _MM_FROUND_NINT);
-#else
-		const __m128i round   = _mm_cvtps_epi32(_a);
-		const float4_t result = _mm_cvtepi32_ps(round);
-
-		return result;
-#endif // defined(__SSE4_1__)
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_add(float4_t _a, float4_t _b)
-	{
-		return _mm_add_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_sub(float4_t _a, float4_t _b)
-	{
-		return _mm_sub_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_mul(float4_t _a, float4_t _b)
-	{
-		return _mm_mul_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_div(float4_t _a, float4_t _b)
-	{
-		return _mm_div_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_rcp_est(float4_t _a)
-	{
-		return _mm_rcp_ps(_a);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_sqrt(float4_t _a)
-	{
-		return _mm_sqrt_ps(_a);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_rsqrt_est(float4_t _a)
-	{
-		return _mm_rsqrt_ps(_a);
-	}
-
-#if defined(__SSE4_1__)
-	BX_FLOAT4_INLINE float4_t float4_dot3(float4_t _a, float4_t _b)
-	{
-		return _mm_dp_ps(_a, _b, 0x77);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_dot(float4_t _a, float4_t _b)
-	{
-		return _mm_dp_ps(_a, _b, 0xFF);
-	}
-#endif // defined(__SSE4__)
-
-	BX_FLOAT4_INLINE float4_t float4_cmpeq(float4_t _a, float4_t _b)
-	{
-		return _mm_cmpeq_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_cmplt(float4_t _a, float4_t _b)
-	{
-		return _mm_cmplt_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_cmple(float4_t _a, float4_t _b)
-	{
-		return _mm_cmple_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_cmpgt(float4_t _a, float4_t _b)
-	{
-		return _mm_cmpgt_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_cmpge(float4_t _a, float4_t _b)
-	{
-		return _mm_cmpge_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_min(float4_t _a, float4_t _b)
-	{
-		return _mm_min_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_max(float4_t _a, float4_t _b)
-	{
-		return _mm_max_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_and(float4_t _a, float4_t _b)
-	{
-		return _mm_and_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_andc(float4_t _a, float4_t _b)
-	{
-		return _mm_andnot_ps(_b, _a);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_or(float4_t _a, float4_t _b)
-	{
-		return _mm_or_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_xor(float4_t _a, float4_t _b)
-	{
-		return _mm_xor_ps(_a, _b);
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_sll(float4_t _a, int _count)
-	{
-		const __m128i a       = _mm_castps_si128(_a);
-		const __m128i shift   = _mm_slli_epi32(a, _count);
-		const float4_t result = _mm_castsi128_ps(shift);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_srl(float4_t _a, int _count)
-	{
-		const __m128i a       = _mm_castps_si128(_a);
-		const __m128i shift   = _mm_srli_epi32(a, _count);
-		const float4_t result = _mm_castsi128_ps(shift);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_sra(float4_t _a, int _count)
-	{
-		const __m128i a       = _mm_castps_si128(_a);
-		const __m128i shift   = _mm_srai_epi32(a, _count);
-		const float4_t result = _mm_castsi128_ps(shift);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_iadd(float4_t _a, float4_t _b)
-	{
-		const __m128i a       = _mm_castps_si128(_a);
-		const __m128i b       = _mm_castps_si128(_b);
-		const __m128i add     = _mm_add_epi32(a, b);
-		const float4_t result = _mm_castsi128_ps(add);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_isub(float4_t _a, float4_t _b)
-	{
-		const __m128i a       = _mm_castps_si128(_a);
-		const __m128i b       = _mm_castps_si128(_b);
-		const __m128i sub     = _mm_sub_epi32(a, b);
-		const float4_t result = _mm_castsi128_ps(sub);
-
-		return result;
-	}
-
-} // namespace bx
-
-#define float4_shuf_xAzC float4_shuf_xAzC_ni
-#define float4_shuf_yBwD float4_shuf_yBwD_ni
-#define float4_rcp float4_rcp_ni
-#define float4_orx float4_orx_ni
-#define float4_orc float4_orc_ni
-#define float4_neg float4_neg_ni
-#define float4_madd float4_madd_ni
-#define float4_nmsub float4_nmsub_ni
-#define float4_div_nr float4_div_nr_ni
-#define float4_selb float4_selb_ni
-#define float4_sels float4_sels_ni
-#define float4_not float4_not_ni
-#define float4_abs float4_abs_ni
-#define float4_clamp float4_clamp_ni
-#define float4_lerp float4_lerp_ni
-#define float4_rsqrt float4_rsqrt_ni
-#define float4_rsqrt_nr float4_rsqrt_nr_ni
-#define float4_rsqrt_carmack float4_rsqrt_carmack_ni
-#define float4_sqrt_nr float4_sqrt_nr_ni
-#define float4_log2 float4_log2_ni
-#define float4_exp2 float4_exp2_ni
-#define float4_pow float4_pow_ni
-#define float4_cross3 float4_cross3_ni
-#define float4_normalize3 float4_normalize3_ni
-#if !defined(__SSE4_1__)
-#define float4_dot3 float4_dot3_ni
-#define float4_dot float4_dot_ni
-#endif // defined(__SSE4_1__)
-#define float4_ceil float4_ceil_ni
-#define float4_floor float4_floor_ni
-#include "float4_ni.h"
-
-#endif // __FLOAT4_SSE_H__
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_FLOAT4_SSE_H__
+#define __BX_FLOAT4_SSE_H__
+
+#include <emmintrin.h> // __m128i
+#if defined(__SSE4_1__)
+#	include <smmintrin.h>
+#endif // defined(__SSE4_1__)
+#include <xmmintrin.h> // __m128
+
+namespace bx
+{
+
+	typedef __m128 float4_t;
+
+#define ELEMx 0 // Lane indices; IMPLEMENT_SWIZZLE looks them up by pasting a swizzle letter onto ELEM.
+#define ELEMy 1
+#define ELEMz 2
+#define ELEMw 3
+#define IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \
+			BX_FLOAT4_INLINE float4_t float4_swiz_##_x##_y##_z##_w(float4_t _a) \
+			{ \
+				return _mm_shuffle_ps( _a, _a, _MM_SHUFFLE(ELEM##_w, ELEM##_z, ELEM##_y, ELEM##_x ) ); \
+			}
+
+#include "float4_swizzle.inl" // Presumably invokes IMPLEMENT_SWIZZLE once per swizzle pattern (file not shown here) — TODO confirm.
+
+#undef IMPLEMENT_SWIZZLE // _MM_SHUFFLE takes its selectors highest lane first, hence the (w, z, y, x) argument order in the macro body.
+#undef ELEMw
+#undef ELEMz
+#undef ELEMy
+#undef ELEMx
+
+#define IMPLEMENT_TEST(_xyzw, _mask) \
+			BX_FLOAT4_INLINE bool float4_test_any_##_xyzw(float4_t _test) \
+			{ \
+				return 0x0 != (_mm_movemask_ps(_test)&(_mask) ); \
+			} \
+			\
+			BX_FLOAT4_INLINE bool float4_test_all_##_xyzw(float4_t _test) \
+			{ \
+				return (_mask) == (_mm_movemask_ps(_test)&(_mask) ); \
+			}
+
+IMPLEMENT_TEST(x    , 0x1); // _mask selects bits of _mm_movemask_ps, which packs each lane's sign bit: bit 0 = x, bit 1 = y, bit 2 = z, bit 3 = w.
+IMPLEMENT_TEST(y    , 0x2);
+IMPLEMENT_TEST(xy   , 0x3);
+IMPLEMENT_TEST(z    , 0x4);
+IMPLEMENT_TEST(xz   , 0x5);
+IMPLEMENT_TEST(yz   , 0x6);
+IMPLEMENT_TEST(xyz  , 0x7);
+IMPLEMENT_TEST(w    , 0x8);
+IMPLEMENT_TEST(xw   , 0x9);
+IMPLEMENT_TEST(yw   , 0xa);
+IMPLEMENT_TEST(xyw  , 0xb);
+IMPLEMENT_TEST(zw   , 0xc);
+IMPLEMENT_TEST(xzw  , 0xd);
+IMPLEMENT_TEST(yzw  , 0xe);
+IMPLEMENT_TEST(xyzw , 0xf); // test_any_*: at least one selected lane has its sign bit set; test_all_*: every selected lane does.
+
+#undef IMPLEMENT_TEST
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_xyAB(float4_t _a, float4_t _b)
+	{ // In the name, lowercase letters are lanes of _a and uppercase letters (A..D = x..w) are lanes of _b.
+		return _mm_movelh_ps(_a, _b); // (_a.x, _a.y, _b.x, _b.y)
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_ABxy(float4_t _a, float4_t _b)
+	{ // Low halves of both inputs, _b first.
+		return _mm_movelh_ps(_b, _a); // (_b.x, _b.y, _a.x, _a.y)
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_CDzw(float4_t _a, float4_t _b)
+	{ // High halves of both inputs, _b first.
+		return _mm_movehl_ps(_a, _b); // (_b.z, _b.w, _a.z, _a.w): movehl puts the second operand's high half in the low lanes.
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_zwCD(float4_t _a, float4_t _b)
+	{ // High halves of both inputs, _a first.
+		return _mm_movehl_ps(_b, _a); // (_a.z, _a.w, _b.z, _b.w)
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_xAyB(float4_t _a, float4_t _b)
+	{ // Interleaves the low halves of _a and _b.
+		return _mm_unpacklo_ps(_a, _b); // (_a.x, _b.x, _a.y, _b.y)
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_yBxA(float4_t _a, float4_t _b)
+	{ // NOTE(review): the lane order produced below is A x B y, which does not match the "yBxA" spelled in the name — confirm the intended order against the other back ends.
+		return _mm_unpacklo_ps(_b, _a); // (_b.x, _a.x, _b.y, _a.y)
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_zCwD(float4_t _a, float4_t _b)
+	{ // Interleaves the high halves of _a and _b.
+		return _mm_unpackhi_ps(_a, _b); // (_a.z, _b.z, _a.w, _b.w)
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_shuf_CzDw(float4_t _a, float4_t _b)
+	{ // Interleaves the high halves, _b first.
+		return _mm_unpackhi_ps(_b, _a); // (_b.z, _a.z, _b.w, _a.w)
+	}
+
+	BX_FLOAT4_INLINE float float4_x(float4_t _a)
+	{ // Extracts lane x as a scalar.
+		return _mm_cvtss_f32(_a); // _mm_cvtss_f32 returns the lowest lane.
+	}
+
+	BX_FLOAT4_INLINE float float4_y(float4_t _a)
+	{ // Extracts lane y as a scalar.
+		const float4_t yyyy = float4_swiz_yyyy(_a); // Move y into the lowest lane so it can be read out.
+		const float result  = _mm_cvtss_f32(yyyy);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float float4_z(float4_t _a)
+	{ // Extracts lane z as a scalar.
+		const float4_t zzzz = float4_swiz_zzzz(_a); // Move z into the lowest lane so it can be read out.
+		const float result  = _mm_cvtss_f32(zzzz);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float float4_w(float4_t _a)
+	{ // Extracts lane w as a scalar.
+		const float4_t wwww = float4_swiz_wwww(_a); // Move w into the lowest lane so it can be read out.
+		const float result  = _mm_cvtss_f32(wwww);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ld(const void* _ptr)
+	{ // Loads four consecutive floats starting at _ptr into lanes x..w.
+		return _mm_load_ps(reinterpret_cast<const float*>(_ptr) ); // Aligned load: _ptr must be 16-byte aligned.
+	}
+
+	BX_FLOAT4_INLINE void float4_st(void* _ptr, float4_t _a)
+	{ // Stores lanes x..w of _a as four consecutive floats at _ptr.
+		_mm_store_ps(reinterpret_cast<float*>(_ptr), _a); // Aligned store: _ptr must be 16-byte aligned.
+	}
+
+	BX_FLOAT4_INLINE void float4_stream(void* _ptr, float4_t _a)
+	{ // Stores _a to _ptr with a non-temporal (cache-bypassing) hint.
+		_mm_stream_ps(reinterpret_cast<float*>(_ptr), _a); // _ptr must be 16-byte aligned.
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ld(float _x, float _y, float _z, float _w)
+	{ // Builds a vector from four scalars, with _x in the lowest lane.
+		return _mm_set_ps(_w, _z, _y, _x); // _mm_set_ps takes the highest lane first, hence the reversed argument order.
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w)
+	{ // Builds a vector whose lanes carry the given 32-bit integer bit patterns, with _x in the lowest lane.
+		const __m128i set     = _mm_set_epi32(_w, _z, _y, _x); // Highest lane first; the uint32_t arguments convert implicitly to the intrinsic's int parameters.
+		const float4_t result = _mm_castsi128_ps(set); // Reinterprets the bits; no int-to-float conversion happens.
+		
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_splat(const void* _ptr)
+	{ // Reads one float from _ptr and replicates it into all four lanes.
+		const float4_t x___   = _mm_load_ss(reinterpret_cast<const float*>(_ptr) ); // Scalar load: value goes to lane x, upper lanes are zeroed.
+		const float4_t result = float4_swiz_xxxx(x___); // Broadcast lane x.
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_splat(float _a)
+	{ // Replicates the scalar _a into all four lanes.
+		return _mm_set1_ps(_a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_isplat(uint32_t _a)
+	{ // Replicates the 32-bit integer bit pattern _a into all four lanes.
+		const __m128i splat   = _mm_set1_epi32(_a); // _a converts implicitly to the intrinsic's int parameter.
+		const float4_t result = _mm_castsi128_ps(splat); // Reinterprets the bits; no int-to-float conversion happens.
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_zero()
+	{ // Returns a vector with all four lanes set to 0.0f.
+		return _mm_setzero_ps();
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_itof(float4_t _a)
+	{ // Treats each lane of _a as a signed 32-bit integer and converts it numerically to float.
+		const __m128i  itof   = _mm_castps_si128(_a); // Bit reinterpretation only.
+		const float4_t result = _mm_cvtepi32_ps(itof); // Numeric int32 -> float conversion per lane.
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_ftoi(float4_t _a)
+	{ // Converts each float lane to a signed 32-bit integer; the result lanes carry integer bit patterns.
+		const __m128i ftoi    = _mm_cvtps_epi32(_a); // Rounds per the current MXCSR rounding mode (nearest-even by default); this is not a truncating conversion.
+		const float4_t result = _mm_castsi128_ps(ftoi); // Bit reinterpretation only.
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_round(float4_t _a)
+	{ // Rounds each lane to an integral value, returned as float.
+#if defined(__SSE4_1__)
+		return _mm_round_ps(_a, _MM_FROUND_NINT); // SSE4.1: round to nearest, result stays in float.
+#else
+		const __m128i round   = _mm_cvtps_epi32(_a); // Fallback: round-trip through int32 using the current MXCSR rounding mode.
+		const float4_t result = _mm_cvtepi32_ps(round); // NOTE(review): lanes outside the int32 range do not survive this round trip, unlike the SSE4.1 path.
+
+		return result;
+#endif // defined(__SSE4_1__)
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_add(float4_t _a, float4_t _b)
+	{ // Lane-wise _a + _b.
+		return _mm_add_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sub(float4_t _a, float4_t _b)
+	{ // Lane-wise _a - _b.
+		return _mm_sub_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_mul(float4_t _a, float4_t _b)
+	{ // Lane-wise _a * _b.
+		return _mm_mul_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_div(float4_t _a, float4_t _b)
+	{ // Lane-wise _a / _b at full precision.
+		return _mm_div_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_rcp_est(float4_t _a)
+	{ // Lane-wise reciprocal estimate.
+		return _mm_rcp_ps(_a); // Hardware approximation (about 12 bits of precision), not an exact 1/_a.
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sqrt(float4_t _a)
+	{ // Lane-wise square root at full precision.
+		return _mm_sqrt_ps(_a);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_rsqrt_est(float4_t _a)
+	{ // Lane-wise reciprocal square root estimate.
+		return _mm_rsqrt_ps(_a); // Hardware approximation (about 12 bits of precision), not an exact 1/sqrt(_a).
+	}
+
+#if defined(__SSE4_1__)
+	BX_FLOAT4_INLINE float4_t float4_dot3(float4_t _a, float4_t _b)
+	{
+		return _mm_dp_ps(_a, _b, 0x77); // High nibble 0x7: multiply lanes x, y, z; low nibble 0x7: write the sum to x, y, z and zero w.
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_dot(float4_t _a, float4_t _b)
+	{
+		return _mm_dp_ps(_a, _b, 0xFF); // All four lanes are multiplied and the sum is broadcast to all four lanes.
+	}
+#endif // defined(__SSE4_1__)
+
+	BX_FLOAT4_INLINE float4_t float4_cmpeq(float4_t _a, float4_t _b)
+	{ // Lane-wise _a == _b; each result lane is all ones when true, all zeros otherwise.
+		return _mm_cmpeq_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_cmplt(float4_t _a, float4_t _b)
+	{ // Lane-wise _a < _b; each result lane is all ones when true, all zeros otherwise.
+		return _mm_cmplt_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_cmple(float4_t _a, float4_t _b)
+	{ // Lane-wise _a <= _b; each result lane is all ones when true, all zeros otherwise.
+		return _mm_cmple_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_cmpgt(float4_t _a, float4_t _b)
+	{ // Lane-wise _a > _b; each result lane is all ones when true, all zeros otherwise.
+		return _mm_cmpgt_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_cmpge(float4_t _a, float4_t _b)
+	{ // Lane-wise _a >= _b; each result lane is all ones when true, all zeros otherwise.
+		return _mm_cmpge_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_min(float4_t _a, float4_t _b)
+	{ // Lane-wise minimum of _a and _b.
+		return _mm_min_ps(_a, _b); // Per the SSE MINPS rule, a lane where either input is NaN yields the second operand (_b).
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_max(float4_t _a, float4_t _b)
+	{ // Lane-wise maximum of _a and _b.
+		return _mm_max_ps(_a, _b); // Per the SSE MAXPS rule, a lane where either input is NaN yields the second operand (_b).
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_and(float4_t _a, float4_t _b)
+	{ // Bitwise _a & _b over all 128 bits.
+		return _mm_and_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_andc(float4_t _a, float4_t _b)
+	{ // Bitwise AND with complement: _a & ~_b.
+		return _mm_andnot_ps(_b, _a); // _mm_andnot_ps complements its FIRST operand, so the arguments are swapped on purpose.
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_or(float4_t _a, float4_t _b)
+	{ // Bitwise _a | _b over all 128 bits.
+		return _mm_or_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_xor(float4_t _a, float4_t _b)
+	{ // Bitwise _a ^ _b over all 128 bits.
+		return _mm_xor_ps(_a, _b);
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sll(float4_t _a, int _count)
+	{ // Shifts each 32-bit lane's bit pattern left by _count, filling with zeros.
+		const __m128i a       = _mm_castps_si128(_a); // Bit reinterpretation only.
+		const __m128i shift   = _mm_slli_epi32(a, _count); // Counts above 31 produce zero lanes.
+		const float4_t result = _mm_castsi128_ps(shift);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_srl(float4_t _a, int _count)
+	{ // Logical right shift of each 32-bit lane's bit pattern by _count, filling with zeros.
+		const __m128i a       = _mm_castps_si128(_a); // Bit reinterpretation only.
+		const __m128i shift   = _mm_srli_epi32(a, _count); // Counts above 31 produce zero lanes.
+		const float4_t result = _mm_castsi128_ps(shift);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_sra(float4_t _a, int _count)
+	{ // Arithmetic right shift of each 32-bit lane's bit pattern by _count, replicating the sign bit.
+		const __m128i a       = _mm_castps_si128(_a); // Bit reinterpretation only.
+		const __m128i shift   = _mm_srai_epi32(a, _count); // Counts above 31 fill the lane with its sign bit.
+		const float4_t result = _mm_castsi128_ps(shift);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_iadd(float4_t _a, float4_t _b)
+	{ // Adds the lanes of _a and _b as 32-bit integers; inputs and result carry integer bit patterns.
+		const __m128i a       = _mm_castps_si128(_a);
+		const __m128i b       = _mm_castps_si128(_b);
+		const __m128i add     = _mm_add_epi32(a, b); // Wraps around on overflow (no saturation).
+		const float4_t result = _mm_castsi128_ps(add);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_isub(float4_t _a, float4_t _b)
+	{ // Subtracts the lanes of _b from _a as 32-bit integers; inputs and result carry integer bit patterns.
+		const __m128i a       = _mm_castps_si128(_a);
+		const __m128i b       = _mm_castps_si128(_b);
+		const __m128i sub     = _mm_sub_epi32(a, b); // Wraps around on overflow (no saturation).
+		const float4_t result = _mm_castsi128_ps(sub);
+
+		return result;
+	}
+
+} // namespace bx
+
+#define float4_shuf_xAzC float4_shuf_xAzC_ni
+#define float4_shuf_yBwD float4_shuf_yBwD_ni
+#define float4_rcp float4_rcp_ni
+#define float4_orx float4_orx_ni
+#define float4_orc float4_orc_ni
+#define float4_neg float4_neg_ni
+#define float4_madd float4_madd_ni
+#define float4_nmsub float4_nmsub_ni
+#define float4_div_nr float4_div_nr_ni
+#define float4_selb float4_selb_ni
+#define float4_sels float4_sels_ni
+#define float4_not float4_not_ni
+#define float4_abs float4_abs_ni
+#define float4_clamp float4_clamp_ni
+#define float4_lerp float4_lerp_ni
+#define float4_rsqrt float4_rsqrt_ni
+#define float4_rsqrt_nr float4_rsqrt_nr_ni
+#define float4_rsqrt_carmack float4_rsqrt_carmack_ni
+#define float4_sqrt_nr float4_sqrt_nr_ni
+#define float4_log2 float4_log2_ni
+#define float4_exp2 float4_exp2_ni
+#define float4_pow float4_pow_ni
+#define float4_cross3 float4_cross3_ni
+#define float4_normalize3 float4_normalize3_ni
+#if !defined(__SSE4_1__)
+#define float4_dot3 float4_dot3_ni
+#define float4_dot float4_dot_ni
+#endif // !defined(__SSE4_1__)
+#define float4_ceil float4_ceil_ni
+#define float4_floor float4_floor_ni
+#include "float4_ni.h"
+
+#endif // __BX_FLOAT4_SSE_H__
diff --git a/include/bx/float4_t.h b/include/bx/float4_t.h
index 83a0775..21d001f 100644
--- a/include/bx/float4_t.h
+++ b/include/bx/float4_t.h
@@ -1,21 +1,21 @@
-/*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
-
-#ifndef __BX_FLOAT4_T_H__
-#define __BX_FLOAT4_T_H__
-
-#include "bx.h"
-
-#define BX_FLOAT4_INLINE BX_FORCE_INLINE
-
-#if defined(__SSE2__) || (BX_COMPILER_MSVC && (BX_ARCH_64BIT || _M_IX86_FP >= 2) )
-#	include "float4_sse.h"
-#elif 0 // __ARM_NEON__
-#	include "float4_neon.h"
-#else
-#	include "float4_ref.h"
-#endif //
-
-#endif // __BX_FLOAT4_T_H__
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_FLOAT4_T_H__
+#define __BX_FLOAT4_T_H__
+
+#include "bx.h"
+
+#define BX_FLOAT4_INLINE BX_FORCE_INLINE
+
+#if defined(__SSE2__) || (BX_COMPILER_MSVC && (BX_ARCH_64BIT || _M_IX86_FP >= 2) )
+#	include "float4_sse.h"
+#elif 0 // __ARM_NEON__
+#	include "float4_neon.h"
+#else
+#	include "float4_ref.h"
+#endif //
+
+#endif // __BX_FLOAT4_T_H__
diff --git a/include/bx/float4x4_t.h b/include/bx/float4x4_t.h
index a552425..d99c3c1 100644
--- a/include/bx/float4x4_t.h
+++ b/include/bx/float4x4_t.h
@@ -1,168 +1,168 @@
-/*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
-
-#ifndef __BX_FLOAT4X4_H__
-#define __BX_FLOAT4x4_H__
-
-#include "float4_t.h"
-
-namespace bx
-{
-	typedef BX_ALIGN_STRUCT_16(struct)
-	{
-		float4_t col[4];
-
-	} float4x4_t;
-
-	BX_FLOAT4_INLINE float4_t float4_mul_xyz1(float4_t _a, const float4x4_t& _b)
-	{
-		const float4_t xxxx   = float4_swiz_xxxx(_a);
-		const float4_t yyyy   = float4_swiz_yyyy(_a);
-		const float4_t zzzz   = float4_swiz_zzzz(_a);
-		const float4_t col0   = float4_mul(_b.col[0], xxxx);
-		const float4_t col1   = float4_mul(_b.col[1], yyyy);
-		const float4_t col2   = float4_madd(_b.col[2], zzzz, col0);
-		const float4_t col3   = float4_add(_b.col[3], col1);
-		const float4_t result = float4_add(col2, col3);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4_t float4_mul(float4_t _a, const float4x4_t& _b)
-	{
-		const float4_t xxxx   = float4_swiz_xxxx(_a);
-		const float4_t yyyy   = float4_swiz_yyyy(_a);
-		const float4_t zzzz   = float4_swiz_zzzz(_a);
-		const float4_t wwww   = float4_swiz_wwww(_a);
-		const float4_t col0   = float4_mul(_b.col[0], xxxx);
-		const float4_t col1   = float4_mul(_b.col[1], yyyy);
-		const float4_t col2   = float4_madd(_b.col[2], zzzz, col0);
-		const float4_t col3   = float4_madd(_b.col[3], wwww, col1);
-		const float4_t result = float4_add(col2, col3);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4x4_t float4x4_mul(const float4x4_t& _a, const float4x4_t& _b)
-	{
-		float4x4_t result;
-		result.col[0] = float4_mul(_a.col[0], _b);
-		result.col[1] = float4_mul(_a.col[1], _b);
-		result.col[2] = float4_mul(_a.col[2], _b);
-		result.col[3] = float4_mul(_a.col[3], _b);
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4x4_t float4x4_transpose(const float4x4_t& _mtx)
-	{
-		const float4_t aibj = float4_shuf_xAyB(_mtx.col[0], _mtx.col[2]); // aibj
-		const float4_t emfn = float4_shuf_xAyB(_mtx.col[1], _mtx.col[3]); // emfn
-		const float4_t ckdl = float4_shuf_zCwD(_mtx.col[0], _mtx.col[2]); // ckdl
-		const float4_t gohp = float4_shuf_zCwD(_mtx.col[1], _mtx.col[3]); // gohp
-		float4x4_t result;
-		result.col[0] = float4_shuf_xAyB(aibj, emfn); // aeim
-		result.col[1] = float4_shuf_zCwD(aibj, emfn); // bfjn
-		result.col[2] = float4_shuf_xAyB(ckdl, gohp); // cgko
-		result.col[3] = float4_shuf_zCwD(ckdl, gohp); // dhlp
-
-		return result;
-	}
-
-	BX_FLOAT4_INLINE float4x4_t float4x4_inverse(const float4x4_t& _a)
-	{
-		const float4_t tmp0 = float4_shuf_xAzC(_a.col[0], _a.col[1]);
-		const float4_t tmp1 = float4_shuf_xAzC(_a.col[2], _a.col[3]);
-		const float4_t tmp2 = float4_shuf_yBwD(_a.col[0], _a.col[1]);
-		const float4_t tmp3 = float4_shuf_yBwD(_a.col[2], _a.col[3]);
-		const float4_t t0   = float4_shuf_xyAB(tmp0, tmp1);
-		const float4_t t1   = float4_shuf_xyAB(tmp3, tmp2);
-		const float4_t t2   = float4_shuf_zwCD(tmp0, tmp1);
-		const float4_t t3   = float4_shuf_zwCD(tmp3, tmp2);
-
-		const float4_t t23 = float4_mul(t2, t3);
-		const float4_t t23_yxwz = float4_swiz_yxwz(t23);
-		const float4_t t23_wzyx = float4_swiz_wzyx(t23);
-
-		float4_t cof0, cof1, cof2, cof3;
-
-		const float4_t zero = float4_zero();
-		cof0 = float4_nmsub(t1, t23_yxwz, zero);
-		cof0 = float4_madd(t1, t23_wzyx, cof0);
-
-		cof1 = float4_nmsub(t0, t23_yxwz, zero);
-		cof1 = float4_madd(t0, t23_wzyx, cof1);
-		cof1 = float4_swiz_zwxy(cof1);
-		
-		const float4_t t12 = float4_mul(t1, t2);
-		const float4_t t12_yxwz = float4_swiz_yxwz(t12);
-		const float4_t t12_wzyx = float4_swiz_wzyx(t12);
-		
-		cof0 = float4_madd(t3, t12_yxwz, cof0);
-		cof0 = float4_nmsub(t3, t12_wzyx, cof0);
-
-		cof3 = float4_mul(t0, t12_yxwz);
-		cof3 = float4_nmsub(t0, t12_wzyx, cof3);
-		cof3 = float4_swiz_zwxy(cof3);
-
-		const float4_t t1_zwxy = float4_swiz_zwxy(t1);
-		const float4_t t2_zwxy = float4_swiz_zwxy(t2);
-
-		const float4_t t13 = float4_mul(t1_zwxy, t3);
-		const float4_t t13_yxwz = float4_swiz_yxwz(t13);
-		const float4_t t13_wzyx = float4_swiz_wzyx(t13);
-
-		cof0 = float4_madd(t2_zwxy, t13_yxwz, cof0);
-		cof0 = float4_nmsub(t2_zwxy, t13_wzyx, cof0);
-
-		cof2 = float4_mul(t0, t13_yxwz);
-		cof2 = float4_nmsub(t0, t13_wzyx, cof2);
-		cof2 = float4_swiz_zwxy(cof2);
-
-		const float4_t t01 = float4_mul(t0, t1);
-		const float4_t t01_yxwz = float4_swiz_yxwz(t01);
-		const float4_t t01_wzyx = float4_swiz_wzyx(t01);
-
-		cof2 = float4_nmsub(t3, t01_yxwz, cof2);
-		cof2 = float4_madd(t3, t01_wzyx, cof2);
-
-		cof3 = float4_madd(t2_zwxy, t01_yxwz, cof3);
-		cof3 = float4_nmsub(t2_zwxy, t01_wzyx, cof3);
-
-		const float4_t t03 = float4_mul(t0, t3);
-		const float4_t t03_yxwz = float4_swiz_yxwz(t03);
-		const float4_t t03_wzyx = float4_swiz_wzyx(t03);
-
-		cof1 = float4_nmsub(t2_zwxy, t03_yxwz, cof1);
-		cof1 = float4_madd(t2_zwxy, t03_wzyx, cof1);
-
-		cof2 = float4_madd(t1, t03_yxwz, cof2);
-		cof2 = float4_nmsub(t1, t03_wzyx, cof2);
-
-		const float4_t t02 = float4_mul(t0, t2_zwxy);
-		const float4_t t02_yxwz = float4_swiz_yxwz(t02);
-		const float4_t t02_wzyx = float4_swiz_wzyx(t02);
-
-		cof1 = float4_madd(t3, t02_yxwz, cof1);
-		cof1 = float4_nmsub(t3, t02_wzyx, cof1);
-
-		cof3 = float4_nmsub(t1, t02_yxwz, cof3);
-		cof3 = float4_madd(t1, t02_wzyx, cof3);
-
-		const float4_t det    = float4_dot(t0, cof0);
-		const float4_t invdet = float4_rcp(det);
-
-		float4x4_t result;
-		result.col[0] = float4_mul(cof0, invdet);
-		result.col[1] = float4_mul(cof1, invdet);
-		result.col[2] = float4_mul(cof2, invdet);
-		result.col[3] = float4_mul(cof3, invdet);
-
-		return result;
-	}
-
-} // namespace bx
-
-#endif // __BX_FLOAT4X4_H__
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_FLOAT4X4_H__
+#define __BX_FLOAT4X4_H__ // Must be spelled exactly like the #ifndef above, otherwise the guard is never set and a second inclusion redefines everything.
+
+#include "float4_t.h"
+
+namespace bx
+{
+	typedef BX_ALIGN_STRUCT_16(struct) // Presumably requests 16-byte alignment for the struct — TODO confirm against the macro's definition in bx.h.
+	{
+		float4_t col[4]; // The matrix is stored as four float4_t columns.
+
+	} float4x4_t;
+
+	BX_FLOAT4_INLINE float4_t float4_mul_xyz1(float4_t _a, const float4x4_t& _b)
+	{ // Combines the columns of _b weighted by _a.x, _a.y, _a.z and an implicit weight of 1 for col[3]; _a.w is never read.
+		const float4_t xxxx   = float4_swiz_xxxx(_a);
+		const float4_t yyyy   = float4_swiz_yyyy(_a);
+		const float4_t zzzz   = float4_swiz_zzzz(_a);
+		const float4_t col0   = float4_mul(_b.col[0], xxxx);
+		const float4_t col1   = float4_mul(_b.col[1], yyyy);
+		const float4_t col2   = float4_madd(_b.col[2], zzzz, col0); // Presumably col[2]*z + col0, assuming float4_madd(a, b, c) is a*b + c — TODO confirm.
+		const float4_t col3   = float4_add(_b.col[3], col1); // col[3] is added unscaled.
+		const float4_t result = float4_add(col2, col3);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4_t float4_mul(float4_t _a, const float4x4_t& _b)
+	{ // Combines the four columns of _b weighted by _a.x, _a.y, _a.z and _a.w.
+		const float4_t xxxx   = float4_swiz_xxxx(_a);
+		const float4_t yyyy   = float4_swiz_yyyy(_a);
+		const float4_t zzzz   = float4_swiz_zzzz(_a);
+		const float4_t wwww   = float4_swiz_wwww(_a);
+		const float4_t col0   = float4_mul(_b.col[0], xxxx);
+		const float4_t col1   = float4_mul(_b.col[1], yyyy);
+		const float4_t col2   = float4_madd(_b.col[2], zzzz, col0); // Presumably col[2]*z + col0, assuming float4_madd(a, b, c) is a*b + c — TODO confirm.
+		const float4_t col3   = float4_madd(_b.col[3], wwww, col1); // Presumably col[3]*w + col1, same assumption.
+		const float4_t result = float4_add(col2, col3); // Two partial sums are kept separate until this final add.
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4x4_t float4x4_mul(const float4x4_t& _a, const float4x4_t& _b)
+	{ // Matrix product built by passing each col[] entry of _a through float4_mul(vector, _b).
+		float4x4_t result;
+		result.col[0] = float4_mul(_a.col[0], _b);
+		result.col[1] = float4_mul(_a.col[1], _b);
+		result.col[2] = float4_mul(_a.col[2], _b);
+		result.col[3] = float4_mul(_a.col[3], _b);
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4x4_t float4x4_transpose(const float4x4_t& _mtx)
+	{ // Transposes via two rounds of interleaves. Letters in the comments name the elements: col[0]=abcd, col[1]=efgh, col[2]=ijkl, col[3]=mnop.
+		const float4_t aibj = float4_shuf_xAyB(_mtx.col[0], _mtx.col[2]); // aibj
+		const float4_t emfn = float4_shuf_xAyB(_mtx.col[1], _mtx.col[3]); // emfn
+		const float4_t ckdl = float4_shuf_zCwD(_mtx.col[0], _mtx.col[2]); // ckdl
+		const float4_t gohp = float4_shuf_zCwD(_mtx.col[1], _mtx.col[3]); // gohp
+		float4x4_t result;
+		result.col[0] = float4_shuf_xAyB(aibj, emfn); // aeim
+		result.col[1] = float4_shuf_zCwD(aibj, emfn); // bfjn
+		result.col[2] = float4_shuf_xAyB(ckdl, gohp); // cgko
+		result.col[3] = float4_shuf_zCwD(ckdl, gohp); // dhlp
+
+		return result;
+	}
+
+	BX_FLOAT4_INLINE float4x4_t float4x4_inverse(const float4x4_t& _a)
+	{ // Accumulates four vectors (cof0..cof3, presumably cofactor rows) and scales them by 1/det. NOTE(review): no check for a zero determinant is visible here.
+		const float4_t tmp0 = float4_shuf_xAzC(_a.col[0], _a.col[1]); // float4_shuf_xAzC / yBwD are not defined in this chunk — TODO confirm their exact lane layout.
+		const float4_t tmp1 = float4_shuf_xAzC(_a.col[2], _a.col[3]);
+		const float4_t tmp2 = float4_shuf_yBwD(_a.col[0], _a.col[1]);
+		const float4_t tmp3 = float4_shuf_yBwD(_a.col[2], _a.col[3]);
+		const float4_t t0   = float4_shuf_xyAB(tmp0, tmp1); // t0..t3: regrouped lanes of the input that all later products are built from.
+		const float4_t t1   = float4_shuf_xyAB(tmp3, tmp2);
+		const float4_t t2   = float4_shuf_zwCD(tmp0, tmp1);
+		const float4_t t3   = float4_shuf_zwCD(tmp3, tmp2);
+
+		const float4_t t23 = float4_mul(t2, t3); // Each tXY stage forms lane-wise products, then two swizzles of them feed the accumulators.
+		const float4_t t23_yxwz = float4_swiz_yxwz(t23);
+		const float4_t t23_wzyx = float4_swiz_wzyx(t23);
+
+		float4_t cof0, cof1, cof2, cof3;
+
+		const float4_t zero = float4_zero();
+		cof0 = float4_nmsub(t1, t23_yxwz, zero); // float4_nmsub / float4_madd are not defined in this chunk; presumably c - a*b and a*b + c — TODO confirm.
+		cof0 = float4_madd(t1, t23_wzyx, cof0);
+
+		cof1 = float4_nmsub(t0, t23_yxwz, zero);
+		cof1 = float4_madd(t0, t23_wzyx, cof1);
+		cof1 = float4_swiz_zwxy(cof1);
+		
+		const float4_t t12 = float4_mul(t1, t2);
+		const float4_t t12_yxwz = float4_swiz_yxwz(t12);
+		const float4_t t12_wzyx = float4_swiz_wzyx(t12);
+		
+		cof0 = float4_madd(t3, t12_yxwz, cof0);
+		cof0 = float4_nmsub(t3, t12_wzyx, cof0);
+
+		cof3 = float4_mul(t0, t12_yxwz);
+		cof3 = float4_nmsub(t0, t12_wzyx, cof3);
+		cof3 = float4_swiz_zwxy(cof3);
+
+		const float4_t t1_zwxy = float4_swiz_zwxy(t1);
+		const float4_t t2_zwxy = float4_swiz_zwxy(t2); // From here on only the half-swapped copy of t2 is used.
+
+		const float4_t t13 = float4_mul(t1_zwxy, t3);
+		const float4_t t13_yxwz = float4_swiz_yxwz(t13);
+		const float4_t t13_wzyx = float4_swiz_wzyx(t13);
+
+		cof0 = float4_madd(t2_zwxy, t13_yxwz, cof0);
+		cof0 = float4_nmsub(t2_zwxy, t13_wzyx, cof0); // cof0 is complete after this line.
+
+		cof2 = float4_mul(t0, t13_yxwz);
+		cof2 = float4_nmsub(t0, t13_wzyx, cof2);
+		cof2 = float4_swiz_zwxy(cof2);
+
+		const float4_t t01 = float4_mul(t0, t1);
+		const float4_t t01_yxwz = float4_swiz_yxwz(t01);
+		const float4_t t01_wzyx = float4_swiz_wzyx(t01);
+
+		cof2 = float4_nmsub(t3, t01_yxwz, cof2);
+		cof2 = float4_madd(t3, t01_wzyx, cof2);
+
+		cof3 = float4_madd(t2_zwxy, t01_yxwz, cof3);
+		cof3 = float4_nmsub(t2_zwxy, t01_wzyx, cof3);
+
+		const float4_t t03 = float4_mul(t0, t3);
+		const float4_t t03_yxwz = float4_swiz_yxwz(t03);
+		const float4_t t03_wzyx = float4_swiz_wzyx(t03);
+
+		cof1 = float4_nmsub(t2_zwxy, t03_yxwz, cof1);
+		cof1 = float4_madd(t2_zwxy, t03_wzyx, cof1);
+
+		cof2 = float4_madd(t1, t03_yxwz, cof2);
+		cof2 = float4_nmsub(t1, t03_wzyx, cof2);
+
+		const float4_t t02 = float4_mul(t0, t2_zwxy);
+		const float4_t t02_yxwz = float4_swiz_yxwz(t02);
+		const float4_t t02_wzyx = float4_swiz_wzyx(t02);
+
+		cof1 = float4_madd(t3, t02_yxwz, cof1);
+		cof1 = float4_nmsub(t3, t02_wzyx, cof1);
+
+		cof3 = float4_nmsub(t1, t02_yxwz, cof3);
+		cof3 = float4_madd(t1, t02_wzyx, cof3);
+
+		const float4_t det    = float4_dot(t0, cof0); // Presumably the determinant replicated across all lanes — TODO confirm float4_dot's result layout on every back end.
+		const float4_t invdet = float4_rcp(det); // float4_rcp is not defined in this chunk; its precision is unknown from here.
+
+		float4x4_t result;
+		result.col[0] = float4_mul(cof0, invdet);
+		result.col[1] = float4_mul(cof1, invdet);
+		result.col[2] = float4_mul(cof2, invdet);
+		result.col[3] = float4_mul(cof3, invdet);
+
+		return result;
+	}
+
+} // namespace bx
+
+#endif // __BX_FLOAT4X4_H__
diff --git a/include/bx/foreach.h b/include/bx/foreach.h
index 76ba6fb..00aeb56 100644
--- a/include/bx/foreach.h
+++ b/include/bx/foreach.h
@@ -1,71 +1,71 @@
-/*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
-
-#ifndef __BX_FOREACH_H__
-#define __BX_FOREACH_H__
-
-#include "bx.h"
-
-namespace bx
-{
-	namespace foreach_ns
-	{
-		struct ContainerBase
-		{
-		};
-
-		template <typename Ty>
-		class Container : public ContainerBase
-		{
-		public:
-			inline Container(const Ty& _container)
-				: m_container(_container)
-				, m_break(0)
-				, m_it( _container.begin() )
-				, m_itEnd( _container.end() )
-			{
-			}
-
-			inline bool condition() const
-			{
-				return (!m_break++ && m_it != m_itEnd);
-			}
-
-			const Ty& m_container;
-			mutable int m_break;
-			mutable typename Ty::const_iterator m_it;
-			mutable typename Ty::const_iterator m_itEnd;
-		};
-
-		template <typename Ty>
-		inline Ty* pointer(const Ty&)
-		{
-			return 0;
-		}
-
-		template <typename Ty>
-		inline Container<Ty> containerNew(const Ty& _container)
-		{
-			return Container<Ty>(_container);
-		}
-
-		template <typename Ty>
-		inline const Container<Ty>* container(const ContainerBase* _base, const Ty*)
-		{
-			return static_cast<const Container<Ty>*>(_base);
-		}
-	} // namespace foreach_ns
-
-#define foreach(_variable, _container) \
-	for (const bx::foreach_ns::ContainerBase &__temp_container__ = bx::foreach_ns::containerNew(_container); \
-			bx::foreach_ns::container(&__temp_container__, true ? 0 : bx::foreach_ns::pointer(_container) )->condition(); \
-			++bx::foreach_ns::container(&__temp_container__, true ? 0 : bx::foreach_ns::pointer(_container) )->m_it) \
-	for (_variable = *container(&__temp_container__, true ? 0 : bx::foreach_ns::pointer(_container) )->m_it; \
-			bx::foreach_ns::container(&__temp_container__, true ? 0 : bx::foreach_ns::pointer(_container) )->m_break; \
-			--bx::foreach_ns::container(&__temp_container__, true ? 0 : bx::foreach_ns::pointer(_container) )->m_break)
-
-} // namespace bx
-
-#endif // __BX_FOREACH_H__
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_FOREACH_H__
+#define __BX_FOREACH_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	namespace foreach_ns
+	{
+		struct ContainerBase
+		{
+		};
+
+		template <typename Ty>
+		class Container : public ContainerBase
+		{
+		public:
+			inline Container(const Ty& _container)
+				: m_container(_container)
+				, m_break(0)
+				, m_it( _container.begin() )
+				, m_itEnd( _container.end() )
+			{
+			}
+
+			inline bool condition() const
+			{
+				return (!m_break++ && m_it != m_itEnd);
+			}
+
+			const Ty& m_container;
+			mutable int m_break;
+			mutable typename Ty::const_iterator m_it;
+			mutable typename Ty::const_iterator m_itEnd;
+		};
+
+		template <typename Ty>
+		inline Ty* pointer(const Ty&)
+		{
+			return 0;
+		}
+
+		template <typename Ty>
+		inline Container<Ty> containerNew(const Ty& _container)
+		{
+			return Container<Ty>(_container);
+		}
+
+		template <typename Ty>
+		inline const Container<Ty>* container(const ContainerBase* _base, const Ty*)
+		{
+			return static_cast<const Container<Ty>*>(_base);
+		}
+	} // namespace foreach_ns
+
+#define foreach(_variable, _container) /* visits each element of _container in order, assigning it to _variable */ \
+	for (const bx::foreach_ns::ContainerBase &__temp_container__ = bx::foreach_ns::containerNew(_container); \
+			bx::foreach_ns::container(&__temp_container__, true ? 0 : bx::foreach_ns::pointer(_container) )->condition(); \
+			++bx::foreach_ns::container(&__temp_container__, true ? 0 : bx::foreach_ns::pointer(_container) )->m_it) \
+	for (_variable = *bx::foreach_ns::container(&__temp_container__, true ? 0 : bx::foreach_ns::pointer(_container) )->m_it; /* fully qualified so a caller-scope 'container' cannot hijack the lookup */ \
+			bx::foreach_ns::container(&__temp_container__, true ? 0 : bx::foreach_ns::pointer(_container) )->m_break; /* m_break stays set when the body breaks, which also ends the outer loop */ \
+			--bx::foreach_ns::container(&__temp_container__, true ? 0 : bx::foreach_ns::pointer(_container) )->m_break)
+
+} // namespace bx
+
+#endif // __BX_FOREACH_H__
diff --git a/include/bx/handlealloc.h b/include/bx/handlealloc.h
index fd66642..276eea0 100644
--- a/include/bx/handlealloc.h
+++ b/include/bx/handlealloc.h
@@ -1,88 +1,88 @@
-/*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
-
-#ifndef __BX_HANDLE_ALLOC_H__
-#define __BX_HANDLE_ALLOC_H__
-
-#include "bx.h"
-
-namespace bx
-{
-	class HandleAlloc
-	{
-	public:
-		static const uint16_t invalid = 0xffff;
-
-		HandleAlloc(uint16_t _maxHandles)
-			: m_dense(new uint16_t[_maxHandles*2])
-			, m_sparse(&m_dense[_maxHandles])
-			, m_numHandles(0)
-			, m_maxHandles(_maxHandles)
-		{
-			for (uint16_t ii = 0; ii < _maxHandles; ++ii)
-			{
-				m_dense[ii] = ii;
-			}
-		}
-
-		~HandleAlloc()
-		{
-			delete [] m_dense;
-		}
-
-		const uint16_t* getHandles() const
-		{
-			return m_dense;
-		}
-
-		uint16_t getHandleAt(uint16_t _at) const
-		{
-			return m_dense[_at];
-		}
-
-		uint16_t getNumHandles() const
-		{
-			return m_numHandles;
-		}
-
-		uint16_t getMaxHandles() const
-		{
-			return m_maxHandles;
-		}
-
-		uint16_t alloc()
-		{
-			if (m_numHandles < m_maxHandles)
-			{
-				uint16_t index = m_numHandles;
-				++m_numHandles;
-
-				uint16_t handle = m_dense[index];
-				m_sparse[handle] = index;
-				return handle;
-			}
-
-			return invalid;
-		}
-
-		void free(uint16_t _handle)
-		{
-			uint16_t index = m_sparse[_handle];
-			--m_numHandles;
-			uint16_t temp = m_dense[m_numHandles];
-			m_dense[m_numHandles] = _handle;
-			m_sparse[temp] = index;
-			m_dense[index] = temp;
-		}
-
-	private:
-		uint16_t* m_dense;
-		uint16_t* m_sparse;
-		uint16_t m_numHandles;
-		uint16_t m_maxHandles;
-	};
-} // namespace bx
-
-#endif // __HANDLE_ALLOC_H__
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_HANDLE_ALLOC_H__
+#define __BX_HANDLE_ALLOC_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	class HandleAlloc
+	{
+	public:
+		static const uint16_t invalid = 0xffff;
+
+		HandleAlloc(uint16_t _maxHandles)
+			: m_dense(new uint16_t[_maxHandles*2])
+			, m_sparse(&m_dense[_maxHandles])
+			, m_numHandles(0)
+			, m_maxHandles(_maxHandles)
+		{
+			for (uint16_t ii = 0; ii < _maxHandles; ++ii)
+			{
+				m_dense[ii] = ii;
+			}
+		}
+
+		~HandleAlloc()
+		{
+			delete [] m_dense;
+		}
+
+		const uint16_t* getHandles() const
+		{
+			return m_dense;
+		}
+
+		uint16_t getHandleAt(uint16_t _at) const
+		{
+			return m_dense[_at];
+		}
+
+		uint16_t getNumHandles() const
+		{
+			return m_numHandles;
+		}
+
+		uint16_t getMaxHandles() const
+		{
+			return m_maxHandles;
+		}
+
+		uint16_t alloc()
+		{
+			if (m_numHandles < m_maxHandles)
+			{
+				uint16_t index = m_numHandles;
+				++m_numHandles;
+
+				uint16_t handle = m_dense[index];
+				m_sparse[handle] = index;
+				return handle;
+			}
+
+			return invalid;
+		}
+
+		void free(uint16_t _handle)
+		{
+			uint16_t index = m_sparse[_handle];
+			--m_numHandles;
+			uint16_t temp = m_dense[m_numHandles];
+			m_dense[m_numHandles] = _handle;
+			m_sparse[temp] = index;
+			m_dense[index] = temp;
+		}
+
+	private:
+		uint16_t* m_dense;
+		uint16_t* m_sparse;
+		uint16_t m_numHandles;
+		uint16_t m_maxHandles;
+	};
+} // namespace bx
+
+#endif // __BX_HANDLE_ALLOC_H__
diff --git a/include/bx/maputil.h b/include/bx/maputil.h
index b92c9cc..b6e69ee 100644
--- a/include/bx/maputil.h
+++ b/include/bx/maputil.h
@@ -1,29 +1,29 @@
-/*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
-
-#ifndef __BX_MAPUTIL_H__
-#define __BX_MAPUTIL_H__
-
-#include "bx.h"
-
-namespace bx
-{
-	template<typename MapType>
-	typename MapType::iterator mapInsertOrUpdate(MapType& _map, const typename MapType::key_type& _key, const typename MapType::mapped_type& _value)
-	{
-		typename MapType::iterator it = _map.lower_bound(_key);
-		if (it != _map.end()
-		&&  !_map.key_comp()(_key, it->first) )
-		{
-			it->second = _value;
-			return it;
-		}
-
-		typename MapType::value_type pair(_key, _value);
-		return _map.insert(it, pair);
-	}
-} // namespace bx
-
-#endif // __BX_MAPUTIL_H__
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_MAPUTIL_H__
+#define __BX_MAPUTIL_H__
+
+#include "bx.h"
+
+namespace bx
+{
+	template<typename MapType>
+	typename MapType::iterator mapInsertOrUpdate(MapType& _map, const typename MapType::key_type& _key, const typename MapType::mapped_type& _value)
+	{
+		typename MapType::iterator it = _map.lower_bound(_key);
+		if (it != _map.end()
+		&&  !_map.key_comp()(_key, it->first) )
+		{
+			it->second = _value;
+			return it;
+		}
+
+		typename MapType::value_type pair(_key, _value);
+		return _map.insert(it, pair);
+	}
+} // namespace bx
+
+#endif // __BX_MAPUTIL_H__
diff --git a/include/bx/mutex.h b/include/bx/mutex.h
index b20bb9a..73d55f7 100644
--- a/include/bx/mutex.h
+++ b/include/bx/mutex.h
@@ -1,171 +1,171 @@
-/*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
-
-#ifndef __BX_MUTEX_H__
-#define __BX_MUTEX_H__
-
-#include "bx.h"
-#include "cpu.h"
-#include "sem.h"
-
-#if BX_PLATFORM_NACL || BX_PLATFORM_LINUX || BX_PLATFORM_ANDROID || BX_PLATFORM_OSX
-#	include <pthread.h>
-#elif BX_PLATFORM_WINDOWS || BX_PLATFORM_XBOX360
-#	include <errno.h>
-#endif // BX_PLATFORM_
-
-namespace bx
-{
-#if BX_PLATFORM_WINDOWS || BX_PLATFORM_XBOX360
-	typedef CRITICAL_SECTION pthread_mutex_t;
-	typedef unsigned pthread_mutexattr_t;
-
-	inline int pthread_mutex_lock(pthread_mutex_t* _mutex)
-	{
-		EnterCriticalSection(_mutex);
-		return 0;
-	}
-
-	inline int pthread_mutex_unlock(pthread_mutex_t* _mutex)
-	{
-		LeaveCriticalSection(_mutex);
-		return 0;
-	}
-
-	inline int pthread_mutex_trylock(pthread_mutex_t* _mutex)
-	{
-		return TryEnterCriticalSection(_mutex) ? 0 : EBUSY;
-	}
-
-	inline int pthread_mutex_init(pthread_mutex_t* _mutex, pthread_mutexattr_t* /*_attr*/)
-	{
-		InitializeCriticalSection(_mutex);
-		return 0;
-	}
-
-	inline int pthread_mutex_destroy(pthread_mutex_t* _mutex)
-	{
-		DeleteCriticalSection(_mutex);
-		return 0;
-	}
-#endif // BX_PLATFORM_
-
-	class Mutex
-	{
-	public:
-		Mutex()
-		{
-			pthread_mutex_init(&m_handle, NULL);
-		}
-
-		~Mutex()
-		{
-			pthread_mutex_destroy(&m_handle);
-		}
-
-		void lock()
-		{
-			pthread_mutex_lock(&m_handle);
-		}
-
-		void unlock()
-		{
-			pthread_mutex_unlock(&m_handle);
-		}
-
-	private:
-		Mutex(const Mutex& _rhs); // no copy constructor
-		Mutex& operator=(const Mutex& _rhs); // no assignment operator
-
-		pthread_mutex_t m_handle;
-	};
-
-	class MutexScope
-	{
-	public:
-		MutexScope(Mutex& _mutex)
-			: m_mutex(_mutex)
-		{
-			m_mutex.lock();
-		}
-
-		~MutexScope()
-		{
-			m_mutex.unlock();
-		}
-
-	private:
-		MutexScope(); // no default constructor
-		MutexScope(const MutexScope& _rhs); // no copy constructor
-		MutexScope& operator=(const MutexScope& _rhs); // no assignment operator
-
-		Mutex& m_mutex;
-	};
-
-#if 1
-	typedef Mutex LwMutex;
-#else
-	class LwMutex
-	{
-	public:
-		LwMutex()
-			: m_count(0)
-		{
-		}
-
-		~LwMutex()
-		{
-		}
-
-		void lock()
-		{
-			if (atomicIncr(&m_count) > 1)
-			{
-				m_sem.wait();
-			}
-		}
-
-		void unlock()
-		{
-			if (atomicDecr(&m_count) > 0)
-			{
-				m_sem.post();
-			}
-		}
-
-	private:
-		LwMutex(const LwMutex& _rhs); // no copy constructor
-		LwMutex& operator=(const LwMutex& _rhs); // no assignment operator
-
-		Semaphore m_sem;
-		volatile int32_t m_count;
-	};
-#endif // 0
-
-	class LwMutexScope
-	{
-	public:
-		LwMutexScope(LwMutex& _mutex)
-			: m_mutex(_mutex)
-		{
-			m_mutex.lock();
-		}
-
-		~LwMutexScope()
-		{
-			m_mutex.unlock();
-		}
-
-	private:
-		LwMutexScope(); // no default constructor
-		LwMutexScope(const LwMutexScope& _rhs); // no copy constructor
-		LwMutexScope& operator=(const LwMutexScope& _rhs); // no assignment operator
-
-		LwMutex& m_mutex;
-	};
-
-} // namespace bx
-
-#endif // __BX_MUTEX_H__
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_MUTEX_H__
+#define __BX_MUTEX_H__
+
+#include "bx.h"
+#include "cpu.h"
+#include "sem.h"
+
+#if BX_PLATFORM_NACL || BX_PLATFORM_LINUX || BX_PLATFORM_ANDROID || BX_PLATFORM_OSX
+#	include <pthread.h>
+#elif BX_PLATFORM_WINDOWS || BX_PLATFORM_XBOX360
+#	include <errno.h>
+#endif // BX_PLATFORM_
+
+namespace bx
+{
+#if BX_PLATFORM_WINDOWS || BX_PLATFORM_XBOX360
+	typedef CRITICAL_SECTION pthread_mutex_t;
+	typedef unsigned pthread_mutexattr_t;
+
+	inline int pthread_mutex_lock(pthread_mutex_t* _mutex)
+	{
+		EnterCriticalSection(_mutex);
+		return 0;
+	}
+
+	inline int pthread_mutex_unlock(pthread_mutex_t* _mutex)
+	{
+		LeaveCriticalSection(_mutex);
+		return 0;
+	}
+
+	inline int pthread_mutex_trylock(pthread_mutex_t* _mutex)
+	{
+		return TryEnterCriticalSection(_mutex) ? 0 : EBUSY;
+	}
+
+	inline int pthread_mutex_init(pthread_mutex_t* _mutex, pthread_mutexattr_t* /*_attr*/)
+	{
+		InitializeCriticalSection(_mutex);
+		return 0;
+	}
+
+	inline int pthread_mutex_destroy(pthread_mutex_t* _mutex)
+	{
+		DeleteCriticalSection(_mutex);
+		return 0;
+	}
+#endif // BX_PLATFORM_
+
+	class Mutex
+	{
+	public:
+		Mutex()
+		{
+			pthread_mutex_init(&m_handle, NULL);
+		}
+
+		~Mutex()
+		{
+			pthread_mutex_destroy(&m_handle);
+		}
+
+		void lock()
+		{
+			pthread_mutex_lock(&m_handle);
+		}
+
+		void unlock()
+		{
+			pthread_mutex_unlock(&m_handle);
+		}
+
+	private:
+		Mutex(const Mutex& _rhs); // no copy constructor
+		Mutex& operator=(const Mutex& _rhs); // no assignment operator
+
+		pthread_mutex_t m_handle;
+	};
+
+	class MutexScope
+	{
+	public:
+		MutexScope(Mutex& _mutex)
+			: m_mutex(_mutex)
+		{
+			m_mutex.lock();
+		}
+
+		~MutexScope()
+		{
+			m_mutex.unlock();
+		}
+
+	private:
+		MutexScope(); // no default constructor
+		MutexScope(const MutexScope& _rhs); // no copy constructor
+		MutexScope& operator=(const MutexScope& _rhs); // no assignment operator
+
+		Mutex& m_mutex;
+	};
+
+#if 1
+	typedef Mutex LwMutex;
+#else
+	class LwMutex
+	{
+	public:
+		LwMutex()
+			: m_count(0)
+		{
+		}
+
+		~LwMutex()
+		{
+		}
+
+		void lock()
+		{
+			if (atomicIncr(&m_count) > 1)
+			{
+				m_sem.wait();
+			}
+		}
+
+		void unlock()
+		{
+			if (atomicDecr(&m_count) > 0)
+			{
+				m_sem.post();
+			}
+		}
+
+	private:
+		LwMutex(const LwMutex& _rhs); // no copy constructor
+		LwMutex& operator=(const LwMutex& _rhs); // no assignment operator
+
+		Semaphore m_sem;
+		volatile int32_t m_count;
+	};
+#endif // 1: LwMutex is a typedef of Mutex
+
+	class LwMutexScope
+	{
+	public:
+		LwMutexScope(LwMutex& _mutex)
+			: m_mutex(_mutex)
+		{
+			m_mutex.lock();
+		}
+
+		~LwMutexScope()
+		{
+			m_mutex.unlock();
+		}
+
+	private:
+		LwMutexScope(); // no default constructor
+		LwMutexScope(const LwMutexScope& _rhs); // no copy constructor
+		LwMutexScope& operator=(const LwMutexScope& _rhs); // no assignment operator
+
+		LwMutex& m_mutex;
+	};
+
+} // namespace bx
+
+#endif // __BX_MUTEX_H__
diff --git a/include/bx/os.h b/include/bx/os.h
index 9d073b8..447ee94 100644
--- a/include/bx/os.h
+++ b/include/bx/os.h
@@ -1,46 +1,46 @@
-/*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
-
-#ifndef __BX_OS_H__
-#define __BX_OS_H__
-
-#include "bx.h"
-
-#if BX_PLATFORM_NACL || BX_PLATFORM_ANDROID || BX_PLATFORM_LINUX || BX_PLATFORM_OSX
-#	include <sched.h> // sched_yield
-#	if BX_PLATFORM_NACL
-#		include <sys/nacl_syscalls.h> // nanosleep
-#	else
-#		include <time.h> // nanosleep
-#	endif // BX_PLATFORM_NACL
-#endif // BX_PLATFORM_
-
-namespace bx
-{
-	inline void sleep(uint32_t _ms)
-	{
-#if BX_PLATFORM_WINDOWS || BX_PLATFORM_XBOX360
-		Sleep(_ms);
-#else
-		timespec req = {(time_t)_ms/1000, (long)((_ms%1000)*1000000)};
-		timespec rem = {0, 0};
-		nanosleep(&req, &rem);
-#endif // BX_PLATFORM_
-	}
-
-	inline void yield()
-	{
-#if BX_PLATFORM_WINDOWS
-		SwitchToThread();
-#elif BX_PLATFORM_XBOX360
-		Sleep(0);
-#else
-		sched_yield();
-#endif // BX_PLATFORM_
-	}
-
-} // namespace bx
-
-#endif // __BX_OS_H__
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_OS_H__
+#define __BX_OS_H__
+
+#include "bx.h"
+
+#if BX_PLATFORM_NACL || BX_PLATFORM_ANDROID || BX_PLATFORM_LINUX || BX_PLATFORM_OSX
+#	include <sched.h> // sched_yield
+#	if BX_PLATFORM_NACL
+#		include <sys/nacl_syscalls.h> // nanosleep
+#	else
+#		include <time.h> // nanosleep
+#	endif // BX_PLATFORM_NACL
+#endif // BX_PLATFORM_
+
+namespace bx
+{
+	inline void sleep(uint32_t _ms)
+	{
+#if BX_PLATFORM_WINDOWS || BX_PLATFORM_XBOX360
+		Sleep(_ms);
+#else
+		timespec req = {(time_t)_ms/1000, (long)((_ms%1000)*1000000)};
+		timespec rem = {0, 0};
+		nanosleep(&req, &rem);
+#endif // BX_PLATFORM_
+	}
+
+	inline void yield()
+	{
+#if BX_PLATFORM_WINDOWS
+		SwitchToThread();
+#elif BX_PLATFORM_XBOX360
+		Sleep(0);
+#else
+		sched_yield();
+#endif // BX_PLATFORM_
+	}
+
+} // namespace bx
+
+#endif // __BX_OS_H__
diff --git a/include/bx/platform.h b/include/bx/platform.h
index b0b0798..50f1834 100644
--- a/include/bx/platform.h
+++ b/include/bx/platform.h
@@ -119,17 +119,17 @@
 
 #if BX_CONFIG_ENABLE_MSVC_LEVEL4_WARNINGS && BX_COMPILER_MSVC
 #	pragma warning(error:4062) // ENABLE warning C4062: enumerator'...' in switch of enum '...' is not handled
-#	pragma warning(error:4121) // ENABLE warning C4121: 'symbol' : alignment of a member was sensitive to packing
-#	pragma warning(error:4130) // ENABLE warning C4130: 'operator' : logical operation on address of string constant
-#	pragma warning(error:4239) // ENABLE warning C4239: nonstandard extension used : 'argument' : conversion from '*' to '* &' A non-const reference may only be bound to an lvalue
-//#	pragma warning(error:4244) // ENABLE warning C4244: 'conversion' conversion from 'type1' to 'type2', possible loss of data
-#	pragma warning(error:4263) // ENABLE warning C4263: 'function' : member function does not override any base class virtual member function
-#	pragma warning(error:4265) // ENABLE warning C4265: class has virtual functions, but destructor is not virtual
-#	pragma warning(error:4431) // ENABLE warning C4431: missing type specifier - int assumed. Note: C no longer supports default-int
-#	pragma warning(error:4545) // ENABLE warning C4545: expression before comma evaluates to a function which is missing an argument list
-#	pragma warning(error:4549) // ENABLE warning C4549: 'operator' : operator before comma has no effect; did you intend 'operator'?
-#	pragma warning(error:4701) // ENABLE warning C4701: potentially uninitialized local variable 'name' used
-#	pragma warning(error:4706) // ENABLE warning C4706: assignment within conditional expression
+#	pragma warning(error:4121) // ENABLE warning C4121: 'symbol' : alignment of a member was sensitive to packing
+#	pragma warning(error:4130) // ENABLE warning C4130: 'operator' : logical operation on address of string constant
+#	pragma warning(error:4239) // ENABLE warning C4239: nonstandard extension used : 'argument' : conversion from '*' to '* &' A non-const reference may only be bound to an lvalue
+//#	pragma warning(error:4244) // ENABLE warning C4244: 'conversion' conversion from 'type1' to 'type2', possible loss of data
+#	pragma warning(error:4263) // ENABLE warning C4263: 'function' : member function does not override any base class virtual member function
+#	pragma warning(error:4265) // ENABLE warning C4265: class has virtual functions, but destructor is not virtual
+#	pragma warning(error:4431) // ENABLE warning C4431: missing type specifier - int assumed. Note: C no longer supports default-int
+#	pragma warning(error:4545) // ENABLE warning C4545: expression before comma evaluates to a function which is missing an argument list
+#	pragma warning(error:4549) // ENABLE warning C4549: 'operator' : operator before comma has no effect; did you intend 'operator'?
+#	pragma warning(error:4701) // ENABLE warning C4701: potentially uninitialized local variable 'name' used
+#	pragma warning(error:4706) // ENABLE warning C4706: assignment within conditional expression
 #endif // BX_CONFIG_ENABLE_MSVC_LEVEL4_WARNINGS && BX_COMPILER_MSVC
 
 #endif // __BX_PLATFORM_H__
diff --git a/include/bx/radixsort.h b/include/bx/radixsort.h
index f5ab322..d77e210 100644
--- a/include/bx/radixsort.h
+++ b/include/bx/radixsort.h
@@ -1,111 +1,111 @@
-/*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
-
-#ifndef __BX_RADIXSORT_H__
-#define __BX_RADIXSORT_H__
-
-#include "bx.h"
-
-namespace bx
-{
-#define BX_RADIXSORT_BITS 11
-#define BX_RADIXSORT_HISTOGRAM_SIZE (1<<BX_RADIXSORT_BITS)
-#define BX_RADIXSORT_BIT_MASK (BX_RADIXSORT_HISTOGRAM_SIZE-1)
-
-	template <typename Ty>
-	void radixSort32(uint32_t* _keys, uint32_t* _tempKeys, Ty* _values, Ty* _tempValues, uint32_t _size)
-	{
-		uint16_t histogram[BX_RADIXSORT_HISTOGRAM_SIZE];
-		uint16_t shift = 0;
-		for (uint32_t pass = 0; pass < 3; ++pass)
-		{
-			memset(histogram, 0, sizeof(uint16_t)*BX_RADIXSORT_HISTOGRAM_SIZE);
-			for (uint32_t ii = 0; ii < _size; ++ii)
-			{
-				uint32_t key = _keys[ii];
-				uint16_t index = (key>>shift)&BX_RADIXSORT_BIT_MASK;
-				++histogram[index];
-			}
-
-			uint16_t offset = 0;
-			for (uint32_t ii = 0; ii < BX_RADIXSORT_HISTOGRAM_SIZE; ++ii)
-			{
-				uint16_t count = histogram[ii];
-				histogram[ii] = offset;
-				offset += count;
-			}
-
-			for (uint32_t ii = 0; ii < _size; ++ii)
-			{
-				uint32_t key = _keys[ii];
-				uint16_t index = (key>>shift)&BX_RADIXSORT_BIT_MASK;
-				uint16_t dest = histogram[index]++;
-				_tempKeys[dest] = key;
-				_tempValues[dest] = _values[ii];
-			}
-
-			uint32_t* swapKeys = _tempKeys;
-			_tempKeys = _keys;
-			_keys = swapKeys;
-
-			Ty* swapValues = _tempValues;
-			_tempValues = _values;
-			_values = swapValues;
-
-			shift += BX_RADIXSORT_BITS;
-		}
-	}
-
-	template <typename Ty>
-	void radixSort64(uint64_t* _keys, uint64_t* _tempKeys, Ty* _values, Ty* _tempValues, uint32_t _size)
-	{
-		uint16_t histogram[BX_RADIXSORT_HISTOGRAM_SIZE];
-		uint16_t shift = 0;
-		for (uint32_t pass = 0; pass < 6; ++pass)
-		{
-			memset(histogram, 0, sizeof(uint16_t)*BX_RADIXSORT_HISTOGRAM_SIZE);
-			for (uint32_t ii = 0; ii < _size; ++ii)
-			{
-				uint64_t key = _keys[ii];
-				uint16_t index = (key>>shift)&BX_RADIXSORT_BIT_MASK;
-				++histogram[index];
-			}
-
-			uint16_t offset = 0;
-			for (uint32_t ii = 0; ii < BX_RADIXSORT_HISTOGRAM_SIZE; ++ii)
-			{
-				uint16_t count = histogram[ii];
-				histogram[ii] = offset;
-				offset += count;
-			}
-
-			for (uint32_t ii = 0; ii < _size; ++ii)
-			{
-				uint64_t key = _keys[ii];
-				uint16_t index = (key>>shift)&BX_RADIXSORT_BIT_MASK;
-				uint16_t dest = histogram[index]++;
-				_tempKeys[dest] = key;
-				_tempValues[dest] = _values[ii];
-			}
-
-			uint64_t* swapKeys = _tempKeys;
-			_tempKeys = _keys;
-			_keys = swapKeys;
-
-			Ty* swapValues = _tempValues;
-			_tempValues = _values;
-			_values = swapValues;
-
-			shift += BX_RADIXSORT_BITS;
-		}
-	}
-
-#undef BX_RADIXSORT_BITS
-#undef BX_RADIXSORT_HISTOGRAM_SIZE
-#undef BX_RADIXSORT_BIT_MASK
-
-} // namespace bx
-
-#endif // __BX_RADIXSORT_H__
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#ifndef __BX_RADIXSORT_H__
+#define __BX_RADIXSORT_H__
+
+#include "bx.h"
+
+namespace bx
+{
+#define BX_RADIXSORT_BITS 11
+#define BX_RADIXSORT_HISTOGRAM_SIZE (1<<BX_RADIXSORT_BITS)
+#define BX_RADIXSORT_BIT_MASK (BX_RADIXSORT_HISTOGRAM_SIZE-1)
+
+	template <typename Ty>
+	void radixSort32(uint32_t* _keys, uint32_t* _tempKeys, Ty* _values, Ty* _tempValues, uint32_t _size)
+	{
+		uint16_t histogram[BX_RADIXSORT_HISTOGRAM_SIZE];
+		uint16_t shift = 0;
+		for (uint32_t pass = 0; pass < 3; ++pass)
+		{
+			memset(histogram, 0, sizeof(uint16_t)*BX_RADIXSORT_HISTOGRAM_SIZE);
+			for (uint32_t ii = 0; ii < _size; ++ii)
+			{
+				uint32_t key = _keys[ii];
+				uint16_t index = (key>>shift)&BX_RADIXSORT_BIT_MASK;
+				++histogram[index];
+			}
+
+			uint16_t offset = 0;
+			for (uint32_t ii = 0; ii < BX_RADIXSORT_HISTOGRAM_SIZE; ++ii)
+			{
+				uint16_t count = histogram[ii];
+				histogram[ii] = offset;
+				offset += count;
+			}
+
+			for (uint32_t ii = 0; ii < _size; ++ii)
+			{
+				uint32_t key = _keys[ii];
+				uint16_t index = (key>>shift)&BX_RADIXSORT_BIT_MASK;
+				uint16_t dest = histogram[index]++;
+				_tempKeys[dest] = key;
+				_tempValues[dest] = _values[ii];
+			}
+
+			uint32_t* swapKeys = _tempKeys;
+			_tempKeys = _keys;
+			_keys = swapKeys;
+
+			Ty* swapValues = _tempValues;
+			_tempValues = _values;
+			_values = swapValues;
+
+			shift += BX_RADIXSORT_BITS;
+		}
+	}
+
+	template <typename Ty>
+	void radixSort64(uint64_t* _keys, uint64_t* _tempKeys, Ty* _values, Ty* _tempValues, uint32_t _size)
+	{
+		uint16_t histogram[BX_RADIXSORT_HISTOGRAM_SIZE];
+		uint16_t shift = 0;
+		for (uint32_t pass = 0; pass < 6; ++pass)
+		{
+			memset(histogram, 0, sizeof(uint16_t)*BX_RADIXSORT_HISTOGRAM_SIZE);
+			for (uint32_t ii = 0; ii < _size; ++ii)
+			{
+				uint64_t key = _keys[ii];
+				uint16_t index = (key>>shift)&BX_RADIXSORT_BIT_MASK;
+				++histogram[index];
+			}
+
+			uint16_t offset = 0;
+			for (uint32_t ii = 0; ii < BX_RADIXSORT_HISTOGRAM_SIZE; ++ii)
+			{
+				uint16_t count = histogram[ii];
+				histogram[ii] = offset;
+				offset += count;
+			}
+
+			for (uint32_t ii = 0; ii < _size; ++ii)
+			{
+				uint64_t key = _keys[ii];
+				uint16_t index = (key>>shift)&BX_RADIXSORT_BIT_MASK;
+				uint16_t dest = histogram[index]++;
+				_tempKeys[dest] = key;
+				_tempValues[dest] = _values[ii];
+			}
+
+			uint64_t* swapKeys = _tempKeys;
+			_tempKeys = _keys;
+			_keys = swapKeys;
+
+			Ty* swapValues = _tempValues;
+			_tempValues = _values;
+			_values = swapValues;
+
+			shift += BX_RADIXSORT_BITS;
+		}
+	}
+
+#undef BX_RADIXSORT_BITS
+#undef BX_RADIXSORT_HISTOGRAM_SIZE
+#undef BX_RADIXSORT_BIT_MASK
+
+} // namespace bx
+
+#endif // __BX_RADIXSORT_H__
diff --git a/include/bx/readerwriter.h b/include/bx/readerwriter.h
index c343d84..ff01213 100644
--- a/include/bx/readerwriter.h
+++ b/include/bx/readerwriter.h
@@ -1,270 +1,270 @@
-/*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
- 
-#ifndef __BX_READERWRITER_H__
-#define __BX_READERWRITER_H__
-
-#include <stdio.h>
-#include <string.h>
-
-#include "bx.h"
-#include "uint32_t.h"
-
-#if BX_COMPILER_MSVC
-#	define fseeko64 _fseeki64
-#	define ftello64 _ftelli64
-#elif BX_PLATFORM_OSX
-#	define fseeko64 fseeko
-#	define ftello64 ftello
-#endif // BX_
-
-namespace bx
-{
-	struct Whence
-	{
-		enum Enum
-		{
-			Begin,
-			Current,
-			End,
-		};
-	};
-
-	struct BX_NO_VTABLE ReaderI
-	{
-		virtual ~ReaderI() = 0;
-		virtual int32_t read(void* _data, int32_t _size) = 0;
-	};
-
-	inline ReaderI::~ReaderI()
-	{
-	}
-
-	struct BX_NO_VTABLE WriterI
-	{
-		virtual ~WriterI() = 0;
-		virtual int32_t write(const void* _data, int32_t _size) = 0;
-	};
-
-	inline WriterI::~WriterI()
-	{
-	}
-
-	struct BX_NO_VTABLE SeekerI
-	{
-		virtual ~SeekerI() = 0;
-		virtual int64_t seek(int64_t _offset = 0, Whence::Enum _whence = Whence::Current) = 0;
-	};
-
-	inline SeekerI::~SeekerI()
-	{
-	}
-
-	inline int32_t read(ReaderI* _reader, void* _data, int32_t _size)
-	{
-		return _reader->read(_data, _size);
-	}
-
-	template<typename Ty>
-	inline int32_t read(ReaderI* _reader, Ty& _value)
-	{
-		return _reader->read(&_value, sizeof(Ty) );
-	}
-
-	inline int32_t write(WriterI* _writer, const void* _data, int32_t _size)
-	{
-		return _writer->write(_data, _size);
-	}
-
-	template<typename Ty>
-	inline int32_t write(WriterI* _writer, const Ty& _value)
-	{
-		return _writer->write(&_value, sizeof(Ty) );
-	}
-
-	inline int64_t skip(SeekerI* _seeker, int64_t _offset)
-	{
-		return _seeker->seek(_offset, Whence::Current);
-	}
-
-	inline int64_t getSize(SeekerI* _seeker)
-	{
-		int64_t offset = _seeker->seek();
-		int64_t size = _seeker->seek(0, Whence::End);
-		_seeker->seek(offset, Whence::Begin);
-		return size;
-	}
-
-	struct BX_NO_VTABLE ReaderSeekerI : public ReaderI, public SeekerI
-	{
-	};
-
-	struct BX_NO_VTABLE WriterSeekerI : public WriterI, public SeekerI
-	{
-	};
-
-	struct BX_NO_VTABLE FileReaderI : public ReaderSeekerI
-	{
-		virtual int32_t open(const char* _filePath) = 0;
-		virtual int32_t close() = 0;
-	};
-
-	struct BX_NO_VTABLE FileWriterI : public WriterSeekerI
-	{
-		virtual int32_t open(const char* _filePath, bool _append = false) = 0;
-		virtual int32_t close() = 0;
-	};
-
-	struct BX_NO_VTABLE MemoryBlockI
-	{
-		virtual void* more(uint32_t _size = 0) = 0;
-		virtual uint32_t getSize() = 0;
-	};
-
-	class StaticMemoryBlock : public MemoryBlockI
-	{
-	public:
-		StaticMemoryBlock(void* _data, uint32_t _size)
-			: m_data(_data)
-			, m_size(_size)
-		{
-		}
-
-		virtual ~StaticMemoryBlock()
-		{
-		}
-
-		virtual void* more(uint32_t /*_size*/ = 0) BX_OVERRIDE
-		{
-			return m_data;
-		}
-
-		virtual uint32_t getSize() BX_OVERRIDE
-		{
-			return m_size;
-		}
-
-	private:
-		void* m_data;
-		uint32_t m_size;
-	};
-
-	inline int64_t int64_min(int64_t _a, int64_t _b)
-	{
-		return _a < _b ? _a : _b;
-	}
-
-	inline int64_t int64_max(int64_t _a, int64_t _b)
-	{
-		return _a > _b ? _a : _b;
-	}
-
-	inline int64_t int64_clamp(int64_t _a, int64_t _min, int64_t _max)
-	{
-		const int64_t min    = int64_min(_a, _max);
-		const int64_t result = int64_max(_min, min);
-
-		return result;
-	}
-
-	class SizerWriter : public WriterSeekerI
-	{
-	public:
-		SizerWriter()
-			: m_pos(0)
-			, m_top(0)
-		{
-		}
-
-		virtual ~SizerWriter()
-		{
-		}
-
-		virtual int64_t seek(int64_t _offset = 0, Whence::Enum _whence = Whence::Current) BX_OVERRIDE
-		{
-			switch (_whence)
-			{
-			case Whence::Begin:
-				m_pos = _offset;
-				break;
-
-			case Whence::Current:
-				m_pos = int64_clamp(m_pos + _offset, 0, m_top);
-				break;
-
-			case Whence::End:
-				m_pos = int64_clamp(m_top - _offset, 0, m_top);
-				break;
-			}
-
-			return m_pos;
-		}
-
-		virtual int32_t write(const void* /*_data*/, int32_t _size) BX_OVERRIDE
-		{
-			int32_t morecore = int32_t(m_pos - m_top) + _size;
-
-			if (0 < morecore)
-			{
-				m_top += morecore;
-			}
-
-			int64_t reminder = m_top-m_pos;
-			int32_t size = uint32_min(_size, int32_t(reminder > INT32_MAX ? INT32_MAX : reminder) );
-			m_pos += size;
-			return size;
-		}
-
-	private:
-		int64_t m_pos;
-		int64_t m_top;
-	};
-
-	class MemoryReader : public ReaderSeekerI
-	{
-	public:
-		MemoryReader(const void* _data, uint32_t _size)
-			: m_data( (const uint8_t*)_data)
-			, m_pos(0)
-			, m_top(_size)
-		{
-		}
-
-		virtual ~MemoryReader()
-		{
-		}
-
-		virtual int64_t seek(int64_t _offset, Whence::Enum _whence) BX_OVERRIDE
-		{
-			switch (_whence)
-			{
-				case Whence::Begin:
-					m_pos = _offset;
-					break;
-
-				case Whence::Current:
-					m_pos = int64_clamp(m_pos + _offset, 0, m_top);
-					break;
-
-				case Whence::End:
-					m_pos = int64_clamp(m_top - _offset, 0, m_top);
-					break;
-			}
-
-			return m_pos;
-		}
-
-		virtual int32_t read(void* _data, int32_t _size) BX_OVERRIDE
-		{
-			int64_t reminder = m_top-m_pos;
-			int32_t size = uint32_min(_size, int32_t(reminder > INT32_MAX ? INT32_MAX : reminder) );
-			memcpy(_data, &m_data[m_pos], size);
-			m_pos += size;
-			return size;
-		}
-
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+ 
+#ifndef __BX_READERWRITER_H__
+#define __BX_READERWRITER_H__
+
+#include <stdio.h>
+#include <string.h>
+
+#include "bx.h"
+#include "uint32_t.h"
+
+#if BX_COMPILER_MSVC
+#	define fseeko64 _fseeki64
+#	define ftello64 _ftelli64
+#elif BX_PLATFORM_OSX
+#	define fseeko64 fseeko
+#	define ftello64 ftello
+#endif // BX_
+
+namespace bx
+{
+	struct Whence
+	{
+		enum Enum
+		{
+			Begin,
+			Current,
+			End,
+		};
+	};
+
+	struct BX_NO_VTABLE ReaderI
+	{
+		virtual ~ReaderI() = 0;
+		virtual int32_t read(void* _data, int32_t _size) = 0;
+	};
+
+	inline ReaderI::~ReaderI()
+	{
+	}
+
+	struct BX_NO_VTABLE WriterI
+	{
+		virtual ~WriterI() = 0;
+		virtual int32_t write(const void* _data, int32_t _size) = 0;
+	};
+
+	inline WriterI::~WriterI()
+	{
+	}
+
+	struct BX_NO_VTABLE SeekerI
+	{
+		virtual ~SeekerI() = 0;
+		virtual int64_t seek(int64_t _offset = 0, Whence::Enum _whence = Whence::Current) = 0;
+	};
+
+	inline SeekerI::~SeekerI()
+	{
+	}
+
+	inline int32_t read(ReaderI* _reader, void* _data, int32_t _size)
+	{
+		return _reader->read(_data, _size);
+	}
+
+	template<typename Ty>
+	inline int32_t read(ReaderI* _reader, Ty& _value)
+	{
+		return _reader->read(&_value, sizeof(Ty) );
+	}
+
+	inline int32_t write(WriterI* _writer, const void* _data, int32_t _size)
+	{
+		return _writer->write(_data, _size);
+	}
+
+	template<typename Ty>
+	inline int32_t write(WriterI* _writer, const Ty& _value)
+	{
+		return _writer->write(&_value, sizeof(Ty) );
+	}
+
+	inline int64_t skip(SeekerI* _seeker, int64_t _offset)
+	{
+		return _seeker->seek(_offset, Whence::Current);
+	}
+
+	inline int64_t getSize(SeekerI* _seeker)
+	{
+		int64_t offset = _seeker->seek();
+		int64_t size = _seeker->seek(0, Whence::End);
+		_seeker->seek(offset, Whence::Begin);
+		return size;
+	}
+
+	struct BX_NO_VTABLE ReaderSeekerI : public ReaderI, public SeekerI
+	{
+	};
+
+	struct BX_NO_VTABLE WriterSeekerI : public WriterI, public SeekerI
+	{
+	};
+
+	struct BX_NO_VTABLE FileReaderI : public ReaderSeekerI
+	{
+		virtual int32_t open(const char* _filePath) = 0;
+		virtual int32_t close() = 0;
+	};
+
+	struct BX_NO_VTABLE FileWriterI : public WriterSeekerI
+	{
+		virtual int32_t open(const char* _filePath, bool _append = false) = 0;
+		virtual int32_t close() = 0;
+	};
+
+	struct BX_NO_VTABLE MemoryBlockI
+	{
+		virtual void* more(uint32_t _size = 0) = 0;
+		virtual uint32_t getSize() = 0;
+	};
+
+	class StaticMemoryBlock : public MemoryBlockI
+	{
+	public:
+		StaticMemoryBlock(void* _data, uint32_t _size)
+			: m_data(_data)
+			, m_size(_size)
+		{
+		}
+
+		virtual ~StaticMemoryBlock()
+		{
+		}
+
+		virtual void* more(uint32_t /*_size*/ = 0) BX_OVERRIDE
+		{
+			return m_data;
+		}
+
+		virtual uint32_t getSize() BX_OVERRIDE
+		{
+			return m_size;
+		}
+
+	private:
+		void* m_data;
+		uint32_t m_size;
+	};
+
+	inline int64_t int64_min(int64_t _a, int64_t _b)
+	{
+		return _a < _b ? _a : _b;
+	}
+
+	inline int64_t int64_max(int64_t _a, int64_t _b)
+	{
+		return _a > _b ? _a : _b;
+	}
+
+	inline int64_t int64_clamp(int64_t _a, int64_t _min, int64_t _max)
+	{
+		const int64_t min    = int64_min(_a, _max);
+		const int64_t result = int64_max(_min, min);
+
+		return result;
+	}
+
+	class SizerWriter : public WriterSeekerI
+	{
+	public:
+		SizerWriter()
+			: m_pos(0)
+			, m_top(0)
+		{
+		}
+
+		virtual ~SizerWriter()
+		{
+		}
+
+		virtual int64_t seek(int64_t _offset = 0, Whence::Enum _whence = Whence::Current) BX_OVERRIDE
+		{
+			switch (_whence)
+			{
+			case Whence::Begin:
+				m_pos = _offset;
+				break;
+
+			case Whence::Current:
+				m_pos = int64_clamp(m_pos + _offset, 0, m_top);
+				break;
+
+			case Whence::End:
+				m_pos = int64_clamp(m_top - _offset, 0, m_top);
+				break;
+			}
+
+			return m_pos;
+		}
+
+		virtual int32_t write(const void* /*_data*/, int32_t _size) BX_OVERRIDE
+		{
+			int32_t morecore = int32_t(m_pos - m_top) + _size;
+
+			if (0 < morecore)
+			{
+				m_top += morecore;
+			}
+
+			int64_t reminder = m_top-m_pos;
+			int32_t size = uint32_min(_size, int32_t(reminder > INT32_MAX ? INT32_MAX : reminder) );
+			m_pos += size;
+			return size;
+		}
+
+	private:
+		int64_t m_pos;
+		int64_t m_top;
+	};
+
+	class MemoryReader : public ReaderSeekerI
+	{
+	public:
+		MemoryReader(const void* _data, uint32_t _size)
+			: m_data( (const uint8_t*)_data)
+			, m_pos(0)
+			, m_top(_size)
+		{
+		}
+
+		virtual ~MemoryReader()
+		{
+		}
+
+		virtual int64_t seek(int64_t _offset, Whence::Enum _whence) BX_OVERRIDE
+		{
+			switch (_whence)
+			{
+				case Whence::Begin:
+					m_pos = _offset;
+					break;
+
+				case Whence::Current:
+					m_pos = int64_clamp(m_pos + _offset, 0, m_top);
+					break;
+
+				case Whence::End:
+					m_pos = int64_clamp(m_top - _offset, 0, m_top);
+					break;
+			}
+
+			return m_pos;
+		}
+
+		virtual int32_t read(void* _data, int32_t _size) BX_OVERRIDE
+		{
+			int64_t reminder = m_top-m_pos;
+			int32_t size = uint32_min(_size, int32_t(reminder > INT32_MAX ? INT32_MAX : reminder) );
+			memcpy(_data, &m_data[m_pos], size);
+			m_pos += size;
+			return size;
+		}
+
 		const uint8_t* getDataPtr() const
 		{
 			return &m_data[m_pos];
@@ -280,180 +280,180 @@ namespace bx
 			return m_top-m_pos;
 		}
 
-	private:
-		const uint8_t* m_data;
-		int64_t m_pos;
-		int64_t m_top;
-	};
-
-	class MemoryWriter : public WriterSeekerI
-	{
-	public:
-		MemoryWriter(MemoryBlockI* _memBlock)
-			: m_memBlock(_memBlock)
-			, m_data(NULL)
-			, m_pos(0)
-			, m_top(0)
-			, m_size(0)
-		{
-		}
-
-		virtual ~MemoryWriter()
-		{
-		}
-
-		virtual int64_t seek(int64_t _offset = 0, Whence::Enum _whence = Whence::Current) BX_OVERRIDE
-		{
-			switch (_whence)
-			{
-				case Whence::Begin:
-					m_pos = _offset;
-					break;
-
-				case Whence::Current:
-					m_pos = int64_clamp(m_pos + _offset, 0, m_top);
-					break;
-
-				case Whence::End:
-					m_pos = int64_clamp(m_top - _offset, 0, m_top);
-					break;
-			}
-
-			return m_pos;
-		}
-
-		virtual int32_t write(const void* _data, int32_t _size) BX_OVERRIDE
-		{
-			int32_t morecore = int32_t(m_pos - m_size) + _size;
-
-			if (0 < morecore)
-			{
-				morecore = BX_ALIGN_MASK(morecore, 0xfff);
-				m_data = (uint8_t*)m_memBlock->more(morecore);
-				m_size = m_memBlock->getSize();
-			}
-
-			int64_t reminder = m_size-m_pos;
-			int32_t size = uint32_min(_size, int32_t(reminder > INT32_MAX ? INT32_MAX : reminder) );
-			memcpy(&m_data[m_pos], _data, size);
-			m_pos += size;
-			m_top = int64_max(m_top, m_pos);
-			return size;
-		}
-
-	private:
-		MemoryBlockI* m_memBlock;
-		uint8_t* m_data;
-		int64_t m_pos;
-		int64_t m_top;
-		int64_t m_size;
-	};
-
-	class StaticMemoryBlockWriter : public MemoryWriter
-	{
-	public:
-		StaticMemoryBlockWriter(void* _data, uint32_t _size)
-			: MemoryWriter(&m_smb)
-			, m_smb(_data, _size)
-		{
-		}
-
-		~StaticMemoryBlockWriter()
-		{
-		}
-
-	private:
-		StaticMemoryBlock m_smb;
-	};
-
-#if BX_CONFIG_CRT_FILE_READER_WRITER
-	class CrtFileReader : public FileReaderI
-	{
-	public:
-		CrtFileReader()
-			: m_file(NULL)
-		{
-		}
-
-		virtual ~CrtFileReader()
-		{
-		}
-
-		virtual int32_t open(const char* _filePath) BX_OVERRIDE
-		{
-			m_file = fopen(_filePath, "rb");
-			return NULL == m_file;
-		}
-
-		virtual int32_t close() BX_OVERRIDE
-		{
-			fclose(m_file);
-			return 0;
-		}
-
-		virtual int64_t seek(int64_t _offset = 0, Whence::Enum _whence = Whence::Current) BX_OVERRIDE
-		{
-			fseeko64(m_file, _offset, _whence);
-			return ftello64(m_file);
-		}
-
-		virtual int32_t read(void* _data, int32_t _size) BX_OVERRIDE
-		{
-			return (int32_t)fread(_data, 1, _size, m_file);
-		}
-
-	private:
-		FILE* m_file;
-	};
-
-	class CrtFileWriter : public FileWriterI
-	{
-	public:
-		CrtFileWriter()
-			: m_file(NULL)
-		{
-		}
-
-		virtual ~CrtFileWriter()
-		{
-		}
-
-		virtual int32_t open(const char* _filePath, bool _append = false) BX_OVERRIDE
-		{
-			if (_append)
-			{
-				m_file = fopen(_filePath, "ab");
-			}
-			else
-			{
-				m_file = fopen(_filePath, "wb");
-			}
-
-			return NULL == m_file;
-		}
-
-		virtual int32_t close() BX_OVERRIDE
-		{
-			fclose(m_file);
-			return 0;
-		}
-
-		virtual int64_t seek(int64_t _offset = 0, Whence::Enum _whence = Whence::Current) BX_OVERRIDE
-		{
-			fseeko64(m_file, _offset, _whence);
-			return ftello64(m_file);
-		}
-
-		virtual int32_t write(const void* _data, int32_t _size) BX_OVERRIDE
-		{
-			return (int32_t)fwrite(_data, 1, _size, m_file);
-		}
-
-	private:
-		FILE* m_file;
-	};
-#endif // BX_CONFIG_CRT_FILE_READER_WRITER
-
-} // namespace bx
-
-#endif // __BX_READERWRITER_H__
+	private:
+		const uint8_t* m_data;
+		int64_t m_pos;
+		int64_t m_top;
+	};
+
+	class MemoryWriter : public WriterSeekerI
+	{
+	public:
+		MemoryWriter(MemoryBlockI* _memBlock)
+			: m_memBlock(_memBlock)
+			, m_data(NULL)
+			, m_pos(0)
+			, m_top(0)
+			, m_size(0)
+		{
+		}
+
+		virtual ~MemoryWriter()
+		{
+		}
+
+		virtual int64_t seek(int64_t _offset = 0, Whence::Enum _whence = Whence::Current) BX_OVERRIDE
+		{
+			switch (_whence)
+			{
+				case Whence::Begin:
+					m_pos = _offset;
+					break;
+
+				case Whence::Current:
+					m_pos = int64_clamp(m_pos + _offset, 0, m_top);
+					break;
+
+				case Whence::End:
+					m_pos = int64_clamp(m_top - _offset, 0, m_top);
+					break;
+			}
+
+			return m_pos;
+		}
+
+		virtual int32_t write(const void* _data, int32_t _size) BX_OVERRIDE
+		{
+			int32_t morecore = int32_t(m_pos - m_size) + _size;
+
+			if (0 < morecore)
+			{
+				morecore = BX_ALIGN_MASK(morecore, 0xfff);
+				m_data = (uint8_t*)m_memBlock->more(morecore);
+				m_size = m_memBlock->getSize();
+			}
+
+			int64_t reminder = m_size-m_pos;
+			int32_t size = uint32_min(_size, int32_t(reminder > INT32_MAX ? INT32_MAX : reminder) );
+			memcpy(&m_data[m_pos], _data, size);
+			m_pos += size;
+			m_top = int64_max(m_top, m_pos);
+			return size;
+		}
+
+	private:
+		MemoryBlockI* m_memBlock;
+		uint8_t* m_data;
+		int64_t m_pos;
+		int64_t m_top;
+		int64_t m_size;
+	};
+
+	class StaticMemoryBlockWriter : public MemoryWriter
+	{
+	public:
+		StaticMemoryBlockWriter(void* _data, uint32_t _size)
+			: MemoryWriter(&m_smb)
+			, m_smb(_data, _size)
+		{
+		}
+
+		~StaticMemoryBlockWriter()
+		{
+		}
+
+	private:
+		StaticMemoryBlock m_smb;
+	};
+
+#if BX_CONFIG_CRT_FILE_READER_WRITER
+	class CrtFileReader : public FileReaderI
+	{
+	public:
+		CrtFileReader()
+			: m_file(NULL)
+		{
+		}
+
+		virtual ~CrtFileReader()
+		{
+		}
+
+		virtual int32_t open(const char* _filePath) BX_OVERRIDE
+		{
+			m_file = fopen(_filePath, "rb");
+			return NULL == m_file;
+		}
+
+		virtual int32_t close() BX_OVERRIDE
+		{
+			fclose(m_file);
+			return 0;
+		}
+
+		virtual int64_t seek(int64_t _offset = 0, Whence::Enum _whence = Whence::Current) BX_OVERRIDE
+		{
+			fseeko64(m_file, _offset, _whence);
+			return ftello64(m_file);
+		}
+
+		virtual int32_t read(void* _data, int32_t _size) BX_OVERRIDE
+		{
+			return (int32_t)fread(_data, 1, _size, m_file);
+		}
+
+	private:
+		FILE* m_file;
+	};
+
+	class CrtFileWriter : public FileWriterI
+	{
+	public:
+		CrtFileWriter()
+			: m_file(NULL)
+		{
+		}
+
+		virtual ~CrtFileWriter()
+		{
+		}
+
+		virtual int32_t open(const char* _filePath, bool _append = false) BX_OVERRIDE
+		{
+			if (_append)
+			{
+				m_file = fopen(_filePath, "ab");
+			}
+			else
+			{
+				m_file = fopen(_filePath, "wb");
+			}
+
+			return NULL == m_file;
+		}
+
+		virtual int32_t close() BX_OVERRIDE
+		{
+			fclose(m_file);
+			return 0;
+		}
+
+		virtual int64_t seek(int64_t _offset = 0, Whence::Enum _whence = Whence::Current) BX_OVERRIDE
+		{
+			fseeko64(m_file, _offset, _whence);
+			return ftello64(m_file);
+		}
+
+		virtual int32_t write(const void* _data, int32_t _size) BX_OVERRIDE
+		{
+			return (int32_t)fwrite(_data, 1, _size, m_file);
+		}
+
+	private:
+		FILE* m_file;
+	};
+#endif // BX_CONFIG_CRT_FILE_READER_WRITER
+
+} // namespace bx
+
+#endif // __BX_READERWRITER_H__
diff --git a/include/bx/uint32_t.h b/include/bx/uint32_t.h
index 2bcab80..1b13a72 100644
--- a/include/bx/uint32_t.h
+++ b/include/bx/uint32_t.h
@@ -1,455 +1,455 @@
-/*
- * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
- * License: http://www.opensource.org/licenses/BSD-2-Clause
- */
-
-// Copyright 2006 Mike Acton <macton@gmail.com>
-//
-// Permission is hereby granted, free of charge, to any person obtaining a
-// copy of this software and associated documentation files (the "Software"),
-// to deal in the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included
-// in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-// THE SOFTWARE
-
-#ifndef __BX_UINT32_T_H__
-#define __BX_UINT32_T_H__
-
-#include "bx.h"
-
-#if BX_COMPILER_MSVC
-#	if BX_PLATFORM_WINDOWS
-#		include <math.h> // math.h is included because VS bitches:
-						 // warning C4985: 'ceil': attributes not present on previous declaration.
-						 // must be included before intrin.h.
-#		include <intrin.h>
-#		pragma intrinsic(_BitScanForward)
-#		pragma intrinsic(_BitScanReverse)
-#	endif // BX_PLATFORM_WINDOWS
-#endif // BX_COMPILER_MSVC
-
-namespace bx
-{
-	inline uint32_t uint32_li(uint32_t _a)
-	{
-		return _a;
-	}
-
-	inline uint32_t uint32_dec(uint32_t _a)
-	{
-		return _a - 1;
-	}
-
-	inline uint32_t uint32_inc(uint32_t _a)
-	{
-		return _a + 1;
-	}
-
-	inline uint32_t uint32_not(uint32_t _a)
-	{
-		return ~_a;
-	}
-
-	inline uint32_t uint32_neg(uint32_t _a)
-	{
-		return -(int32_t)_a;
-	}
-
-	inline uint32_t uint32_ext(uint32_t _a)
-	{
-		return ( (int32_t)_a)>>31;
-	}
-
-	inline uint32_t uint32_and(uint32_t _a, uint32_t _b)
-	{
-		return _a & _b;
-	}
-
-	inline uint32_t uint32_xor(uint32_t _a, uint32_t _b)
-	{
-		return _a ^ _b;
-	}
-
-	inline uint32_t uint32_xorl(uint32_t _a, uint32_t _b)
-	{
-		return !_a != !_b;
-	}
-
-	inline uint32_t uint32_andc(uint32_t _a, uint32_t _b)
-	{
-		return _a & ~_b;
-	}
-
-	inline uint32_t uint32_or(uint32_t _a, uint32_t _b)
-	{
-		return _a | _b;
-	}
-
-	inline uint32_t uint32_sll(uint32_t _a, int _sa)
-	{
-		return _a << _sa;
-	}
-
-	inline uint32_t uint32_srl(uint32_t _a, int _sa)
-	{
-		return _a >> _sa;
-	}
-
-	inline uint32_t uint32_sra(uint32_t _a, int _sa)
-	{
-		return ( (int32_t)_a) >> _sa;
-	}
-
-	inline uint32_t uint32_rol(uint32_t _a, int _sa)
-	{
-		return ( _a << _sa) | (_a >> (32-_sa) );
-	}
-
-	inline uint32_t uint32_ror(uint32_t _a, int _sa)
-	{
-		return ( _a >> _sa) | (_a << (32-_sa) );
-	}
-
-	inline uint32_t uint32_add(uint32_t _a, uint32_t _b)
-	{
-		return _a + _b;
-	}
-
-	inline uint32_t uint32_sub(uint32_t _a, uint32_t _b)
-	{
-		return _a - _b;
-	}
-
-	inline uint32_t uint32_mul(uint32_t _a, uint32_t _b)
-	{
-		return _a * _b;
-	}
-
-	inline uint32_t uint32_div(uint32_t _a, uint32_t _b)
-	{
-		return (_a / _b);
-	}
-
-	inline uint32_t uint32_mod(uint32_t _a, uint32_t _b)
-	{
-		return (_a % _b);
-	}
-
-	inline uint32_t uint32_cmpeq(uint32_t _a, uint32_t _b)
-	{
-		return -(_a == _b);
-	}
-
-	inline uint32_t uint32_cmpneq(uint32_t _a, uint32_t _b)
-	{
-		return -(_a != _b);
-	}
-
-	inline uint32_t uint32_cmplt(uint32_t _a, uint32_t _b)
-	{
-		return -(_a < _b);
-	}
-
-	inline uint32_t uint32_cmple(uint32_t _a, uint32_t _b)
-	{
-		return -(_a <= _b);
-	}
-
-	inline uint32_t uint32_cmpgt(uint32_t _a, uint32_t _b)
-	{
-		return -(_a > _b);
-	}
-
-	inline uint32_t uint32_cmpge(uint32_t _a, uint32_t _b)
-	{
-		return -(_a >= _b);
-	}
-
-	inline uint32_t uint32_setnz(uint32_t _a)
-	{
-		return -!!_a;
-	}
-
-	inline uint32_t uint32_satadd(uint32_t _a, uint32_t _b)
-	{
-		const uint32_t add    = uint32_add(_a, _b);
-		const uint32_t lt     = uint32_cmplt(add, _a);
-		const uint32_t result = uint32_or(add, lt);
-
-		return result;
-	}
-
-	inline uint32_t uint32_satsub(uint32_t _a, uint32_t _b)
-	{
-		const uint32_t sub    = uint32_sub(_a, _b);
-		const uint32_t le     = uint32_cmple(sub, _a);
-		const uint32_t result = uint32_and(sub, le);
-
-		return result;
-	}
-
-	inline uint32_t uint32_satmul(uint32_t _a, uint32_t _b)
-	{
-		const uint64_t mul    = (uint64_t)_a * (uint64_t)_b;
-		const uint32_t hi     = mul >> 32;
-		const uint32_t nz     = uint32_setnz(hi);
-		const uint32_t result = uint32_or(uint32_t(mul), nz);
-
-		return result;
-	}
-
-	inline uint32_t uint32_sels(uint32_t test, uint32_t _a, uint32_t _b)
-	{
-		const uint32_t mask   = uint32_ext(test);
-		const uint32_t sel_a  = uint32_and(_a, mask);
-		const uint32_t sel_b  = uint32_andc(_b, mask);
-		const uint32_t result = uint32_or(sel_a, sel_b);
-
-		return (result);
-	}
-
-	inline uint32_t uint32_selb(uint32_t _mask, uint32_t _a, uint32_t _b)
-	{
-		const uint32_t sel_a  = uint32_and(_a, _mask);
-		const uint32_t sel_b  = uint32_andc(_b, _mask);
-		const uint32_t result = uint32_or(sel_a, sel_b);
-
-		return (result);
-	}
-
-	inline uint32_t uint32_imin(uint32_t _a, uint32_t _b)
-	{
-		const uint32_t a_sub_b = uint32_sub(_a, _b);
-		const uint32_t result  = uint32_sels(a_sub_b, _a, _b);
-
-		return result;
-	}
-
-	inline uint32_t uint32_imax(uint32_t _a, uint32_t _b)
-	{
-		const uint32_t b_sub_a = uint32_sub(_b, _a);
-		const uint32_t result  = uint32_sels(b_sub_a, _a, _b);
-
-		return result;
-	}
-
-	inline uint32_t uint32_min(uint32_t _a, uint32_t _b)
-	{
-		return _a > _b ? _b : _a;
-	}
-
-	inline uint32_t uint32_max(uint32_t _a, uint32_t _b)
-	{
-		return _a > _b ? _a : _b;
-	}
-
-	inline uint32_t uint32_incwrap(uint32_t _val, uint32_t _min, uint32_t _max)
-	{
-		const uint32_t inc          = uint32_inc(_val);
-		const uint32_t max_diff     = uint32_sub(_max, _val);
-		const uint32_t neg_max_diff = uint32_neg(max_diff);
-		const uint32_t max_or       = uint32_or(max_diff, neg_max_diff);
-		const uint32_t max_diff_nz  = uint32_ext(max_or);
-		const uint32_t result       = uint32_selb(max_diff_nz, inc, _min);
-
-		return result;
-	}
-
-	inline uint32_t uint32_decwrap(uint32_t _val, uint32_t _min, uint32_t _max)
-	{
-		const uint32_t dec          = uint32_dec(_val);
-		const uint32_t min_diff     = uint32_sub(_min, _val);
-		const uint32_t neg_min_diff = uint32_neg(min_diff);
-		const uint32_t min_or       = uint32_or(min_diff, neg_min_diff);
-		const uint32_t min_diff_nz  = uint32_ext(min_or);
-		const uint32_t result       = uint32_selb(min_diff_nz, dec, _max);
-
-		return result;
-	}
-
-	inline uint32_t uint32_cntbits_ref(uint32_t _val)
-	{
-		const uint32_t tmp0   = uint32_srl(_val, 1);
-		const uint32_t tmp1   = uint32_and(tmp0, 0x55555555);
-		const uint32_t tmp2   = uint32_sub(_val, tmp1);
-		const uint32_t tmp3   = uint32_and(tmp2, 0xc30c30c3);
-		const uint32_t tmp4   = uint32_srl(tmp2, 2);
-		const uint32_t tmp5   = uint32_and(tmp4, 0xc30c30c3);
-		const uint32_t tmp6   = uint32_srl(tmp2, 4);
-		const uint32_t tmp7   = uint32_and(tmp6, 0xc30c30c3);
-		const uint32_t tmp8   = uint32_add(tmp3, tmp5);
-		const uint32_t tmp9   = uint32_add(tmp7, tmp8);
-		const uint32_t tmpA   = uint32_srl(tmp9, 6);
-		const uint32_t tmpB   = uint32_add(tmp9, tmpA);
-		const uint32_t tmpC   = uint32_srl(tmpB, 12);
-		const uint32_t tmpD   = uint32_srl(tmpB, 24);
-		const uint32_t tmpE   = uint32_add(tmpB, tmpC);
-		const uint32_t tmpF   = uint32_add(tmpD, tmpE);
-		const uint32_t result = uint32_and(tmpF, 0x3f);
-
-		return result;
-	}
-
-	/// Count number of bits set.
-	inline uint32_t uint32_cntbits(uint32_t _val)
-	{
-#if BX_COMPILER_GCC
-		return __builtin_popcount(_val);
-#elif BX_COMPILER_MSVC && BX_PLATFORM_WINDOWS
-		return __popcnt(_val);
-#else
-		return uint32_cntbits_ref(_val);
-#endif // BX_COMPILER_GCC
-	}
-
-	inline uint32_t uint32_cntlz_ref(uint32_t _val)
-	{
-		const uint32_t tmp0   = uint32_srl(_val, 1);
-		const uint32_t tmp1   = uint32_or(tmp0, _val);
-		const uint32_t tmp2   = uint32_srl(tmp1, 2);
-		const uint32_t tmp3   = uint32_or(tmp2, tmp1);
-		const uint32_t tmp4   = uint32_srl(tmp3, 4);
-		const uint32_t tmp5   = uint32_or(tmp4, tmp3);
-		const uint32_t tmp6   = uint32_srl(tmp5, 8);
-		const uint32_t tmp7   = uint32_or(tmp6, tmp5);
-		const uint32_t tmp8   = uint32_srl(tmp7, 16);
-		const uint32_t tmp9   = uint32_or(tmp8, tmp7);
-		const uint32_t tmpA   = uint32_not(tmp9);
-		const uint32_t result = uint32_cntbits(tmpA);
-
-		return result;
-	}
-
-	/// Count number of leading zeros.
-	inline uint32_t uint32_cntlz(uint32_t _val)
-	{
-#if BX_COMPILER_GCC
-		return __builtin_clz(_val);
-#elif BX_COMPILER_MSVC && BX_PLATFORM_WINDOWS
-		unsigned long index;
-		_BitScanReverse(&index, _val);
-		return 31 - index;
-#else
-		return uint32_cntlz_ref(_val);
-#endif // BX_COMPILER_
-	}
-
-	inline uint32_t uint32_cnttz_ref(uint32_t _val)
-	{
-		const uint32_t tmp0   = uint32_not(_val);
-		const uint32_t tmp1   = uint32_dec(_val);
-		const uint32_t tmp2   = uint32_and(tmp0, tmp1);
-		const uint32_t result = uint32_cntbits(tmp2);
-
-		return result;
-	}
-
-	inline uint32_t uint32_cnttz(uint32_t _val)
-	{
-#if BX_COMPILER_MSVC && BX_PLATFORM_WINDOWS
-		unsigned long index;
-		_BitScanForward(&index, _val);
-		return index;
-#else
-		return uint32_cnttz_ref(_val);
-#endif // BX_COMPILER_
-	}
-
-	// shuffle:
-	// ---- ---- ---- ---- fedc ba98 7654 3210
-	// to:
-	// -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
-	inline uint32_t uint32_part1by1(uint32_t _a)
-	{
-		const uint32_t val    = uint32_and(_a, 0xffff);
-
-		const uint32_t tmp0   = uint32_sll(val, 8);
-		const uint32_t tmp1   = uint32_xor(val, tmp0);
-		const uint32_t tmp2   = uint32_and(tmp1, 0x00ff00ff);
-
-		const uint32_t tmp3   = uint32_sll(tmp2, 4);
-		const uint32_t tmp4   = uint32_xor(tmp2, tmp3);
-		const uint32_t tmp5   = uint32_and(tmp4, 0x0f0f0f0f);
-
-		const uint32_t tmp6   = uint32_sll(tmp5, 2);
-		const uint32_t tmp7   = uint32_xor(tmp5, tmp6);
-		const uint32_t tmp8   = uint32_and(tmp7, 0x33333333);
-
-		const uint32_t tmp9   = uint32_sll(tmp8, 1);
-		const uint32_t tmpA   = uint32_xor(tmp8, tmp9);
-		const uint32_t result = uint32_and(tmpA, 0x55555555);
-
-		return result;
-	}
-
-	// shuffle:
-	// ---- ---- ---- ---- ---- --98 7654 3210
-	// to:
-	// ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
-	inline uint32_t uint32_part1by2(uint32_t _a)
-	{
-		const uint32_t val    = uint32_and(_a, 0x3ff);
-
-		const uint32_t tmp0   = uint32_sll(val, 16);
-		const uint32_t tmp1   = uint32_xor(val, tmp0);
-		const uint32_t tmp2   = uint32_and(tmp1, 0xff0000ff);
-
-		const uint32_t tmp3   = uint32_sll(tmp2, 8);
-		const uint32_t tmp4   = uint32_xor(tmp2, tmp3);
-		const uint32_t tmp5   = uint32_and(tmp4, 0x0300f00f);
-
-		const uint32_t tmp6   = uint32_sll(tmp5, 4);
-		const uint32_t tmp7   = uint32_xor(tmp5, tmp6);
-		const uint32_t tmp8   = uint32_and(tmp7, 0x030c30c3);
-
-		const uint32_t tmp9   = uint32_sll(tmp8, 2);
-		const uint32_t tmpA   = uint32_xor(tmp8, tmp9);
-		const uint32_t result = uint32_and(tmpA, 0x09249249);
-
-		return result;
-	}
-
-	inline uint32_t uint32_testpow2(uint32_t _a)
-	{
-		const uint32_t tmp0   = uint32_not(_a);
-		const uint32_t tmp1   = uint32_inc(tmp0);
-		const uint32_t tmp2   = uint32_and(_a, tmp1);
-		const uint32_t tmp3   = uint32_cmpeq(tmp2, _a);
-		const uint32_t tmp4   = uint32_cmpneq(_a, 0);
-		const uint32_t result = uint32_and(tmp3, tmp4);
-
-		return result;
-	}
-
-	inline uint32_t uint32_nextpow2(uint32_t _a)
-	{
-		const uint32_t tmp0   = uint32_dec(_a);
-		const uint32_t tmp1   = uint32_srl(tmp0, 1);
-		const uint32_t tmp2   = uint32_or(tmp0, tmp1);
-		const uint32_t tmp3   = uint32_srl(tmp2, 2);
-		const uint32_t tmp4   = uint32_or(tmp2, tmp3);
-		const uint32_t tmp5   = uint32_srl(tmp4, 4);
-		const uint32_t tmp6   = uint32_or(tmp4, tmp5);
-		const uint32_t tmp7   = uint32_srl(tmp6, 8);
-		const uint32_t tmp8   = uint32_or(tmp6, tmp7);
-		const uint32_t tmp9   = uint32_srl(tmp8, 16);
-		const uint32_t tmpA   = uint32_or(tmp8, tmp9);
-		const uint32_t result = uint32_inc(tmpA);
-
-		return result;
-	}
-
+/*
+ * Copyright 2010-2012 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+// Copyright 2006 Mike Acton <macton@gmail.com>
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE
+
+#ifndef __BX_UINT32_T_H__
+#define __BX_UINT32_T_H__
+
+#include "bx.h"
+
+#if BX_COMPILER_MSVC
+#	if BX_PLATFORM_WINDOWS
+#		include <math.h> // math.h is included because VS bitches:
+						 // warning C4985: 'ceil': attributes not present on previous declaration.
+						 // must be included before intrin.h.
+#		include <intrin.h>
+#		pragma intrinsic(_BitScanForward)
+#		pragma intrinsic(_BitScanReverse)
+#	endif // BX_PLATFORM_WINDOWS
+#endif // BX_COMPILER_MSVC
+
+namespace bx
+{
+	inline uint32_t uint32_li(uint32_t _a)
+	{
+		return _a;
+	}
+
+	inline uint32_t uint32_dec(uint32_t _a)
+	{
+		return _a - 1;
+	}
+
+	inline uint32_t uint32_inc(uint32_t _a)
+	{
+		return _a + 1;
+	}
+
+	inline uint32_t uint32_not(uint32_t _a)
+	{
+		return ~_a;
+	}
+
+	inline uint32_t uint32_neg(uint32_t _a)
+	{
+		return -(int32_t)_a;
+	}
+
+	inline uint32_t uint32_ext(uint32_t _a)
+	{
+		return ( (int32_t)_a)>>31;
+	}
+
+	inline uint32_t uint32_and(uint32_t _a, uint32_t _b)
+	{
+		return _a & _b;
+	}
+
+	inline uint32_t uint32_xor(uint32_t _a, uint32_t _b)
+	{
+		return _a ^ _b;
+	}
+
+	inline uint32_t uint32_xorl(uint32_t _a, uint32_t _b)
+	{
+		return !_a != !_b;
+	}
+
+	inline uint32_t uint32_andc(uint32_t _a, uint32_t _b)
+	{
+		return _a & ~_b;
+	}
+
+	inline uint32_t uint32_or(uint32_t _a, uint32_t _b)
+	{
+		return _a | _b;
+	}
+
+	inline uint32_t uint32_sll(uint32_t _a, int _sa)
+	{
+		return _a << _sa;
+	}
+
+	inline uint32_t uint32_srl(uint32_t _a, int _sa)
+	{
+		return _a >> _sa;
+	}
+
+	inline uint32_t uint32_sra(uint32_t _a, int _sa)
+	{
+		return ( (int32_t)_a) >> _sa;
+	}
+
+	inline uint32_t uint32_rol(uint32_t _a, int _sa)
+	{
+		return ( _a << _sa) | (_a >> (32-_sa) );
+	}
+
+	inline uint32_t uint32_ror(uint32_t _a, int _sa)
+	{
+		return ( _a >> _sa) | (_a << (32-_sa) );
+	}
+
+	inline uint32_t uint32_add(uint32_t _a, uint32_t _b)
+	{
+		return _a + _b;
+	}
+
+	inline uint32_t uint32_sub(uint32_t _a, uint32_t _b)
+	{
+		return _a - _b;
+	}
+
+	inline uint32_t uint32_mul(uint32_t _a, uint32_t _b)
+	{
+		return _a * _b;
+	}
+
+	inline uint32_t uint32_div(uint32_t _a, uint32_t _b)
+	{
+		return (_a / _b);
+	}
+
+	inline uint32_t uint32_mod(uint32_t _a, uint32_t _b)
+	{
+		return (_a % _b);
+	}
+
+	inline uint32_t uint32_cmpeq(uint32_t _a, uint32_t _b)
+	{
+		return -(_a == _b);
+	}
+
+	inline uint32_t uint32_cmpneq(uint32_t _a, uint32_t _b)
+	{
+		return -(_a != _b);
+	}
+
+	inline uint32_t uint32_cmplt(uint32_t _a, uint32_t _b)
+	{
+		return -(_a < _b);
+	}
+
+	inline uint32_t uint32_cmple(uint32_t _a, uint32_t _b)
+	{
+		return -(_a <= _b);
+	}
+
+	inline uint32_t uint32_cmpgt(uint32_t _a, uint32_t _b)
+	{
+		return -(_a > _b);
+	}
+
+	inline uint32_t uint32_cmpge(uint32_t _a, uint32_t _b)
+	{
+		return -(_a >= _b);
+	}
+
+	inline uint32_t uint32_setnz(uint32_t _a)
+	{
+		return -!!_a;
+	}
+
+	inline uint32_t uint32_satadd(uint32_t _a, uint32_t _b)
+	{
+		const uint32_t add    = uint32_add(_a, _b);
+		const uint32_t lt     = uint32_cmplt(add, _a);
+		const uint32_t result = uint32_or(add, lt);
+
+		return result;
+	}
+
+	inline uint32_t uint32_satsub(uint32_t _a, uint32_t _b)
+	{
+		const uint32_t sub    = uint32_sub(_a, _b);
+		const uint32_t le     = uint32_cmple(sub, _a);
+		const uint32_t result = uint32_and(sub, le);
+
+		return result;
+	}
+
+	inline uint32_t uint32_satmul(uint32_t _a, uint32_t _b)
+	{
+		const uint64_t mul    = (uint64_t)_a * (uint64_t)_b;
+		const uint32_t hi     = mul >> 32;
+		const uint32_t nz     = uint32_setnz(hi);
+		const uint32_t result = uint32_or(uint32_t(mul), nz);
+
+		return result;
+	}
+
+	inline uint32_t uint32_sels(uint32_t test, uint32_t _a, uint32_t _b)
+	{
+		const uint32_t mask   = uint32_ext(test);
+		const uint32_t sel_a  = uint32_and(_a, mask);
+		const uint32_t sel_b  = uint32_andc(_b, mask);
+		const uint32_t result = uint32_or(sel_a, sel_b);
+
+		return (result);
+	}
+
+	inline uint32_t uint32_selb(uint32_t _mask, uint32_t _a, uint32_t _b)
+	{
+		const uint32_t sel_a  = uint32_and(_a, _mask);
+		const uint32_t sel_b  = uint32_andc(_b, _mask);
+		const uint32_t result = uint32_or(sel_a, sel_b);
+
+		return (result);
+	}
+
+	inline uint32_t uint32_imin(uint32_t _a, uint32_t _b)
+	{
+		const uint32_t a_sub_b = uint32_sub(_a, _b);
+		const uint32_t result  = uint32_sels(a_sub_b, _a, _b);
+
+		return result;
+	}
+
+	inline uint32_t uint32_imax(uint32_t _a, uint32_t _b)
+	{
+		const uint32_t b_sub_a = uint32_sub(_b, _a);
+		const uint32_t result  = uint32_sels(b_sub_a, _a, _b);
+
+		return result;
+	}
+
+	inline uint32_t uint32_min(uint32_t _a, uint32_t _b)
+	{
+		return _a > _b ? _b : _a;
+	}
+
+	inline uint32_t uint32_max(uint32_t _a, uint32_t _b)
+	{
+		return _a > _b ? _a : _b;
+	}
+
+	inline uint32_t uint32_incwrap(uint32_t _val, uint32_t _min, uint32_t _max)
+	{
+		// With d = _max - _val, uint32_ext of (d | -d) is the mask that picks
+		// between _val + 1 and _min (presumably _min once _val == _max).
+		const uint32_t delta      = uint32_sub(_max, _val);
+		const uint32_t neg_delta  = uint32_neg(delta);
+		const uint32_t either     = uint32_or(delta, neg_delta);
+		const uint32_t not_at_max = uint32_ext(either);
+		const uint32_t next       = uint32_inc(_val);
+		return uint32_selb(not_at_max, next, _min);
+	}
+
+	inline uint32_t uint32_decwrap(uint32_t _val, uint32_t _min, uint32_t _max)
+	{
+		// With d = _min - _val, uint32_ext of (d | -d) is the mask that picks
+		// between _val - 1 and _max (presumably _max once _val == _min).
+		const uint32_t delta      = uint32_sub(_min, _val);
+		const uint32_t neg_delta  = uint32_neg(delta);
+		const uint32_t either     = uint32_or(delta, neg_delta);
+		const uint32_t not_at_min = uint32_ext(either);
+		const uint32_t prev       = uint32_dec(_val);
+		return uint32_selb(not_at_min, prev, _max);
+	}
+
+	inline uint32_t uint32_cntbits_ref(uint32_t _val)
+	{
+		// Portable bit count built only from the shift/mask/add helpers; partial
+		// sums are folded together and the low 6 bits of the total are returned.
+		const uint32_t half    = uint32_srl(_val, 1);
+		const uint32_t odd     = uint32_and(half, 0x55555555);
+		const uint32_t pairs   = uint32_sub(_val, odd);
+		const uint32_t pairs2  = uint32_srl(pairs, 2);
+		const uint32_t pairs4  = uint32_srl(pairs, 4);
+		const uint32_t lane0   = uint32_and(pairs, 0xc30c30c3);
+		const uint32_t lane1   = uint32_and(pairs2, 0xc30c30c3);
+		const uint32_t lane2   = uint32_and(pairs4, 0xc30c30c3);
+		const uint32_t lane01  = uint32_add(lane0, lane1);
+		const uint32_t sixes   = uint32_add(lane2, lane01);
+		const uint32_t sixes6  = uint32_srl(sixes, 6);
+		const uint32_t twelves = uint32_add(sixes, sixes6);
+		const uint32_t fold12  = uint32_srl(twelves, 12);
+		const uint32_t fold24  = uint32_srl(twelves, 24);
+		const uint32_t partial = uint32_add(twelves, fold12);
+		const uint32_t total   = uint32_add(fold24, partial);
+		return uint32_and(total, 0x3f);
+	}
+
+	/// Number of set bits in _val; uses a compiler intrinsic when one is selected below.
+	inline uint32_t uint32_cntbits(uint32_t _val)
+	{
+#if BX_COMPILER_GCC
+		return uint32_t(__builtin_popcount(_val));
+#elif BX_COMPILER_MSVC && BX_PLATFORM_WINDOWS
+		return uint32_t(__popcnt(_val));
+#else
+		return uint32_cntbits_ref(_val);
+#endif // BX_COMPILER_GCC / BX_COMPILER_MSVC && BX_PLATFORM_WINDOWS
+	}
+
+	inline uint32_t uint32_cntlz_ref(uint32_t _val)
+	{
+		// OR the value with copies of itself shifted right by 1, 2, 4, 8 and 16,
+		// then count the bits of the complement with uint32_cntbits.
+		const uint32_t shr1   = uint32_srl(_val, 1);
+		const uint32_t fill1  = uint32_or(shr1, _val);
+		const uint32_t shr2   = uint32_srl(fill1, 2);
+		const uint32_t fill2  = uint32_or(shr2, fill1);
+		const uint32_t shr4   = uint32_srl(fill2, 4);
+		const uint32_t fill4  = uint32_or(shr4, fill2);
+		const uint32_t shr8   = uint32_srl(fill4, 8);
+		const uint32_t fill8  = uint32_or(shr8, fill4);
+		const uint32_t shr16  = uint32_srl(fill8, 16);
+		const uint32_t fill16 = uint32_or(shr16, fill8);
+		const uint32_t clear  = uint32_not(fill16);
+		return uint32_cntbits(clear);
+	}
+
+	/// Count number of leading zeros. Zero input yields 32 on the intrinsic paths; the fallback delegates to uint32_cntlz_ref.
+	inline uint32_t uint32_cntlz(uint32_t _val)
+	{
+#if BX_COMPILER_GCC
+		return 0 == _val ? 32u : uint32_t(__builtin_clz(_val)); // __builtin_clz(0) is undefined
+#elif BX_COMPILER_MSVC && BX_PLATFORM_WINDOWS
+		// _BitScanReverse returns 0 and leaves index unspecified when no bit is set.
+		unsigned long index = 0;
+		return _BitScanReverse(&index, _val) ? uint32_t(31 - index) : 32u;
+#else
+		return uint32_cntlz_ref(_val);
+#endif // BX_COMPILER_GCC / BX_COMPILER_MSVC && BX_PLATFORM_WINDOWS
+	}
+
+	inline uint32_t uint32_cnttz_ref(uint32_t _val)
+	{
+		// Counts the bits of (~_val & (_val - 1)) with uint32_cntbits; that
+		// expression keeps the positions below the lowest set bit of _val.
+		const uint32_t below    = uint32_dec(_val);
+		const uint32_t inverted = uint32_not(_val);
+		const uint32_t trailing = uint32_and(inverted, below);
+		return uint32_cntbits(trailing);
+	}
+
+	inline uint32_t uint32_cnttz(uint32_t _val)
+	{
+#if BX_COMPILER_MSVC && BX_PLATFORM_WINDOWS
+		// Trailing zero count; _BitScanForward returns 0 and leaves index unspecified for zero input, so report 32.
+		unsigned long index = 0;
+		return _BitScanForward(&index, _val) ? uint32_t(index) : 32u;
+#else
+		return uint32_cnttz_ref(_val); // portable path, used whenever the MSVC intrinsic is not selected
+#endif // BX_COMPILER_MSVC && BX_PLATFORM_WINDOWS
+	}
+
+	// Spreads the low 16 bits of _a so one empty bit separates each source bit:
+	// ---- ---- ---- ---- fedc ba98 7654 3210
+	// becomes:
+	// -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
+	inline uint32_t uint32_part1by1(uint32_t _a)
+	{
+		// Each round XORs in a left-shifted copy (by 8, 4, 2, 1) and masks the result.
+		const uint32_t low16   = uint32_and(_a, 0xffff);
+
+		const uint32_t shift8  = uint32_sll(low16, 8);
+		const uint32_t mix8    = uint32_xor(low16, shift8);
+		const uint32_t bytes   = uint32_and(mix8, 0x00ff00ff);
+
+		const uint32_t shift4  = uint32_sll(bytes, 4);
+		const uint32_t mix4    = uint32_xor(bytes, shift4);
+		const uint32_t nibbles = uint32_and(mix4, 0x0f0f0f0f);
+
+		const uint32_t shift2  = uint32_sll(nibbles, 2);
+		const uint32_t mix2    = uint32_xor(nibbles, shift2);
+		const uint32_t pairs   = uint32_and(mix2, 0x33333333);
+
+		const uint32_t shift1  = uint32_sll(pairs, 1);
+		const uint32_t mix1    = uint32_xor(pairs, shift1);
+
+		return uint32_and(mix1, 0x55555555);
+	}
+
+	// Spreads the low 10 bits of _a so two empty bits separate each source bit:
+	// ---- ---- ---- ---- ---- --98 7654 3210
+	// becomes:
+	// ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
+	inline uint32_t uint32_part1by2(uint32_t _a)
+	{
+		// Each round XORs in a left-shifted copy (by 16, 8, 4, 2) and masks the result.
+		const uint32_t low10   = uint32_and(_a, 0x3ff);
+
+		const uint32_t shift16 = uint32_sll(low10, 16);
+		const uint32_t mix16   = uint32_xor(low10, shift16);
+		const uint32_t step16  = uint32_and(mix16, 0xff0000ff);
+
+		const uint32_t shift8  = uint32_sll(step16, 8);
+		const uint32_t mix8    = uint32_xor(step16, shift8);
+		const uint32_t step8   = uint32_and(mix8, 0x0300f00f);
+
+		const uint32_t shift4  = uint32_sll(step8, 4);
+		const uint32_t mix4    = uint32_xor(step8, shift4);
+		const uint32_t step4   = uint32_and(mix4, 0x030c30c3);
+
+		const uint32_t shift2  = uint32_sll(step4, 2);
+		const uint32_t mix2    = uint32_xor(step4, shift2);
+
+		return uint32_and(mix2, 0x09249249);
+	}
+
+	inline uint32_t uint32_testpow2(uint32_t _a)
+	{
+		// _a AND-ed with (~_a + 1) must compare equal to _a, and _a must compare
+		// unequal to 0; the two compare results are AND-ed together.
+		const uint32_t inverted = uint32_not(_a);
+		const uint32_t negated  = uint32_inc(inverted);
+		const uint32_t lowest   = uint32_and(_a, negated);
+		const uint32_t single   = uint32_cmpeq(lowest, _a);
+		const uint32_t nonzero  = uint32_cmpneq(_a, 0);
+		return uint32_and(single, nonzero);
+	}
+
+	inline uint32_t uint32_nextpow2(uint32_t _a)
+	{
+		// Decrement, OR the value with copies of itself shifted right by 1, 2, 4,
+		// 8 and 16, then increment the result.
+		const uint32_t start  = uint32_dec(_a);
+		const uint32_t shr1   = uint32_srl(start, 1);
+		const uint32_t fill1  = uint32_or(start, shr1);
+		const uint32_t shr2   = uint32_srl(fill1, 2);
+		const uint32_t fill2  = uint32_or(fill1, shr2);
+		const uint32_t shr4   = uint32_srl(fill2, 4);
+		const uint32_t fill4  = uint32_or(fill2, shr4);
+		const uint32_t shr8   = uint32_srl(fill4, 8);
+		const uint32_t fill8  = uint32_or(fill4, shr8);
+		const uint32_t shr16  = uint32_srl(fill8, 16);
+		const uint32_t fill16 = uint32_or(fill8, shr16);
+		return uint32_inc(fill16);
+	}
+
 	inline uint16_t halfFromFloat(float _a)
 	{
 		union { uint32_t ui; float flt;	} ftou;
@@ -564,8 +564,8 @@ namespace bx
 		union { uint32_t ui; float flt;	} utof;
 		utof.ui = f_result;
 		return utof.flt;
-	} 
-
-} // namespace bx
-
-#endif // __BX_UINT32_T_H__
+	} 
+
+} // namespace bx
+
+#endif // __BX_UINT32_T_H__
diff --git a/include/compat/mingw/alloca.h b/include/compat/mingw/alloca.h
index 2da04de..196379c 100644
--- a/include/compat/mingw/alloca.h
+++ b/include/compat/mingw/alloca.h
@@ -1,6 +1,6 @@
-#ifndef __MINGW32__ALLOCA_H__
-#define __MINGW32__ALLOCA_H__
-
-#include <malloc.h>
-
-#endif // __MINGW32__ALLOCA_H__
+#ifndef __MINGW32__ALLOCA_H__
+#define __MINGW32__ALLOCA_H__
+
+#include <malloc.h>
+
+#endif // __MINGW32__ALLOCA_H__
diff --git a/include/compat/mingw/sal.h b/include/compat/mingw/sal.h
index 26e4aec..a2165e8 100644
--- a/include/compat/mingw/sal.h
+++ b/include/compat/mingw/sal.h
@@ -1,253 +1,253 @@
-#pragma once
-
-#if __GNUC__ >=3
-#pragma GCC system_header
-#endif
-
-//#define __null // << Conflicts with GCC internal type __null
-#define __notnull
-#define __maybenull
-#define __readonly
-#define __notreadonly
-#define __maybereadonly
-#define __valid
-#define __notvalid
-#define __maybevalid
-#define __readableTo(extent)
-#define __elem_readableTo(size)
-#define __byte_readableTo(size)
-#define __writableTo(size)
-#define __elem_writableTo(size)
-#define __byte_writableTo(size)
-#define __deref
-#define __pre
-#define __post
-#define __precond(expr)
-#define __postcond(expr)
-#define __exceptthat
-#define __execeptthat
-#define __inner_success(expr)
-#define __inner_checkReturn
-#define __inner_typefix(ctype)
-#define __inner_override
-#define __inner_callback
-#define __inner_blocksOn(resource)
-#define __inner_fallthrough_dec
-#define __inner_fallthrough
-#define __refparam
-#define __inner_control_entrypoint(category)
-#define __inner_data_entrypoint(category)
-
-#define __ecount(size)
-#define __bcount(size)
-#define __in
-#define __in_ecount(size)
-#define __in_bcount(size)
-#define __in_z
-#define __in_ecount_z(size)
-#define __in_bcount_z(size)
-#define __in_nz
-#define __in_ecount_nz(size)
-#define __in_bcount_nz(size)
-#define __in_xcount_opt(size)
-#define __out
-#define __out_ecount(size)
-#define __out_bcount(size)
-#define __out_ecount_part(size,length)
-#define __out_bcount_part(size,length)
-#define __out_ecount_full(size)
-#define __out_bcount_full(size)
-#define __out_z
-#define __out_z_opt
-#define __out_ecount_z(size)
-#define __out_bcount_z(size)
-#define __out_ecount_part_z(size,length)
-#define __out_bcount_part_z(size,length)
-#define __out_ecount_full_z(size)
-#define __out_bcount_full_z(size)
-#define __out_nz
-#define __out_nz_opt
-#define __out_ecount_nz(size)
-#define __out_bcount_nz(size)
-#define __inout
-#define __inout_ecount(size)
-#define __inout_bcount(size)
-#define __inout_ecount_part(size,length)
-#define __inout_bcount_part(size,length)
-#define __inout_ecount_full(size)
-#define __inout_bcount_full(size)
-#define __inout_z
-#define __inout_ecount_z(size)
-#define __inout_bcount_z(size)
-#define __inout_nz
-#define __inout_ecount_nz(size)
-#define __inout_bcount_nz(size)
-#define __ecount_opt(size)
-#define __bcount_opt(size)
-#define __in_opt
-#define __in_ecount_opt(size)
-#define __in_bcount_opt(size)
-#define __in_z_opt
-#define __in_ecount_z_opt(size)
-#define __in_bcount_z_opt(size)
-#define __in_nz_opt
-#define __in_ecount_nz_opt(size)
-#define __in_bcount_nz_opt(size)
-#define __out_opt
-#define __out_ecount_opt(size)
-#define __out_bcount_opt(size)
-#define __out_ecount_part_opt(size,length)
-#define __out_bcount_part_opt(size,length)
-#define __out_ecount_full_opt(size)
-#define __out_bcount_full_opt(size)
-#define __out_ecount_z_opt(size)
-#define __out_bcount_z_opt(size)
-#define __out_ecount_part_z_opt(size,length)
-#define __out_bcount_part_z_opt(size,length)
-#define __out_ecount_full_z_opt(size)
-#define __out_bcount_full_z_opt(size)
-#define __out_ecount_nz_opt(size)
-#define __out_bcount_nz_opt(size)
-#define __inout_opt
-#define __inout_ecount_opt(size)
-#define __inout_bcount_opt(size)
-#define __inout_ecount_part_opt(size,length)
-#define __inout_bcount_part_opt(size,length)
-#define __inout_ecount_full_opt(size)
-#define __inout_bcount_full_opt(size)
-#define __inout_z_opt
-#define __inout_ecount_z_opt(size)
-#define __inout_ecount_z_opt(size)
-#define __inout_bcount_z_opt(size)
-#define __inout_nz_opt
-#define __inout_ecount_nz_opt(size)
-#define __inout_bcount_nz_opt(size)
-#define __deref_ecount(size)
-#define __deref_bcount(size)
-#define __deref_out
-#define __deref_out_ecount(size)
-#define __deref_out_bcount(size)
-#define __deref_out_ecount_part(size,length)
-#define __deref_out_bcount_part(size,length)
-#define __deref_out_ecount_full(size)
-#define __deref_out_bcount_full(size)
-#define __deref_out_z
-#define __deref_out_ecount_z(size)
-#define __deref_out_bcount_z(size)
-#define __deref_out_nz
-#define __deref_out_ecount_nz(size)
-#define __deref_out_bcount_nz(size)
-#define __deref_inout
-#define __deref_inout_z
-#define __deref_inout_ecount(size)
-#define __deref_inout_bcount(size)
-#define __deref_inout_ecount_part(size,length)
-#define __deref_inout_bcount_part(size,length)
-#define __deref_inout_ecount_full(size)
-#define __deref_inout_bcount_full(size)
-#define __deref_inout_z
-#define __deref_inout_ecount_z(size)
-#define __deref_inout_bcount_z(size)
-#define __deref_inout_nz
-#define __deref_inout_ecount_nz(size)
-#define __deref_inout_bcount_nz(size)
-#define __deref_ecount_opt(size)
-#define __deref_bcount_opt(size)
-#define __deref_out_opt
-#define __deref_out_ecount_opt(size)
-#define __deref_out_bcount_opt(size)
-#define __deref_out_ecount_part_opt(size,length)
-#define __deref_out_bcount_part_opt(size,length)
-#define __deref_out_ecount_full_opt(size)
-#define __deref_out_bcount_full_opt(size)
-#define __deref_out_z_opt
-#define __deref_out_ecount_z_opt(size)
-#define __deref_out_bcount_z_opt(size)
-#define __deref_out_nz_opt
-#define __deref_out_ecount_nz_opt(size)
-#define __deref_out_bcount_nz_opt(size)
-#define __deref_inout_opt
-#define __deref_inout_ecount_opt(size)
-#define __deref_inout_bcount_opt(size)
-#define __deref_inout_ecount_part_opt(size,length)
-#define __deref_inout_bcount_part_opt(size,length)
-#define __deref_inout_ecount_full_opt(size)
-#define __deref_inout_bcount_full_opt(size)
-#define __deref_inout_z_opt
-#define __deref_inout_ecount_z_opt(size)
-#define __deref_inout_bcount_z_opt(size)
-#define __deref_inout_nz_opt
-#define __deref_inout_ecount_nz_opt(size)
-#define __deref_inout_bcount_nz_opt(size)
-#define __deref_opt_ecount(size)
-#define __deref_opt_bcount(size)
-#define __deref_opt_out
-#define __deref_opt_out_z
-#define __deref_opt_out_ecount(size)
-#define __deref_opt_out_bcount(size)
-#define __deref_opt_out_ecount_part(size,length)
-#define __deref_opt_out_bcount_part(size,length)
-#define __deref_opt_out_ecount_full(size)
-#define __deref_opt_out_bcount_full(size)
-#define __deref_opt_inout
-#define __deref_opt_inout_ecount(size)
-#define __deref_opt_inout_bcount(size)
-#define __deref_opt_inout_ecount_part(size,length)
-#define __deref_opt_inout_bcount_part(size,length)
-#define __deref_opt_inout_ecount_full(size)
-#define __deref_opt_inout_bcount_full(size)
-#define __deref_opt_inout_z
-#define __deref_opt_inout_ecount_z(size)
-#define __deref_opt_inout_bcount_z(size)
-#define __deref_opt_inout_nz
-#define __deref_opt_inout_ecount_nz(size)
-#define __deref_opt_inout_bcount_nz(size)
-#define __deref_opt_ecount_opt(size)
-#define __deref_opt_bcount_opt(size)
-#define __deref_opt_out_opt
-#define __deref_opt_out_ecount_opt(size)
-#define __deref_opt_out_bcount_opt(size)
-#define __deref_opt_out_ecount_part_opt(size,length)
-#define __deref_opt_out_bcount_part_opt(size,length)
-#define __deref_opt_out_ecount_full_opt(size)
-#define __deref_opt_out_bcount_full_opt(size)
-#define __deref_opt_out_z_opt
-#define __deref_opt_out_ecount_z_opt(size)
-#define __deref_opt_out_bcount_z_opt(size)
-#define __deref_opt_out_nz_opt
-#define __deref_opt_out_ecount_nz_opt(size)
-#define __deref_opt_out_bcount_nz_opt(size)
-#define __deref_opt_inout_opt
-#define __deref_opt_inout_ecount_opt(size)
-#define __deref_opt_inout_bcount_opt(size)
-#define __deref_opt_inout_ecount_part_opt(size,length)
-#define __deref_opt_inout_bcount_part_opt(size,length)
-#define __deref_opt_inout_ecount_full_opt(size)
-#define __deref_opt_inout_bcount_full_opt(size)
-#define __deref_opt_inout_z_opt
-#define __deref_opt_inout_ecount_z_opt(size)
-#define __deref_opt_inout_bcount_z_opt(size)
-#define __deref_opt_inout_nz_opt
-#define __deref_opt_inout_ecount_nz_opt(size)
-#define __deref_opt_inout_bcount_nz_opt(size)
-
-#define __success(expr)
-#define __nullterminated
-#define __nullnullterminated
-#define __reserved
-#define __checkReturn
-#define __typefix(ctype)
-#define __override
-#define __callback
-#define __format_string
-#define __blocksOn(resource)
-#define __control_entrypoint(category)
-#define __data_entrypoint(category)
-
-#ifndef __fallthrough
-    #define __fallthrough __inner_fallthrough
-#endif
-
-#ifndef __analysis_assume
-    #define __analysis_assume(expr)
-#endif
+#pragma once
+
+#if __GNUC__ >=3
+#pragma GCC system_header
+#endif
+
+//#define __null // << Conflicts with GCC internal type __null
+#define __notnull
+#define __maybenull
+#define __readonly
+#define __notreadonly
+#define __maybereadonly
+#define __valid
+#define __notvalid
+#define __maybevalid
+#define __readableTo(extent)
+#define __elem_readableTo(size)
+#define __byte_readableTo(size)
+#define __writableTo(size)
+#define __elem_writableTo(size)
+#define __byte_writableTo(size)
+#define __deref
+#define __pre
+#define __post
+#define __precond(expr)
+#define __postcond(expr)
+#define __exceptthat
+#define __execeptthat
+#define __inner_success(expr)
+#define __inner_checkReturn
+#define __inner_typefix(ctype)
+#define __inner_override
+#define __inner_callback
+#define __inner_blocksOn(resource)
+#define __inner_fallthrough_dec
+#define __inner_fallthrough
+#define __refparam
+#define __inner_control_entrypoint(category)
+#define __inner_data_entrypoint(category)
+
+#define __ecount(size)
+#define __bcount(size)
+#define __in
+#define __in_ecount(size)
+#define __in_bcount(size)
+#define __in_z
+#define __in_ecount_z(size)
+#define __in_bcount_z(size)
+#define __in_nz
+#define __in_ecount_nz(size)
+#define __in_bcount_nz(size)
+#define __in_xcount_opt(size)
+#define __out
+#define __out_ecount(size)
+#define __out_bcount(size)
+#define __out_ecount_part(size,length)
+#define __out_bcount_part(size,length)
+#define __out_ecount_full(size)
+#define __out_bcount_full(size)
+#define __out_z
+#define __out_z_opt
+#define __out_ecount_z(size)
+#define __out_bcount_z(size)
+#define __out_ecount_part_z(size,length)
+#define __out_bcount_part_z(size,length)
+#define __out_ecount_full_z(size)
+#define __out_bcount_full_z(size)
+#define __out_nz
+#define __out_nz_opt
+#define __out_ecount_nz(size)
+#define __out_bcount_nz(size)
+#define __inout
+#define __inout_ecount(size)
+#define __inout_bcount(size)
+#define __inout_ecount_part(size,length)
+#define __inout_bcount_part(size,length)
+#define __inout_ecount_full(size)
+#define __inout_bcount_full(size)
+#define __inout_z
+#define __inout_ecount_z(size)
+#define __inout_bcount_z(size)
+#define __inout_nz
+#define __inout_ecount_nz(size)
+#define __inout_bcount_nz(size)
+#define __ecount_opt(size)
+#define __bcount_opt(size)
+#define __in_opt
+#define __in_ecount_opt(size)
+#define __in_bcount_opt(size)
+#define __in_z_opt
+#define __in_ecount_z_opt(size)
+#define __in_bcount_z_opt(size)
+#define __in_nz_opt
+#define __in_ecount_nz_opt(size)
+#define __in_bcount_nz_opt(size)
+#define __out_opt
+#define __out_ecount_opt(size)
+#define __out_bcount_opt(size)
+#define __out_ecount_part_opt(size,length)
+#define __out_bcount_part_opt(size,length)
+#define __out_ecount_full_opt(size)
+#define __out_bcount_full_opt(size)
+#define __out_ecount_z_opt(size)
+#define __out_bcount_z_opt(size)
+#define __out_ecount_part_z_opt(size,length)
+#define __out_bcount_part_z_opt(size,length)
+#define __out_ecount_full_z_opt(size)
+#define __out_bcount_full_z_opt(size)
+#define __out_ecount_nz_opt(size)
+#define __out_bcount_nz_opt(size)
+#define __inout_opt
+#define __inout_ecount_opt(size)
+#define __inout_bcount_opt(size)
+#define __inout_ecount_part_opt(size,length)
+#define __inout_bcount_part_opt(size,length)
+#define __inout_ecount_full_opt(size)
+#define __inout_bcount_full_opt(size)
+#define __inout_z_opt
+#define __inout_ecount_z_opt(size)
+#define __inout_ecount_z_opt(size)
+#define __inout_bcount_z_opt(size)
+#define __inout_nz_opt
+#define __inout_ecount_nz_opt(size)
+#define __inout_bcount_nz_opt(size)
+#define __deref_ecount(size)
+#define __deref_bcount(size)
+#define __deref_out
+#define __deref_out_ecount(size)
+#define __deref_out_bcount(size)
+#define __deref_out_ecount_part(size,length)
+#define __deref_out_bcount_part(size,length)
+#define __deref_out_ecount_full(size)
+#define __deref_out_bcount_full(size)
+#define __deref_out_z
+#define __deref_out_ecount_z(size)
+#define __deref_out_bcount_z(size)
+#define __deref_out_nz
+#define __deref_out_ecount_nz(size)
+#define __deref_out_bcount_nz(size)
+#define __deref_inout
+#define __deref_inout_z
+#define __deref_inout_ecount(size)
+#define __deref_inout_bcount(size)
+#define __deref_inout_ecount_part(size,length)
+#define __deref_inout_bcount_part(size,length)
+#define __deref_inout_ecount_full(size)
+#define __deref_inout_bcount_full(size)
+#define __deref_inout_z
+#define __deref_inout_ecount_z(size)
+#define __deref_inout_bcount_z(size)
+#define __deref_inout_nz
+#define __deref_inout_ecount_nz(size)
+#define __deref_inout_bcount_nz(size)
+#define __deref_ecount_opt(size)
+#define __deref_bcount_opt(size)
+#define __deref_out_opt
+#define __deref_out_ecount_opt(size)
+#define __deref_out_bcount_opt(size)
+#define __deref_out_ecount_part_opt(size,length)
+#define __deref_out_bcount_part_opt(size,length)
+#define __deref_out_ecount_full_opt(size)
+#define __deref_out_bcount_full_opt(size)
+#define __deref_out_z_opt
+#define __deref_out_ecount_z_opt(size)
+#define __deref_out_bcount_z_opt(size)
+#define __deref_out_nz_opt
+#define __deref_out_ecount_nz_opt(size)
+#define __deref_out_bcount_nz_opt(size)
+#define __deref_inout_opt
+#define __deref_inout_ecount_opt(size)
+#define __deref_inout_bcount_opt(size)
+#define __deref_inout_ecount_part_opt(size,length)
+#define __deref_inout_bcount_part_opt(size,length)
+#define __deref_inout_ecount_full_opt(size)
+#define __deref_inout_bcount_full_opt(size)
+#define __deref_inout_z_opt
+#define __deref_inout_ecount_z_opt(size)
+#define __deref_inout_bcount_z_opt(size)
+#define __deref_inout_nz_opt
+#define __deref_inout_ecount_nz_opt(size)
+#define __deref_inout_bcount_nz_opt(size)
+#define __deref_opt_ecount(size)
+#define __deref_opt_bcount(size)
+#define __deref_opt_out
+#define __deref_opt_out_z
+#define __deref_opt_out_ecount(size)
+#define __deref_opt_out_bcount(size)
+#define __deref_opt_out_ecount_part(size,length)
+#define __deref_opt_out_bcount_part(size,length)
+#define __deref_opt_out_ecount_full(size)
+#define __deref_opt_out_bcount_full(size)
+#define __deref_opt_inout
+#define __deref_opt_inout_ecount(size)
+#define __deref_opt_inout_bcount(size)
+#define __deref_opt_inout_ecount_part(size,length)
+#define __deref_opt_inout_bcount_part(size,length)
+#define __deref_opt_inout_ecount_full(size)
+#define __deref_opt_inout_bcount_full(size)
+#define __deref_opt_inout_z
+#define __deref_opt_inout_ecount_z(size)
+#define __deref_opt_inout_bcount_z(size)
+#define __deref_opt_inout_nz
+#define __deref_opt_inout_ecount_nz(size)
+#define __deref_opt_inout_bcount_nz(size)
+#define __deref_opt_ecount_opt(size)
+#define __deref_opt_bcount_opt(size)
+#define __deref_opt_out_opt
+#define __deref_opt_out_ecount_opt(size)
+#define __deref_opt_out_bcount_opt(size)
+#define __deref_opt_out_ecount_part_opt(size,length)
+#define __deref_opt_out_bcount_part_opt(size,length)
+#define __deref_opt_out_ecount_full_opt(size)
+#define __deref_opt_out_bcount_full_opt(size)
+#define __deref_opt_out_z_opt
+#define __deref_opt_out_ecount_z_opt(size)
+#define __deref_opt_out_bcount_z_opt(size)
+#define __deref_opt_out_nz_opt
+#define __deref_opt_out_ecount_nz_opt(size)
+#define __deref_opt_out_bcount_nz_opt(size)
+#define __deref_opt_inout_opt
+#define __deref_opt_inout_ecount_opt(size)
+#define __deref_opt_inout_bcount_opt(size)
+#define __deref_opt_inout_ecount_part_opt(size,length)
+#define __deref_opt_inout_bcount_part_opt(size,length)
+#define __deref_opt_inout_ecount_full_opt(size)
+#define __deref_opt_inout_bcount_full_opt(size)
+#define __deref_opt_inout_z_opt
+#define __deref_opt_inout_ecount_z_opt(size)
+#define __deref_opt_inout_bcount_z_opt(size)
+#define __deref_opt_inout_nz_opt
+#define __deref_opt_inout_ecount_nz_opt(size)
+#define __deref_opt_inout_bcount_nz_opt(size)
+
+#define __success(expr)
+#define __nullterminated
+#define __nullnullterminated
+#define __reserved
+#define __checkReturn
+#define __typefix(ctype)
+#define __override
+#define __callback
+#define __format_string
+#define __blocksOn(resource)
+#define __control_entrypoint(category)
+#define __data_entrypoint(category)
+
+#ifndef __fallthrough
+    #define __fallthrough __inner_fallthrough
+#endif
+
+#ifndef __analysis_assume
+    #define __analysis_assume(expr)
+#endif
diff --git a/include/compat/mingw/specstrings_strict.h b/include/compat/mingw/specstrings_strict.h
index 982dd20..bb2b90c 100644
--- a/include/compat/mingw/specstrings_strict.h
+++ b/include/compat/mingw/specstrings_strict.h
@@ -1 +1 @@
-#define __reserved
+#define __reserved
diff --git a/include/compat/mingw/specstrings_undef.h b/include/compat/mingw/specstrings_undef.h
index 69d4315..82ed3f7 100644
--- a/include/compat/mingw/specstrings_undef.h
+++ b/include/compat/mingw/specstrings_undef.h
@@ -1,2 +1,2 @@
-#undef __reserved
-
+#undef __reserved
+
diff --git a/include/compat/msvc/alloca.h b/include/compat/msvc/alloca.h
index f8fa6f1..c0d7985 100644
--- a/include/compat/msvc/alloca.h
+++ b/include/compat/msvc/alloca.h
@@ -1 +1 @@
-#include <malloc.h>
+#include <malloc.h>
diff --git a/include/compat/msvc/inttypes.h b/include/compat/msvc/inttypes.h
index 2554277..4b3828a 100644
--- a/include/compat/msvc/inttypes.h
+++ b/include/compat/msvc/inttypes.h
@@ -1,305 +1,305 @@
-// ISO C9x  compliant inttypes.h for Microsoft Visual Studio
-// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 
-// 
-//  Copyright (c) 2006 Alexander Chemeris
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-// 
-//   1. Redistributions of source code must retain the above copyright notice,
-//      this list of conditions and the following disclaimer.
-// 
-//   2. Redistributions in binary form must reproduce the above copyright
-//      notice, this list of conditions and the following disclaimer in the
-//      documentation and/or other materials provided with the distribution.
-// 
-//   3. The name of the author may be used to endorse or promote products
-//      derived from this software without specific prior written permission.
-// 
-// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
-// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
-// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
-// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-// 
-///////////////////////////////////////////////////////////////////////////////
-
-#ifndef _MSC_VER // [
-#error "Use this header only with Microsoft Visual C++ compilers!"
-#endif // _MSC_VER ]
-
-#ifndef _MSC_INTTYPES_H_ // [
-#define _MSC_INTTYPES_H_
-
-#if _MSC_VER > 1000
-#pragma once
-#endif
-
-#include "stdint.h"
-
-// 7.8 Format conversion of integer types
-
-typedef struct {
-   intmax_t quot;
-   intmax_t rem;
-} imaxdiv_t;
-
-// 7.8.1 Macros for format specifiers
-
-#if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [   See footnote 185 at page 198
-
-// The fprintf macros for signed integers are:
-#define PRId8       "d"
-#define PRIi8       "i"
-#define PRIdLEAST8  "d"
-#define PRIiLEAST8  "i"
-#define PRIdFAST8   "d"
-#define PRIiFAST8   "i"
-
-#define PRId16       "hd"
-#define PRIi16       "hi"
-#define PRIdLEAST16  "hd"
-#define PRIiLEAST16  "hi"
-#define PRIdFAST16   "hd"
-#define PRIiFAST16   "hi"
-
-#define PRId32       "I32d"
-#define PRIi32       "I32i"
-#define PRIdLEAST32  "I32d"
-#define PRIiLEAST32  "I32i"
-#define PRIdFAST32   "I32d"
-#define PRIiFAST32   "I32i"
-
-#define PRId64       "I64d"
-#define PRIi64       "I64i"
-#define PRIdLEAST64  "I64d"
-#define PRIiLEAST64  "I64i"
-#define PRIdFAST64   "I64d"
-#define PRIiFAST64   "I64i"
-
-#define PRIdMAX     "I64d"
-#define PRIiMAX     "I64i"
-
-#define PRIdPTR     "Id"
-#define PRIiPTR     "Ii"
-
-// The fprintf macros for unsigned integers are:
-#define PRIo8       "o"
-#define PRIu8       "u"
-#define PRIx8       "x"
-#define PRIX8       "X"
-#define PRIoLEAST8  "o"
-#define PRIuLEAST8  "u"
-#define PRIxLEAST8  "x"
-#define PRIXLEAST8  "X"
-#define PRIoFAST8   "o"
-#define PRIuFAST8   "u"
-#define PRIxFAST8   "x"
-#define PRIXFAST8   "X"
-
-#define PRIo16       "ho"
-#define PRIu16       "hu"
-#define PRIx16       "hx"
-#define PRIX16       "hX"
-#define PRIoLEAST16  "ho"
-#define PRIuLEAST16  "hu"
-#define PRIxLEAST16  "hx"
-#define PRIXLEAST16  "hX"
-#define PRIoFAST16   "ho"
-#define PRIuFAST16   "hu"
-#define PRIxFAST16   "hx"
-#define PRIXFAST16   "hX"
-
-#define PRIo32       "I32o"
-#define PRIu32       "I32u"
-#define PRIx32       "I32x"
-#define PRIX32       "I32X"
-#define PRIoLEAST32  "I32o"
-#define PRIuLEAST32  "I32u"
-#define PRIxLEAST32  "I32x"
-#define PRIXLEAST32  "I32X"
-#define PRIoFAST32   "I32o"
-#define PRIuFAST32   "I32u"
-#define PRIxFAST32   "I32x"
-#define PRIXFAST32   "I32X"
-
-#define PRIo64       "I64o"
-#define PRIu64       "I64u"
-#define PRIx64       "I64x"
-#define PRIX64       "I64X"
-#define PRIoLEAST64  "I64o"
-#define PRIuLEAST64  "I64u"
-#define PRIxLEAST64  "I64x"
-#define PRIXLEAST64  "I64X"
-#define PRIoFAST64   "I64o"
-#define PRIuFAST64   "I64u"
-#define PRIxFAST64   "I64x"
-#define PRIXFAST64   "I64X"
-
-#define PRIoMAX     "I64o"
-#define PRIuMAX     "I64u"
-#define PRIxMAX     "I64x"
-#define PRIXMAX     "I64X"
-
-#define PRIoPTR     "Io"
-#define PRIuPTR     "Iu"
-#define PRIxPTR     "Ix"
-#define PRIXPTR     "IX"
-
-// The fscanf macros for signed integers are:
-#define SCNd8       "d"
-#define SCNi8       "i"
-#define SCNdLEAST8  "d"
-#define SCNiLEAST8  "i"
-#define SCNdFAST8   "d"
-#define SCNiFAST8   "i"
-
-#define SCNd16       "hd"
-#define SCNi16       "hi"
-#define SCNdLEAST16  "hd"
-#define SCNiLEAST16  "hi"
-#define SCNdFAST16   "hd"
-#define SCNiFAST16   "hi"
-
-#define SCNd32       "ld"
-#define SCNi32       "li"
-#define SCNdLEAST32  "ld"
-#define SCNiLEAST32  "li"
-#define SCNdFAST32   "ld"
-#define SCNiFAST32   "li"
-
-#define SCNd64       "I64d"
-#define SCNi64       "I64i"
-#define SCNdLEAST64  "I64d"
-#define SCNiLEAST64  "I64i"
-#define SCNdFAST64   "I64d"
-#define SCNiFAST64   "I64i"
-
-#define SCNdMAX     "I64d"
-#define SCNiMAX     "I64i"
-
-#ifdef _WIN64 // [
-#  define SCNdPTR     "I64d"
-#  define SCNiPTR     "I64i"
-#else  // _WIN64 ][
-#  define SCNdPTR     "ld"
-#  define SCNiPTR     "li"
-#endif  // _WIN64 ]
-
-// The fscanf macros for unsigned integers are:
-#define SCNo8       "o"
-#define SCNu8       "u"
-#define SCNx8       "x"
-#define SCNX8       "X"
-#define SCNoLEAST8  "o"
-#define SCNuLEAST8  "u"
-#define SCNxLEAST8  "x"
-#define SCNXLEAST8  "X"
-#define SCNoFAST8   "o"
-#define SCNuFAST8   "u"
-#define SCNxFAST8   "x"
-#define SCNXFAST8   "X"
-
-#define SCNo16       "ho"
-#define SCNu16       "hu"
-#define SCNx16       "hx"
-#define SCNX16       "hX"
-#define SCNoLEAST16  "ho"
-#define SCNuLEAST16  "hu"
-#define SCNxLEAST16  "hx"
-#define SCNXLEAST16  "hX"
-#define SCNoFAST16   "ho"
-#define SCNuFAST16   "hu"
-#define SCNxFAST16   "hx"
-#define SCNXFAST16   "hX"
-
-#define SCNo32       "lo"
-#define SCNu32       "lu"
-#define SCNx32       "lx"
-#define SCNX32       "lX"
-#define SCNoLEAST32  "lo"
-#define SCNuLEAST32  "lu"
-#define SCNxLEAST32  "lx"
-#define SCNXLEAST32  "lX"
-#define SCNoFAST32   "lo"
-#define SCNuFAST32   "lu"
-#define SCNxFAST32   "lx"
-#define SCNXFAST32   "lX"
-
-#define SCNo64       "I64o"
-#define SCNu64       "I64u"
-#define SCNx64       "I64x"
-#define SCNX64       "I64X"
-#define SCNoLEAST64  "I64o"
-#define SCNuLEAST64  "I64u"
-#define SCNxLEAST64  "I64x"
-#define SCNXLEAST64  "I64X"
-#define SCNoFAST64   "I64o"
-#define SCNuFAST64   "I64u"
-#define SCNxFAST64   "I64x"
-#define SCNXFAST64   "I64X"
-
-#define SCNoMAX     "I64o"
-#define SCNuMAX     "I64u"
-#define SCNxMAX     "I64x"
-#define SCNXMAX     "I64X"
-
-#ifdef _WIN64 // [
-#  define SCNoPTR     "I64o"
-#  define SCNuPTR     "I64u"
-#  define SCNxPTR     "I64x"
-#  define SCNXPTR     "I64X"
-#else  // _WIN64 ][
-#  define SCNoPTR     "lo"
-#  define SCNuPTR     "lu"
-#  define SCNxPTR     "lx"
-#  define SCNXPTR     "lX"
-#endif  // _WIN64 ]
-
-#endif // __STDC_FORMAT_MACROS ]
-
-// 7.8.2 Functions for greatest-width integer types
-
-// 7.8.2.1 The imaxabs function
-#define imaxabs _abs64
-
-// 7.8.2.2 The imaxdiv function
-
-// This is modified version of div() function from Microsoft's div.c found
-// in %MSVC.NET%\crt\src\div.c
-#ifdef STATIC_IMAXDIV // [
-static
-#else // STATIC_IMAXDIV ][
-_inline
-#endif // STATIC_IMAXDIV ]
-imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom)
-{
-   imaxdiv_t result;
-
-   result.quot = numer / denom;
-   result.rem = numer % denom;
-
-   if (numer < 0 && result.rem > 0) {
-      // did division wrong; must fix up
-      ++result.quot;
-      result.rem -= denom;
-   }
-
-   return result;
-}
-
-// 7.8.2.3 The strtoimax and strtoumax functions
-#define strtoimax _strtoi64
-#define strtoumax _strtoui64
-
-// 7.8.2.4 The wcstoimax and wcstoumax functions
-#define wcstoimax _wcstoi64
-#define wcstoumax _wcstoui64
-
-
-#endif // _MSC_INTTYPES_H_ ]
+// ISO C9x  compliant inttypes.h for Microsoft Visual Studio
+// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 
+// 
+//  Copyright (c) 2006 Alexander Chemeris
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// 
+//   1. Redistributions of source code must retain the above copyright notice,
+//      this list of conditions and the following disclaimer.
+// 
+//   2. Redistributions in binary form must reproduce the above copyright
+//      notice, this list of conditions and the following disclaimer in the
+//      documentation and/or other materials provided with the distribution.
+// 
+//   3. The name of the author may be used to endorse or promote products
+//      derived from this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// 
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef _MSC_INTTYPES_H_ // [
+#define _MSC_INTTYPES_H_
+
+#if _MSC_VER > 1000
+#pragma once
+#endif
+
+#include "stdint.h"
+
+// 7.8 Format conversion of integer types
+
+typedef struct { // value returned by imaxdiv(): the two results of one integer division
+   intmax_t quot; // quotient
+   intmax_t rem; // remainder
+} imaxdiv_t;
+
+// 7.8.1 Macros for format specifiers
+
+#if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [   See footnote 185 at page 198
+
+// The fprintf macros for signed integers are:
+#define PRId8       "d"
+#define PRIi8       "i"
+#define PRIdLEAST8  "d"
+#define PRIiLEAST8  "i"
+#define PRIdFAST8   "d"
+#define PRIiFAST8   "i"
+
+#define PRId16       "hd"
+#define PRIi16       "hi"
+#define PRIdLEAST16  "hd"
+#define PRIiLEAST16  "hi"
+#define PRIdFAST16   "hd"
+#define PRIiFAST16   "hi"
+
+#define PRId32       "I32d"
+#define PRIi32       "I32i"
+#define PRIdLEAST32  "I32d"
+#define PRIiLEAST32  "I32i"
+#define PRIdFAST32   "I32d"
+#define PRIiFAST32   "I32i"
+
+#define PRId64       "I64d"
+#define PRIi64       "I64i"
+#define PRIdLEAST64  "I64d"
+#define PRIiLEAST64  "I64i"
+#define PRIdFAST64   "I64d"
+#define PRIiFAST64   "I64i"
+
+#define PRIdMAX     "I64d"
+#define PRIiMAX     "I64i"
+
+#define PRIdPTR     "Id"
+#define PRIiPTR     "Ii"
+
+// The fprintf macros for unsigned integers are:
+#define PRIo8       "o"
+#define PRIu8       "u"
+#define PRIx8       "x"
+#define PRIX8       "X"
+#define PRIoLEAST8  "o"
+#define PRIuLEAST8  "u"
+#define PRIxLEAST8  "x"
+#define PRIXLEAST8  "X"
+#define PRIoFAST8   "o"
+#define PRIuFAST8   "u"
+#define PRIxFAST8   "x"
+#define PRIXFAST8   "X"
+
+#define PRIo16       "ho"
+#define PRIu16       "hu"
+#define PRIx16       "hx"
+#define PRIX16       "hX"
+#define PRIoLEAST16  "ho"
+#define PRIuLEAST16  "hu"
+#define PRIxLEAST16  "hx"
+#define PRIXLEAST16  "hX"
+#define PRIoFAST16   "ho"
+#define PRIuFAST16   "hu"
+#define PRIxFAST16   "hx"
+#define PRIXFAST16   "hX"
+
+#define PRIo32       "I32o"
+#define PRIu32       "I32u"
+#define PRIx32       "I32x"
+#define PRIX32       "I32X"
+#define PRIoLEAST32  "I32o"
+#define PRIuLEAST32  "I32u"
+#define PRIxLEAST32  "I32x"
+#define PRIXLEAST32  "I32X"
+#define PRIoFAST32   "I32o"
+#define PRIuFAST32   "I32u"
+#define PRIxFAST32   "I32x"
+#define PRIXFAST32   "I32X"
+
+#define PRIo64       "I64o"
+#define PRIu64       "I64u"
+#define PRIx64       "I64x"
+#define PRIX64       "I64X"
+#define PRIoLEAST64  "I64o"
+#define PRIuLEAST64  "I64u"
+#define PRIxLEAST64  "I64x"
+#define PRIXLEAST64  "I64X"
+#define PRIoFAST64   "I64o"
+#define PRIuFAST64   "I64u"
+#define PRIxFAST64   "I64x"
+#define PRIXFAST64   "I64X"
+
+#define PRIoMAX     "I64o"
+#define PRIuMAX     "I64u"
+#define PRIxMAX     "I64x"
+#define PRIXMAX     "I64X"
+
+#define PRIoPTR     "Io"
+#define PRIuPTR     "Iu"
+#define PRIxPTR     "Ix"
+#define PRIXPTR     "IX"
+
+// The fscanf macros for signed integers are:
+#define SCNd8       "d" // NOTE(review): the 8-bit SCN macros in this group have no length modifier, so scanf stores through an int* - confirm callers
+#define SCNi8       "i"
+#define SCNdLEAST8  "d"
+#define SCNiLEAST8  "i"
+#define SCNdFAST8   "d"
+#define SCNiFAST8   "i"
+
+#define SCNd16       "hd"
+#define SCNi16       "hi"
+#define SCNdLEAST16  "hd"
+#define SCNiLEAST16  "hi"
+#define SCNdFAST16   "hd"
+#define SCNiFAST16   "hi"
+
+#define SCNd32       "ld"
+#define SCNi32       "li"
+#define SCNdLEAST32  "ld"
+#define SCNiLEAST32  "li"
+#define SCNdFAST32   "ld"
+#define SCNiFAST32   "li"
+
+#define SCNd64       "I64d"
+#define SCNi64       "I64i"
+#define SCNdLEAST64  "I64d"
+#define SCNiLEAST64  "I64i"
+#define SCNdFAST64   "I64d"
+#define SCNiFAST64   "I64i"
+
+#define SCNdMAX     "I64d"
+#define SCNiMAX     "I64i"
+
+#ifdef _WIN64 // [
+#  define SCNdPTR     "I64d"
+#  define SCNiPTR     "I64i"
+#else  // _WIN64 ][
+#  define SCNdPTR     "ld"
+#  define SCNiPTR     "li"
+#endif  // _WIN64 ]
+
+// The fscanf macros for unsigned integers are:
+#define SCNo8       "o" // NOTE(review): the 8-bit SCN macros in this group have no length modifier, so scanf stores through an unsigned int* - confirm callers
+#define SCNu8       "u"
+#define SCNx8       "x"
+#define SCNX8       "X"
+#define SCNoLEAST8  "o"
+#define SCNuLEAST8  "u"
+#define SCNxLEAST8  "x"
+#define SCNXLEAST8  "X"
+#define SCNoFAST8   "o"
+#define SCNuFAST8   "u"
+#define SCNxFAST8   "x"
+#define SCNXFAST8   "X"
+
+#define SCNo16       "ho"
+#define SCNu16       "hu"
+#define SCNx16       "hx"
+#define SCNX16       "hX"
+#define SCNoLEAST16  "ho"
+#define SCNuLEAST16  "hu"
+#define SCNxLEAST16  "hx"
+#define SCNXLEAST16  "hX"
+#define SCNoFAST16   "ho"
+#define SCNuFAST16   "hu"
+#define SCNxFAST16   "hx"
+#define SCNXFAST16   "hX"
+
+#define SCNo32       "lo"
+#define SCNu32       "lu"
+#define SCNx32       "lx"
+#define SCNX32       "lX"
+#define SCNoLEAST32  "lo"
+#define SCNuLEAST32  "lu"
+#define SCNxLEAST32  "lx"
+#define SCNXLEAST32  "lX"
+#define SCNoFAST32   "lo"
+#define SCNuFAST32   "lu"
+#define SCNxFAST32   "lx"
+#define SCNXFAST32   "lX"
+
+#define SCNo64       "I64o"
+#define SCNu64       "I64u"
+#define SCNx64       "I64x"
+#define SCNX64       "I64X"
+#define SCNoLEAST64  "I64o"
+#define SCNuLEAST64  "I64u"
+#define SCNxLEAST64  "I64x"
+#define SCNXLEAST64  "I64X"
+#define SCNoFAST64   "I64o"
+#define SCNuFAST64   "I64u"
+#define SCNxFAST64   "I64x"
+#define SCNXFAST64   "I64X"
+
+#define SCNoMAX     "I64o"
+#define SCNuMAX     "I64u"
+#define SCNxMAX     "I64x"
+#define SCNXMAX     "I64X"
+
+#ifdef _WIN64 // [
+#  define SCNoPTR     "I64o"
+#  define SCNuPTR     "I64u"
+#  define SCNxPTR     "I64x"
+#  define SCNXPTR     "I64X"
+#else  // _WIN64 ][
+#  define SCNoPTR     "lo"
+#  define SCNuPTR     "lu"
+#  define SCNxPTR     "lx"
+#  define SCNXPTR     "lX"
+#endif  // _WIN64 ]
+
+#endif // __STDC_FORMAT_MACROS ]
+
+// 7.8.2 Functions for greatest-width integer types
+
+// 7.8.2.1 The imaxabs function
+#define imaxabs _abs64
+
+// 7.8.2.2 The imaxdiv function
+
+// imaxdiv: a modified version of the div() function from Microsoft's div.c
+// (found in %MSVC.NET%\crt\src\div.c); returns numer / denom as a quot/rem pair.
+#ifdef STATIC_IMAXDIV // [ defining STATIC_IMAXDIV prefixes the definition with static
+static
+#else // STATIC_IMAXDIV ][ otherwise the definition is prefixed with _inline
+_inline
+#endif // STATIC_IMAXDIV ]
+imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom) // denom == 0 is not checked here
+{
+   imaxdiv_t result;
+
+   result.quot = numer / denom; // raw quotient from the built-in operator
+   result.rem = numer % denom; // raw remainder from the built-in operator
+
+   if (numer < 0 && result.rem > 0) {
+      // negative numerator but positive remainder: bump quot by 1 and pull rem down by denom
+      ++result.quot;
+      result.rem -= denom; // quot * denom + rem is unchanged by this pair of adjustments
+   }
+
+   return result;
+}
+
+// 7.8.2.3 The strtoimax and strtoumax functions
+#define strtoimax _strtoi64
+#define strtoumax _strtoui64
+
+// 7.8.2.4 The wcstoimax and wcstoumax functions
+#define wcstoimax _wcstoi64
+#define wcstoumax _wcstoui64
+
+
+#endif // _MSC_INTTYPES_H_ ]
diff --git a/include/compat/msvc/stdint.h b/include/compat/msvc/stdint.h
index 59d0673..d02608a 100644
--- a/include/compat/msvc/stdint.h
+++ b/include/compat/msvc/stdint.h
@@ -1,247 +1,247 @@
-// ISO C9x  compliant stdint.h for Microsoft Visual Studio
-// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 
-// 
-//  Copyright (c) 2006-2008 Alexander Chemeris
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-// 
-//   1. Redistributions of source code must retain the above copyright notice,
-//      this list of conditions and the following disclaimer.
-// 
-//   2. Redistributions in binary form must reproduce the above copyright
-//      notice, this list of conditions and the following disclaimer in the
-//      documentation and/or other materials provided with the distribution.
-// 
-//   3. The name of the author may be used to endorse or promote products
-//      derived from this software without specific prior written permission.
-// 
-// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
-// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
-// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
-// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-// 
-///////////////////////////////////////////////////////////////////////////////
-
-#ifndef _MSC_VER // [
-#error "Use this header only with Microsoft Visual C++ compilers!"
-#endif // _MSC_VER ]
-
-#ifndef _MSC_STDINT_H_ // [
-#define _MSC_STDINT_H_
-
-#if _MSC_VER > 1000
-#pragma once
-#endif
-
-#include <limits.h>
-
-// For Visual Studio 6 in C++ mode and for many Visual Studio versions when
-// compiling for ARM we should wrap <wchar.h> include with 'extern "C++" {}'
-// or compiler give many errors like this:
-//   error C2733: second C linkage of overloaded function 'wmemchr' not allowed
-#ifdef __cplusplus
-extern "C" {
-#endif
-#  include <wchar.h>
-#ifdef __cplusplus
-}
-#endif
-
-// Define _W64 macros to mark types changing their size, like intptr_t.
-#ifndef _W64
-#  if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300
-#     define _W64 __w64
-#  else
-#     define _W64
-#  endif
-#endif
-
-
-// 7.18.1 Integer types
-
-// 7.18.1.1 Exact-width integer types
-
-// Visual Studio 6 and Embedded Visual C++ 4 doesn't
-// realize that, e.g. char has the same size as __int8
-// so we give up on __intX for them.
-#if (_MSC_VER < 1300)
-   typedef signed char       int8_t;
-   typedef signed short      int16_t;
-   typedef signed int        int32_t;
-   typedef unsigned char     uint8_t;
-   typedef unsigned short    uint16_t;
-   typedef unsigned int      uint32_t;
-#else
-   typedef signed __int8     int8_t;
-   typedef signed __int16    int16_t;
-   typedef signed __int32    int32_t;
-   typedef unsigned __int8   uint8_t;
-   typedef unsigned __int16  uint16_t;
-   typedef unsigned __int32  uint32_t;
-#endif
-typedef signed __int64       int64_t;
-typedef unsigned __int64     uint64_t;
-
-
-// 7.18.1.2 Minimum-width integer types
-typedef int8_t    int_least8_t;
-typedef int16_t   int_least16_t;
-typedef int32_t   int_least32_t;
-typedef int64_t   int_least64_t;
-typedef uint8_t   uint_least8_t;
-typedef uint16_t  uint_least16_t;
-typedef uint32_t  uint_least32_t;
-typedef uint64_t  uint_least64_t;
-
-// 7.18.1.3 Fastest minimum-width integer types
-typedef int8_t    int_fast8_t;
-typedef int16_t   int_fast16_t;
-typedef int32_t   int_fast32_t;
-typedef int64_t   int_fast64_t;
-typedef uint8_t   uint_fast8_t;
-typedef uint16_t  uint_fast16_t;
-typedef uint32_t  uint_fast32_t;
-typedef uint64_t  uint_fast64_t;
-
-// 7.18.1.4 Integer types capable of holding object pointers
-#ifdef _WIN64 // [
-   typedef signed __int64    intptr_t;
-   typedef unsigned __int64  uintptr_t;
-#else // _WIN64 ][
-   typedef _W64 signed int   intptr_t;
-   typedef _W64 unsigned int uintptr_t;
-#endif // _WIN64 ]
-
-// 7.18.1.5 Greatest-width integer types
-typedef int64_t   intmax_t;
-typedef uint64_t  uintmax_t;
-
-
-// 7.18.2 Limits of specified-width integer types
-
-#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [   See footnote 220 at page 257 and footnote 221 at page 259
-
-// 7.18.2.1 Limits of exact-width integer types
-#define INT8_MIN     ((int8_t)_I8_MIN)
-#define INT8_MAX     _I8_MAX
-#define INT16_MIN    ((int16_t)_I16_MIN)
-#define INT16_MAX    _I16_MAX
-#define INT32_MIN    ((int32_t)_I32_MIN)
-#define INT32_MAX    _I32_MAX
-#define INT64_MIN    ((int64_t)_I64_MIN)
-#define INT64_MAX    _I64_MAX
-#define UINT8_MAX    _UI8_MAX
-#define UINT16_MAX   _UI16_MAX
-#define UINT32_MAX   _UI32_MAX
-#define UINT64_MAX   _UI64_MAX
-
-// 7.18.2.2 Limits of minimum-width integer types
-#define INT_LEAST8_MIN    INT8_MIN
-#define INT_LEAST8_MAX    INT8_MAX
-#define INT_LEAST16_MIN   INT16_MIN
-#define INT_LEAST16_MAX   INT16_MAX
-#define INT_LEAST32_MIN   INT32_MIN
-#define INT_LEAST32_MAX   INT32_MAX
-#define INT_LEAST64_MIN   INT64_MIN
-#define INT_LEAST64_MAX   INT64_MAX
-#define UINT_LEAST8_MAX   UINT8_MAX
-#define UINT_LEAST16_MAX  UINT16_MAX
-#define UINT_LEAST32_MAX  UINT32_MAX
-#define UINT_LEAST64_MAX  UINT64_MAX
-
-// 7.18.2.3 Limits of fastest minimum-width integer types
-#define INT_FAST8_MIN    INT8_MIN
-#define INT_FAST8_MAX    INT8_MAX
-#define INT_FAST16_MIN   INT16_MIN
-#define INT_FAST16_MAX   INT16_MAX
-#define INT_FAST32_MIN   INT32_MIN
-#define INT_FAST32_MAX   INT32_MAX
-#define INT_FAST64_MIN   INT64_MIN
-#define INT_FAST64_MAX   INT64_MAX
-#define UINT_FAST8_MAX   UINT8_MAX
-#define UINT_FAST16_MAX  UINT16_MAX
-#define UINT_FAST32_MAX  UINT32_MAX
-#define UINT_FAST64_MAX  UINT64_MAX
-
-// 7.18.2.4 Limits of integer types capable of holding object pointers
-#ifdef _WIN64 // [
-#  define INTPTR_MIN   INT64_MIN
-#  define INTPTR_MAX   INT64_MAX
-#  define UINTPTR_MAX  UINT64_MAX
-#else // _WIN64 ][
-#  define INTPTR_MIN   INT32_MIN
-#  define INTPTR_MAX   INT32_MAX
-#  define UINTPTR_MAX  UINT32_MAX
-#endif // _WIN64 ]
-
-// 7.18.2.5 Limits of greatest-width integer types
-#define INTMAX_MIN   INT64_MIN
-#define INTMAX_MAX   INT64_MAX
-#define UINTMAX_MAX  UINT64_MAX
-
-// 7.18.3 Limits of other integer types
-
-#ifdef _WIN64 // [
-#  define PTRDIFF_MIN  _I64_MIN
-#  define PTRDIFF_MAX  _I64_MAX
-#else  // _WIN64 ][
-#  define PTRDIFF_MIN  _I32_MIN
-#  define PTRDIFF_MAX  _I32_MAX
-#endif  // _WIN64 ]
-
-#define SIG_ATOMIC_MIN  INT_MIN
-#define SIG_ATOMIC_MAX  INT_MAX
-
-#ifndef SIZE_MAX // [
-#  ifdef _WIN64 // [
-#     define SIZE_MAX  _UI64_MAX
-#  else // _WIN64 ][
-#     define SIZE_MAX  _UI32_MAX
-#  endif // _WIN64 ]
-#endif // SIZE_MAX ]
-
-// WCHAR_MIN and WCHAR_MAX are also defined in <wchar.h>
-#ifndef WCHAR_MIN // [
-#  define WCHAR_MIN  0
-#endif  // WCHAR_MIN ]
-#ifndef WCHAR_MAX // [
-#  define WCHAR_MAX  _UI16_MAX
-#endif  // WCHAR_MAX ]
-
-#define WINT_MIN  0
-#define WINT_MAX  _UI16_MAX
-
-#endif // __STDC_LIMIT_MACROS ]
-
-
-// 7.18.4 Limits of other integer types
-
-#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [   See footnote 224 at page 260
-
-// 7.18.4.1 Macros for minimum-width integer constants
-
-#define INT8_C(val)  val##i8
-#define INT16_C(val) val##i16
-#define INT32_C(val) val##i32
-#define INT64_C(val) val##i64
-
-#define UINT8_C(val)  val##ui8
-#define UINT16_C(val) val##ui16
-#define UINT32_C(val) val##ui32
-#define UINT64_C(val) val##ui64
-
-// 7.18.4.2 Macros for greatest-width integer constants
-#define INTMAX_C   INT64_C
-#define UINTMAX_C  UINT64_C
-
-#endif // __STDC_CONSTANT_MACROS ]
-
-
-#endif // _MSC_STDINT_H_ ]
+// ISO C9x  compliant stdint.h for Microsoft Visual Studio
+// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 
+// 
+//  Copyright (c) 2006-2008 Alexander Chemeris
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// 
+//   1. Redistributions of source code must retain the above copyright notice,
+//      this list of conditions and the following disclaimer.
+// 
+//   2. Redistributions in binary form must reproduce the above copyright
+//      notice, this list of conditions and the following disclaimer in the
+//      documentation and/or other materials provided with the distribution.
+// 
+//   3. The name of the author may be used to endorse or promote products
+//      derived from this software without specific prior written permission.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// 
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef _MSC_STDINT_H_ // [
+#define _MSC_STDINT_H_
+
+#if _MSC_VER > 1000
+#pragma once
+#endif
+
+#include <limits.h>
+
+// For Visual Studio 6 in C++ mode, and for many Visual Studio versions when compiling
+// for ARM, the <wchar.h> include should be wrapped in 'extern "C++" {}', or the compiler
+// gives many errors like the one below. NOTE(review): the code wraps it in 'extern "C"'.
+//   error C2733: second C linkage of overloaded function 'wmemchr' not allowed
+#ifdef __cplusplus
+extern "C" {
+#endif
+#  include <wchar.h>
+#ifdef __cplusplus
+}
+#endif
+
+// Define _W64 macros to mark types changing their size, like intptr_t.
+#ifndef _W64
+#  if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300
+#     define _W64 __w64
+#  else
+#     define _W64
+#  endif
+#endif
+
+
+// 7.18.1 Integer types
+
+// 7.18.1.1 Exact-width integer types
+
+// Visual Studio 6 and Embedded Visual C++ 4 don't
+// realize that, e.g., char has the same size as __int8,
+// so we give up on __intX for them.
+#if (_MSC_VER < 1300)
+   typedef signed char       int8_t;
+   typedef signed short      int16_t;
+   typedef signed int        int32_t;
+   typedef unsigned char     uint8_t;
+   typedef unsigned short    uint16_t;
+   typedef unsigned int      uint32_t;
+#else
+   typedef signed __int8     int8_t;
+   typedef signed __int16    int16_t;
+   typedef signed __int32    int32_t;
+   typedef unsigned __int8   uint8_t;
+   typedef unsigned __int16  uint16_t;
+   typedef unsigned __int32  uint32_t;
+#endif
+typedef signed __int64       int64_t;
+typedef unsigned __int64     uint64_t;
+
+
+// 7.18.1.2 Minimum-width integer types
+typedef int8_t    int_least8_t;
+typedef int16_t   int_least16_t;
+typedef int32_t   int_least32_t;
+typedef int64_t   int_least64_t;
+typedef uint8_t   uint_least8_t;
+typedef uint16_t  uint_least16_t;
+typedef uint32_t  uint_least32_t;
+typedef uint64_t  uint_least64_t;
+
+// 7.18.1.3 Fastest minimum-width integer types
+typedef int8_t    int_fast8_t;
+typedef int16_t   int_fast16_t;
+typedef int32_t   int_fast32_t;
+typedef int64_t   int_fast64_t;
+typedef uint8_t   uint_fast8_t;
+typedef uint16_t  uint_fast16_t;
+typedef uint32_t  uint_fast32_t;
+typedef uint64_t  uint_fast64_t;
+
+// 7.18.1.4 Integer types capable of holding object pointers
+#ifdef _WIN64 // [
+   typedef signed __int64    intptr_t;
+   typedef unsigned __int64  uintptr_t;
+#else // _WIN64 ][
+   typedef _W64 signed int   intptr_t;
+   typedef _W64 unsigned int uintptr_t;
+#endif // _WIN64 ]
+
+// 7.18.1.5 Greatest-width integer types
+typedef int64_t   intmax_t;
+typedef uint64_t  uintmax_t;
+
+
+// 7.18.2 Limits of specified-width integer types
+
+#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [   See footnote 220 at page 257 and footnote 221 at page 259
+
+// 7.18.2.1 Limits of exact-width integer types
+#define INT8_MIN     ((int8_t)_I8_MIN)
+#define INT8_MAX     _I8_MAX
+#define INT16_MIN    ((int16_t)_I16_MIN)
+#define INT16_MAX    _I16_MAX
+#define INT32_MIN    ((int32_t)_I32_MIN)
+#define INT32_MAX    _I32_MAX
+#define INT64_MIN    ((int64_t)_I64_MIN)
+#define INT64_MAX    _I64_MAX
+#define UINT8_MAX    _UI8_MAX
+#define UINT16_MAX   _UI16_MAX
+#define UINT32_MAX   _UI32_MAX
+#define UINT64_MAX   _UI64_MAX
+
+// 7.18.2.2 Limits of minimum-width integer types
+#define INT_LEAST8_MIN    INT8_MIN
+#define INT_LEAST8_MAX    INT8_MAX
+#define INT_LEAST16_MIN   INT16_MIN
+#define INT_LEAST16_MAX   INT16_MAX
+#define INT_LEAST32_MIN   INT32_MIN
+#define INT_LEAST32_MAX   INT32_MAX
+#define INT_LEAST64_MIN   INT64_MIN
+#define INT_LEAST64_MAX   INT64_MAX
+#define UINT_LEAST8_MAX   UINT8_MAX
+#define UINT_LEAST16_MAX  UINT16_MAX
+#define UINT_LEAST32_MAX  UINT32_MAX
+#define UINT_LEAST64_MAX  UINT64_MAX
+
+// 7.18.2.3 Limits of fastest minimum-width integer types
+#define INT_FAST8_MIN    INT8_MIN
+#define INT_FAST8_MAX    INT8_MAX
+#define INT_FAST16_MIN   INT16_MIN
+#define INT_FAST16_MAX   INT16_MAX
+#define INT_FAST32_MIN   INT32_MIN
+#define INT_FAST32_MAX   INT32_MAX
+#define INT_FAST64_MIN   INT64_MIN
+#define INT_FAST64_MAX   INT64_MAX
+#define UINT_FAST8_MAX   UINT8_MAX
+#define UINT_FAST16_MAX  UINT16_MAX
+#define UINT_FAST32_MAX  UINT32_MAX
+#define UINT_FAST64_MAX  UINT64_MAX
+
+// 7.18.2.4 Limits of integer types capable of holding object pointers
+#ifdef _WIN64 // [
+#  define INTPTR_MIN   INT64_MIN
+#  define INTPTR_MAX   INT64_MAX
+#  define UINTPTR_MAX  UINT64_MAX
+#else // _WIN64 ][
+#  define INTPTR_MIN   INT32_MIN
+#  define INTPTR_MAX   INT32_MAX
+#  define UINTPTR_MAX  UINT32_MAX
+#endif // _WIN64 ]
+
+// 7.18.2.5 Limits of greatest-width integer types
+#define INTMAX_MIN   INT64_MIN
+#define INTMAX_MAX   INT64_MAX
+#define UINTMAX_MAX  UINT64_MAX
+
+// 7.18.3 Limits of other integer types
+
+#ifdef _WIN64 // [
+#  define PTRDIFF_MIN  _I64_MIN
+#  define PTRDIFF_MAX  _I64_MAX
+#else  // _WIN64 ][
+#  define PTRDIFF_MIN  _I32_MIN
+#  define PTRDIFF_MAX  _I32_MAX
+#endif  // _WIN64 ]
+
+#define SIG_ATOMIC_MIN  INT_MIN
+#define SIG_ATOMIC_MAX  INT_MAX
+
+#ifndef SIZE_MAX // [
+#  ifdef _WIN64 // [
+#     define SIZE_MAX  _UI64_MAX
+#  else // _WIN64 ][
+#     define SIZE_MAX  _UI32_MAX
+#  endif // _WIN64 ]
+#endif // SIZE_MAX ]
+
+// WCHAR_MIN and WCHAR_MAX are also defined in <wchar.h>
+#ifndef WCHAR_MIN // [
+#  define WCHAR_MIN  0
+#endif  // WCHAR_MIN ]
+#ifndef WCHAR_MAX // [
+#  define WCHAR_MAX  _UI16_MAX
+#endif  // WCHAR_MAX ]
+
+#define WINT_MIN  0
+#define WINT_MAX  _UI16_MAX
+
+#endif // __STDC_LIMIT_MACROS ]
+
+
+// 7.18.4 Macros for integer constants
+
+#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [   See footnote 224 at page 260
+
+// 7.18.4.1 Macros for minimum-width integer constants
+
+#define INT8_C(val)  val##i8
+#define INT16_C(val) val##i16
+#define INT32_C(val) val##i32
+#define INT64_C(val) val##i64
+
+#define UINT8_C(val)  val##ui8
+#define UINT16_C(val) val##ui16
+#define UINT32_C(val) val##ui32
+#define UINT64_C(val) val##ui64
+
+// 7.18.4.2 Macros for greatest-width integer constants
+#define INTMAX_C   INT64_C
+#define UINTMAX_C  UINT64_C
+
+#endif // __STDC_CONSTANT_MACROS ]
+
+
+#endif // _MSC_STDINT_H_ ]
diff --git a/include/compat/nacl/memory.h b/include/compat/nacl/memory.h
index 8e69c1f..3b2f590 100644
--- a/include/compat/nacl/memory.h
+++ b/include/compat/nacl/memory.h
@@ -1 +1 @@
-#include <string.h>
+#include <string.h>
diff --git a/premake/bx.lua b/premake/bx.lua
index 53d1cb8..ce1fd0d 100644
--- a/premake/bx.lua
+++ b/premake/bx.lua
@@ -1,7 +1,7 @@
-project "bx"
-	uuid "4db0b09e-d6df-11e1-a0ec-65ccdd6a022f"
-	kind "StaticLib"
-
-	files {
-		"../include/**.h",
-	}
+project "bx"
+	uuid "4db0b09e-d6df-11e1-a0ec-65ccdd6a022f"
+	kind "StaticLib"
+
+	files {
+		"../include/**.h",
+	}