diff --git a/include/bx/endian.h b/include/bx/endian.h
index 23e9a66..c687d4f 100644
--- a/include/bx/endian.h
+++ b/include/bx/endian.h
@@ -10,78 +10,41 @@
 
 namespace bx
 {
-	inline uint16_t endianSwap(uint16_t _in)
-	{
-		return (_in>>8) | (_in<<8);
-	}
-	
-	inline uint32_t endianSwap(uint32_t _in)
-	{
-		return (_in>>24) | (_in<<24)
-			 | ( (_in&0x00ff0000)>>8) | ( (_in&0x0000ff00)<<8)
-			 ;
-	}
+	///
+	int16_t endianSwap(int16_t _in);
 
-	inline uint64_t endianSwap(uint64_t _in)
-	{
-		return (_in>>56) | (_in<<56)
-			 | ( (_in&UINT64_C(0x00ff000000000000) )>>40) | ( (_in&UINT64_C(0x000000000000ff00) )<<40)
-			 | ( (_in&UINT64_C(0x0000ff0000000000) )>>24) | ( (_in&UINT64_C(0x0000000000ff0000) )<<24)
-			 | ( (_in&UINT64_C(0x000000ff00000000) )>>8)  | ( (_in&UINT64_C(0x00000000ff000000) )<<8)
-			 ;
-	}
+	///
+	uint16_t endianSwap(uint16_t _in);
 
-	inline int16_t endianSwap(int16_t _in)
-	{
-		return (int16_t)endianSwap( (uint16_t)_in);
-	}
+	///
+	int32_t endianSwap(int32_t _in);
 
-	inline int32_t endianSwap(int32_t _in)
-	{
-		return (int32_t)endianSwap( (uint32_t)_in);
-	}
+	///
+	uint32_t endianSwap(uint32_t _in);
 
-	inline int64_t endianSwap(int64_t _in)
-	{
-		return (int64_t)endianSwap( (uint64_t)_in);
-	}
+	///
+	int64_t endianSwap(int64_t _in);
+
+	///
+	uint64_t endianSwap(uint64_t _in);
 
 	/// Input argument is encoded as little endian, convert it if neccessary
 	/// depending on host CPU endianess.
 	template <typename Ty>
-	inline Ty toLittleEndian(const Ty _in)
-	{
-#if BX_CPU_ENDIAN_BIG
-		return endianSwap(_in);
-#else
-		return _in;
-#endif // BX_CPU_ENDIAN_BIG
-	}
+	Ty toLittleEndian(const Ty _in);
 
 	/// Input argument is encoded as big endian, convert it if neccessary
 	/// depending on host CPU endianess.
 	template <typename Ty>
-	inline Ty toBigEndian(const Ty _in)
-	{
-#if BX_CPU_ENDIAN_LITTLE
-		return endianSwap(_in);
-#else
-		return _in;
-#endif // BX_CPU_ENDIAN_LITTLE
-	}
+	Ty toBigEndian(const Ty _in);
 
 	/// If _littleEndian is true, converts input argument to from little endian
 	/// to host CPU endiness.
 	template <typename Ty>
-	inline Ty toHostEndian(const Ty _in, bool _fromLittleEndian)
-	{
-#if BX_CPU_ENDIAN_LITTLE
-		return _fromLittleEndian ? _in : endianSwap(_in);
-#else
-		return _fromLittleEndian ? endianSwap(_in) : _in;
-#endif // BX_CPU_ENDIAN_LITTLE
-	}
+	Ty toHostEndian(const Ty _in, bool _fromLittleEndian);
 
 } // namespace bx
 
+#include "endian.inl"
+
 #endif // BX_ENDIAN_H_HEADER_GUARD
diff --git a/include/bx/endian.inl b/include/bx/endian.inl
new file mode 100644
index 0000000..6ec7572
--- /dev/null
+++ b/include/bx/endian.inl
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2010-2017 Branimir Karadzic. All rights reserved.
+ * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
+ */
+
+#ifndef BX_ENDIAN_H_HEADER_GUARD
+#	error "Must be included from bx/endian.h!"
+#endif // BX_ENDIAN_H_HEADER_GUARD
+
+namespace bx
+{
+	inline int16_t endianSwap(int16_t _in)
+	{
+		return (int16_t)endianSwap( (uint16_t)_in);
+	}
+
+	inline uint16_t endianSwap(uint16_t _in)
+	{
+		return (_in>>8) | (_in<<8);
+	}
+
+	inline int32_t endianSwap(int32_t _in)
+	{
+		return (int32_t)endianSwap( (uint32_t)_in);
+	}
+
+	inline uint32_t endianSwap(uint32_t _in)
+	{
+		return (  _in            >>24) | (  _in            <<24)
+			 | ( (_in&0x00ff0000)>> 8) | ( (_in&0x0000ff00)<< 8)
+			 ;
+	}
+
+	inline int64_t endianSwap(int64_t _in)
+	{
+		return (int64_t)endianSwap( (uint64_t)_in);
+	}
+
+	inline uint64_t endianSwap(uint64_t _in)
+	{
+		return   (_in                               >>56) | (  _in                               <<56)
+			 | ( (_in&UINT64_C(0x00ff000000000000) )>>40) | ( (_in&UINT64_C(0x000000000000ff00) )<<40)
+			 | ( (_in&UINT64_C(0x0000ff0000000000) )>>24) | ( (_in&UINT64_C(0x0000000000ff0000) )<<24)
+			 | ( (_in&UINT64_C(0x000000ff00000000) )>> 8) | ( (_in&UINT64_C(0x00000000ff000000) )<< 8)
+			 ;
+	}
+
+	template <typename Ty>
+	inline Ty toLittleEndian(const Ty _in)
+	{
+#if BX_CPU_ENDIAN_BIG
+		return endianSwap(_in);
+#else
+		return _in;
+#endif // BX_CPU_ENDIAN_BIG
+	}
+
+	template <typename Ty>
+	inline Ty toBigEndian(const Ty _in)
+	{
+#if BX_CPU_ENDIAN_LITTLE
+		return endianSwap(_in);
+#else
+		return _in;
+#endif // BX_CPU_ENDIAN_LITTLE
+	}
+
+	template <typename Ty>
+	inline Ty toHostEndian(const Ty _in, bool _fromLittleEndian)
+	{
+#if BX_CPU_ENDIAN_LITTLE
+		return _fromLittleEndian ? _in : endianSwap(_in);
+#else
+		return _fromLittleEndian ? endianSwap(_in) : _in;
+#endif // BX_CPU_ENDIAN_LITTLE
+	}
+
+} // namespace bx
diff --git a/include/bx/rng.h b/include/bx/rng.h
index cff323c..91eade6 100644
--- a/include/bx/rng.h
+++ b/include/bx/rng.h
@@ -12,83 +12,54 @@
 
 namespace bx
 {
-	// George Marsaglia's MWC
+	/// George Marsaglia's MWC
 	class RngMwc
 	{
 	public:
-		RngMwc(uint32_t _z = 12345, uint32_t _w = 65435)
-			: m_z(_z)
-			, m_w(_w)
-		{
-		}
+		///
+		RngMwc(uint32_t _z = 12345, uint32_t _w = 65435);
 
-		void reset(uint32_t _z = 12345, uint32_t _w = 65435)
-		{
-			m_z = _z;
-			m_w = _w;
-		}
+		///
+		void reset(uint32_t _z = 12345, uint32_t _w = 65435);
 
-		uint32_t gen()
-		{
-			m_z = 36969*(m_z&65535)+(m_z>>16);
-			m_w = 18000*(m_w&65535)+(m_w>>16);
-			return (m_z<<16)+m_w;
-		}
+		///
+		uint32_t gen();
 
 	private:
 		uint32_t m_z;
 		uint32_t m_w;
 	};
 
-	// George Marsaglia's FIB
+	/// George Marsaglia's FIB
 	class RngFib
 	{
 	public:
-		RngFib()
-			: m_a(9983651)
-			, m_b(95746118)
-		{
-		}
+		///
+		RngFib(uint32_t _a = 9983651, uint32_t _b = 95746118);
 
-		void reset()
-		{
-			m_a = 9983651;
-			m_b = 95746118;
-		}
+		///
+		void reset(uint32_t _a = 9983651, uint32_t _b = 95746118);
 
-		uint32_t gen()
-		{
-			m_b = m_a+m_b;
-			m_a = m_b-m_a;
-			return m_a;
-		}
+		///
+		uint32_t gen();
 
 	private:
 		uint32_t m_a;
 		uint32_t m_b;
 	};
 
-	// George Marsaglia's SHR3
+	/// George Marsaglia's SHR3
 	class RngShr3
 	{
 	public:
-		RngShr3(uint32_t _jsr = 34221)
-			: m_jsr(_jsr)
-		{
-		}
+		///
+		RngShr3(uint32_t _jsr = 34221);
 
-		void reset(uint32_t _jsr = 34221)
-		{
-			m_jsr = _jsr;
-		}
+		///
+		void reset(uint32_t _jsr = 34221);
 
-		uint32_t gen()
-		{
-			m_jsr ^= m_jsr<<17;
-			m_jsr ^= m_jsr>>13;
-			m_jsr ^= m_jsr<<5;
-			return m_jsr;
-		}
+		///
+		uint32_t gen();
 
 	private:
 		uint32_t m_jsr;
@@ -96,112 +67,35 @@ namespace bx
 
 	/// Returns random number between 0.0f and 1.0f.
 	template <typename Rng>
-	inline float frnd(Rng* _rng)
-	{
-		uint32_t rnd = _rng->gen() & UINT16_MAX;
-		return float(rnd) * 1.0f/float(UINT16_MAX);
-	}
+	float frnd(Rng* _rng);
 
 	/// Returns random number between -1.0f and 1.0f.
 	template <typename Rng>
-	inline float frndh(Rng* _rng)
-	{
-		return 2.0f * bx::frnd(_rng) - 1.0f;
-	}
+	float frndh(Rng* _rng);
 
 	/// Generate random point on unit circle.
 	template <typename Rng>
-	inline void randUnitCircle(float _result[3], Rng* _rng)
-	{
-		const float angle = frnd(_rng) * pi * 2.0f;
-
-		_result[0] = fcos(angle);
-		_result[1] = 0.0f;
-		_result[2] = fsin(angle);
-	}
+	void randUnitCircle(float _result[3], Rng* _rng);
 
 	/// Generate random point on unit sphere.
 	template <typename Rng>
-	inline void randUnitSphere(float _result[3], Rng* _rng)
-	{
-		const float rand0  = frnd(_rng) * 2.0f - 1.0f;
-		const float rand1  = frnd(_rng) * pi * 2.0f;
-		const float sqrtf1 = fsqrt(1.0f - rand0*rand0);
-
-		_result[0] = sqrtf1 * fcos(rand1);
-		_result[1] = sqrtf1 * fsin(rand1);
-		_result[2] = rand0;
-	}
+	void randUnitSphere(float _result[3], Rng* _rng);
 
 	/// Generate random point on unit hemisphere.
 	template <typename Ty>
-	inline void randUnitHemisphere(float _result[3], Ty* _rng, const float _normal[3])
-	{
-		float dir[3];
-		randUnitSphere(dir, _rng);
-
-		float DdotN = dir[0]*_normal[0]
-					+ dir[1]*_normal[1]
-					+ dir[2]*_normal[2]
-					;
-
-		if (0.0f > DdotN)
-		{
-			dir[0] = -dir[0];
-			dir[1] = -dir[1];
-			dir[2] = -dir[2];
-		}
-
-		_result[0] = dir[0];
-		_result[1] = dir[1];
-		_result[2] = dir[2];
-	}
+	void randUnitHemisphere(float _result[3], Ty* _rng, const float _normal[3]);
 
 	/// Sampling with Hammersley and Halton Points
 	/// http://www.cse.cuhk.edu.hk/~ttwong/papers/udpoint/udpoints.html
 	///
-	inline void generateSphereHammersley(void* _data, uint32_t _stride, uint32_t _num, float _scale = 1.0f)
-	{
-		uint8_t* data = (uint8_t*)_data;
-
-		for (uint32_t ii = 0; ii < _num; ii++)
-		{
-			float tt = 0.0f;
-			float pp = 0.5;
-			for (uint32_t jj = ii; jj; jj >>= 1)
-			{
-				tt += (jj & 1) ? pp : 0.0f;
-				pp *= 0.5f;
-			}
-
-			tt = 2.0f * tt - 1.0f;
-
-			const float phi    = (ii + 0.5f) / _num;
-			const float phirad =  phi * 2.0f * pi;
-			const float st     = fsqrt(1.0f-tt*tt) * _scale;
-
-			float* xyz = (float*)data;
-			data += _stride;
-
-			xyz[0] = st * fcos(phirad);
-			xyz[1] = st * fsin(phirad);
-			xyz[2] = tt * _scale;
-		}
-	}
+	void generateSphereHammersley(void* _data, uint32_t _stride, uint32_t _num, float _scale = 1.0f);
 
 	/// Fisher-Yates shuffle.
 	template<typename Rng, typename Ty>
-	inline void shuffle(Rng* _rng, Ty* _array, uint32_t _num)
-	{
-		BX_CHECK(_num != 0, "Number of elements can't be 0!");
-
-		for (uint32_t ii = 0, num = _num-1; ii < num; ++ii)
-		{
-			uint32_t jj = ii + 1 + _rng->gen() % (num - ii);
-			bx::xchg(_array[ii], _array[jj]);
-		}
-	}
+	void shuffle(Rng* _rng, Ty* _array, uint32_t _num);
 
 } // namespace bx
 
+#include "rng.inl"
+
 #endif // BX_RNG_H_HEADER_GUARD
diff --git a/include/bx/rng.inl b/include/bx/rng.inl
new file mode 100644
index 0000000..02a4d21
--- /dev/null
+++ b/include/bx/rng.inl
@@ -0,0 +1,171 @@
+/*
+ * Copyright 2010-2017 Branimir Karadzic. All rights reserved.
+ * License: https://github.com/bkaradzic/bx#license-bsd-2-clause
+ */
+
+#ifndef BX_RNG_H_HEADER_GUARD
+#	error "Must be included from bx/rng.h!"
+#endif // BX_RNG_H_HEADER_GUARD
+
+#include "bx.h"
+#include "fpumath.h"
+#include "uint32_t.h"
+
+namespace bx
+{
+	inline RngMwc::RngMwc(uint32_t _z, uint32_t _w)
+		: m_z(_z)
+		, m_w(_w)
+	{
+	}
+
+	inline void RngMwc::reset(uint32_t _z, uint32_t _w)
+	{
+		m_z = _z;
+		m_w = _w;
+	}
+
+	inline uint32_t RngMwc::gen()
+	{
+		m_z = 36969*(m_z&65535)+(m_z>>16);
+		m_w = 18000*(m_w&65535)+(m_w>>16);
+		return (m_z<<16)+m_w;
+	}
+
+	inline RngFib::RngFib(uint32_t _a, uint32_t _b)
+		: m_a(_a)
+		, m_b(_b)
+	{
+	}
+
+	inline void RngFib::reset(uint32_t _a, uint32_t _b)
+	{
+		m_a = _a;
+		m_b = _b;
+	}
+
+	inline uint32_t RngFib::gen()
+	{
+		m_b = m_a+m_b;
+		m_a = m_b-m_a;
+		return m_a;
+	}
+
+	inline RngShr3::RngShr3(uint32_t _jsr)
+		: m_jsr(_jsr)
+	{
+	}
+
+	inline void RngShr3::reset(uint32_t _jsr)
+	{
+		m_jsr = _jsr;
+	}
+
+	inline uint32_t RngShr3::gen()
+	{
+		m_jsr ^= m_jsr<<17;
+		m_jsr ^= m_jsr>>13;
+		m_jsr ^= m_jsr<<5;
+		return m_jsr;
+	}
+
+	template <typename Rng>
+	inline float frnd(Rng* _rng)
+	{
+		uint32_t rnd = _rng->gen() & UINT16_MAX;
+		return float(rnd) * 1.0f/float(UINT16_MAX);
+	}
+
+	template <typename Rng>
+	inline float frndh(Rng* _rng)
+	{
+		return 2.0f * bx::frnd(_rng) - 1.0f;
+	}
+
+	template <typename Rng>
+	inline void randUnitCircle(float _result[3], Rng* _rng)
+	{
+		const float angle = frnd(_rng) * pi * 2.0f;
+
+		_result[0] = fcos(angle);
+		_result[1] = 0.0f;
+		_result[2] = fsin(angle);
+	}
+
+	template <typename Rng>
+	inline void randUnitSphere(float _result[3], Rng* _rng)
+	{
+		const float rand0  = frnd(_rng) * 2.0f - 1.0f;
+		const float rand1  = frnd(_rng) * pi * 2.0f;
+		const float sqrtf1 = fsqrt(1.0f - rand0*rand0);
+
+		_result[0] = sqrtf1 * fcos(rand1);
+		_result[1] = sqrtf1 * fsin(rand1);
+		_result[2] = rand0;
+	}
+
+	template <typename Ty>
+	inline void randUnitHemisphere(float _result[3], Ty* _rng, const float _normal[3])
+	{
+		float dir[3];
+		randUnitSphere(dir, _rng);
+
+		float DdotN = dir[0]*_normal[0]
+					+ dir[1]*_normal[1]
+					+ dir[2]*_normal[2]
+					;
+
+		if (0.0f > DdotN)
+		{
+			dir[0] = -dir[0];
+			dir[1] = -dir[1];
+			dir[2] = -dir[2];
+		}
+
+		_result[0] = dir[0];
+		_result[1] = dir[1];
+		_result[2] = dir[2];
+	}
+
+	inline void generateSphereHammersley(void* _data, uint32_t _stride, uint32_t _num, float _scale)
+	{
+		uint8_t* data = (uint8_t*)_data;
+
+		for (uint32_t ii = 0; ii < _num; ii++)
+		{
+			float tt = 0.0f;
+			float pp = 0.5;
+			for (uint32_t jj = ii; jj; jj >>= 1)
+			{
+				tt += (jj & 1) ? pp : 0.0f;
+				pp *= 0.5f;
+			}
+
+			tt = 2.0f * tt - 1.0f;
+
+			const float phi    = (ii + 0.5f) / _num;
+			const float phirad =  phi * 2.0f * pi;
+			const float st     = fsqrt(1.0f-tt*tt) * _scale;
+
+			float* xyz = (float*)data;
+			data += _stride;
+
+			xyz[0] = st * fcos(phirad);
+			xyz[1] = st * fsin(phirad);
+			xyz[2] = tt * _scale;
+		}
+	}
+
+	template<typename Rng, typename Ty>
+	inline void shuffle(Rng* _rng, Ty* _array, uint32_t _num)
+	{
+		BX_CHECK(_num != 0, "Number of elements can't be 0!");
+
+		for (uint32_t ii = 0, num = _num-1; ii < num; ++ii)
+		{
+			uint32_t jj = ii + 1 + _rng->gen() % (num - ii);
+			bx::xchg(_array[ii], _array[jj]);
+		}
+	}
+
+} // namespace bx