From e474666a5564aaef68649ba35aee9e486aa149e9 Mon Sep 17 00:00:00 2001
From: KostasAAA <kostas.anagnostou@gmail.com>
Date: Sun, 4 Mar 2018 23:02:42 +0000
Subject: [PATCH] Initial commit of gpu occlusion culling with multidraw
 indirect example (#1344)

---
 .../37-gpudrivenrendering/cs_downscaleHiZ.sc  |   44 +
 .../37-gpudrivenrendering/cs_occludeProps.sc  |  106 ++
 .../cs_streamCompaction.sc                    |  122 ++
 .../fs_instancedIndirectRendering.sc          |   24 +
 .../fs_renderOcclusion.sc                     |   13 +
 .../gpudrivenrendering.cpp                    | 1229 +++++++++++++++++
 examples/37-gpudrivenrendering/makefile       |   10 +
 examples/37-gpudrivenrendering/varying.def.sc |    7 +
 .../varying_pos_tex0.def.sc                   |    4 +
 .../vs_instancedIndirectRendering.sc          |   24 +
 .../vs_renderOcclusion.sc                     |   20 +
 11 files changed, 1603 insertions(+)
 create mode 100644 examples/37-gpudrivenrendering/cs_downscaleHiZ.sc
 create mode 100644 examples/37-gpudrivenrendering/cs_occludeProps.sc
 create mode 100644 examples/37-gpudrivenrendering/cs_streamCompaction.sc
 create mode 100644 examples/37-gpudrivenrendering/fs_instancedIndirectRendering.sc
 create mode 100644 examples/37-gpudrivenrendering/fs_renderOcclusion.sc
 create mode 100644 examples/37-gpudrivenrendering/gpudrivenrendering.cpp
 create mode 100644 examples/37-gpudrivenrendering/makefile
 create mode 100644 examples/37-gpudrivenrendering/varying.def.sc
 create mode 100644 examples/37-gpudrivenrendering/varying_pos_tex0.def.sc
 create mode 100644 examples/37-gpudrivenrendering/vs_instancedIndirectRendering.sc
 create mode 100644 examples/37-gpudrivenrendering/vs_renderOcclusion.sc

diff --git a/examples/37-gpudrivenrendering/cs_downscaleHiZ.sc b/examples/37-gpudrivenrendering/cs_downscaleHiZ.sc
new file mode 100644
index 000000000..06d42efc4
--- /dev/null
+++ b/examples/37-gpudrivenrendering/cs_downscaleHiZ.sc
@@ -0,0 +1,44 @@
+
+/*
+ * Copyright 2018 Kostas Anagnostou. All rights reserved.
+ * License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
+ */
+
+#include "bgfx_compute.sh"
+
+IMAGE2D_RO(s_texOcclusionDepthIn, r32f, 0);
+IMAGE2D_WR(s_texOcclusionDepthOut, r32f, 1);
+
+uniform vec4 u_inputRTSize;
+
+NUM_THREADS(16, 16, 1)
+void main()
+{
+	// Writes one texel of the output image: either a 1:1 copy of the input texel or the max of a 2x2 input footprint.
+	// u_inputRTSize.xy bounds the texels written; u_inputRTSize.zw scales the output coordinate into the input image.
+	ivec2 coord = gl_GlobalInvocationID.xy;
+	// threads that fall outside the u_inputRTSize.xy bounds do nothing
+	if (all(coord.xy < u_inputRTSize.xy))
+	{	
+		float maxDepth = 1.0;
+		// u_inputRTSize.z > 1 selects the downscale path, anything else the plain copy
+		if ( u_inputRTSize.z > 1)
+		{
+			vec4 depths = vec4( imageLoad(s_texOcclusionDepthIn, u_inputRTSize.zw * coord.xy ).r,
+								imageLoad(s_texOcclusionDepthIn, u_inputRTSize.zw * coord.xy + ivec2(1,0) ).r,
+								imageLoad(s_texOcclusionDepthIn, u_inputRTSize.zw * coord.xy + ivec2(0,1)).r,
+								imageLoad(s_texOcclusionDepthIn, u_inputRTSize.zw * coord.xy + ivec2(1,1)).r
+								);
+
+			//keep the largest of the four depths
+			maxDepth = max(max(depths.x, depths.y), max(depths.z, depths.w));
+		}
+		else
+		{
+			//do not downscale, just copy the value over to the output rendertarget
+			maxDepth = imageLoad(s_texOcclusionDepthIn, coord.xy ).r;
+		}
+			
+		imageStore(s_texOcclusionDepthOut, coord, vec4(maxDepth,0,0,1) );
+	}
+}
\ No newline at end of file
diff --git a/examples/37-gpudrivenrendering/cs_occludeProps.sc b/examples/37-gpudrivenrendering/cs_occludeProps.sc
new file mode 100644
index 000000000..b9fa40171
--- /dev/null
+++ b/examples/37-gpudrivenrendering/cs_occludeProps.sc
@@ -0,0 +1,106 @@
+
+/*
+ * Copyright 2018 Kostas Anagnostou. All rights reserved.
+ * License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
+ */
+
+#include "bgfx_compute.sh"
+
+SAMPLER2D(s_texOcclusionDepth, 0);
+
+BUFFER_RO(instanceDataIn, vec4, 1);
+BUFFER_RW(drawcallInstanceCount, uint, 2);
+BUFFER_WR(instancePredicates, bool, 3);
+
+uniform vec4 u_inputRTSize;
+uniform vec4 u_cullingConfig;
+ 
+NUM_THREADS(64, 1, 1)
+void main()
+{
+	bool predicate = false;
+	
+	//make sure that we are not processing more instances than available (count is in u_cullingConfig.x)
+	if (gl_GlobalInvocationID.x < (int)u_cullingConfig.x)
+	{
+		//get the bounding box for this instance (two vec4s per instance: min then max)
+		vec4 bboxMin = instanceDataIn[2 * gl_GlobalInvocationID.x] ;
+		vec3 bboxMax = instanceDataIn[2 * gl_GlobalInvocationID.x + 1].xyz;
+		// the drawcall ID is read from the w component of the box minimum
+		int drawcallID = bboxMin.w;
+	
+		//Adapted from http://blog.selfshadow.com/publications/practical-visibility/
+		vec3 bboxSize = bboxMax.xyz - bboxMin.xyz;
+
+		vec3 boxCorners[] = { 	bboxMin.xyz,
+								bboxMin.xyz + vec3(bboxSize.x,0,0),
+								bboxMin.xyz + vec3(0, bboxSize.y,0),
+								bboxMin.xyz + vec3(0, 0, bboxSize.z),
+								bboxMin.xyz + vec3(bboxSize.xy,0),
+								bboxMin.xyz + vec3(0, bboxSize.yz),
+								bboxMin.xyz + vec3(bboxSize.x, 0, bboxSize.z),
+								bboxMin.xyz + bboxSize.xyz
+							 };
+		float minZ = 1;
+		vec2 minXY = vec2(1,1);
+		vec2 maxXY = vec2(0,0);
+
+		[unroll]
+		for (int i = 0; i < 8; i++)
+		{
+			//project the world space box corner with the view-projection matrix
+			vec4 clipPos = mul( u_viewProj, vec4(boxCorners[i], 1) );
+			// keep clip-space z non-negative before the perspective divide
+			clipPos.z = max(clipPos.z, 0);
+
+			clipPos.xyz = clipPos.xyz / clipPos.w;
+			// clamp to the [-1,1] square, then remap to [0,1] UV space with y flipped
+			clipPos.xy = clamp(clipPos.xy, -1, 1);
+			clipPos.xy = clipPos.xy * vec2(0.5, -0.5) + vec2(0.5, 0.5);
+
+			minXY = min(clipPos.xy, minXY);
+			maxXY = max(clipPos.xy, maxXY);
+
+			minZ = saturate(min(minZ, clipPos.z));		
+		}
+
+		vec4 boxUVs = vec4(minXY, maxXY);
+
+		// Calculate hi-Z buffer mip from the box extent in texels (UV extent * u_inputRTSize.xy)
+		ivec2 size = (maxXY - minXY) * u_inputRTSize.xy;
+		float mip = ceil(log2(max(size.x, size.y)));
+
+		mip = clamp(mip, 0, u_cullingConfig.z);
+
+		// Texel footprint for the lower (finer-grained) level
+		float  level_lower = max(mip - 1, 0);
+		vec2 scale = exp2(-level_lower);
+		vec2 a = floor(boxUVs.xy*scale);
+		vec2 b = ceil(boxUVs.zw*scale);
+		vec2 dims = b - a;
+		// NOTE(review): boxUVs is in [0,1] UV space and scale <= 1, so dims never exceeds 1 and the test below always passes; the referenced technique appears to work in texel space - confirm.
+		// Use the lower level if we only touch <= 2 texels in both dimensions
+		if (dims.x <= 2 && dims.y <= 2)
+			mip = level_lower;
+
+		//load depths from the hi-Z buffer at the four corners of the screen-space rectangle
+		vec4 depth = { 	texture2DLod(s_texOcclusionDepth, boxUVs.xy, mip).x,
+						texture2DLod(s_texOcclusionDepth, boxUVs.zy, mip).x,
+						texture2DLod(s_texOcclusionDepth, boxUVs.xw, mip).x,
+						texture2DLod(s_texOcclusionDepth, boxUVs.zw, mip).x,
+					};		
+		
+		//find the max depth
+		float maxDepth = max( max(depth.x, depth.y), max(depth.z, depth.w) );
+		// the instance is kept when its nearest depth is not behind the farthest sampled occluder depth
+		if ( minZ <= maxDepth )
+		{
+			predicate = true;
+
+			//increase instance count for this particular prop type
+			InterlockedAdd( drawcallInstanceCount[ drawcallID ], 1);			
+		}
+	}
+	// written by every thread, including those past the instance count (their predicate stays false)
+	instancePredicates[gl_GlobalInvocationID.x] = predicate;
+}
diff --git a/examples/37-gpudrivenrendering/cs_streamCompaction.sc b/examples/37-gpudrivenrendering/cs_streamCompaction.sc
new file mode 100644
index 000000000..b1791cca1
--- /dev/null
+++ b/examples/37-gpudrivenrendering/cs_streamCompaction.sc
@@ -0,0 +1,122 @@
+
+/*
+ * Copyright 2018 Kostas Anagnostou. All rights reserved.
+ * License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
+ */
+
+#include "bgfx_compute.sh"
+
+//the per drawcall data that is constant (noof indices and offsets to vertex/index buffers)
+BUFFER_RO(drawcallConstData, uint, 0);
+//instance data for all instances (pre culling)
+BUFFER_RO(instanceDataIn, vec4, 1);
+//per instance visibility (output of culling pass)
+BUFFER_RO(instancePredicates, bool, 2);
+
+//how many instances per drawcall
+BUFFER_RW(drawcallInstanceCount, uint, 3);
+//drawcall data that will drive drawIndirect
+BUFFER_RW(drawcallData, uvec4, 4);
+//culled instance data
+BUFFER_WR(instanceDataOut, vec4, 5);
+
+uniform vec4 u_cullingConfig;
+ 
+// Based on Parallel Prefix Sum (Scan) with CUDA by Mark Harris
+groupshared uint temp[2048];
+
+NUM_THREADS(1024, 1, 1)
+void main()
+{
+	int tID = gl_GlobalInvocationID.x;
+	int NoofInstancesPowOf2 = u_cullingConfig.y;
+	int NoofDrawcalls = u_cullingConfig.w;
+	// NOTE(review): shared memory is indexed with the global invocation ID, which presumably relies on a single workgroup being dispatched - confirm against the dispatch call.
+	int offset = 1;
+	temp[2 * tID] = instancePredicates[2 * tID]; // each thread loads two predicates into shared memory
+	temp[2 * tID + 1] = instancePredicates[2 * tID + 1];
+
+	int d;
+		
+	//perform reduction (up-sweep): partial sums are accumulated in place
+	for (d = NoofInstancesPowOf2 >> 1; d > 0; d >>= 1) 
+	{
+		GroupMemoryBarrierWithGroupSync();
+
+		if (tID < d)
+		{
+			int ai = offset * (2 * tID + 1) - 1;
+			int bi = offset * (2 * tID + 2) - 1;
+			temp[bi] += temp[ai];
+		}
+		offset *= 2;
+	}
+
+	// clear the last element before the down-sweep
+	if (tID == 0)
+		temp[NoofInstancesPowOf2 - 1] = 0;
+
+	//perform downsweep and build scan
+	for ( d = 1; d < NoofInstancesPowOf2; d *= 2)
+	{
+		offset >>= 1;
+
+		GroupMemoryBarrierWithGroupSync();
+
+		if (tID < d)
+		{
+			int ai = offset * (2 * tID + 1) - 1;
+			int bi = offset * (2 * tID + 2) - 1;
+			int t = temp[ai];
+			temp[ai] = temp[bi];
+			temp[bi] += t;
+		}
+	}
+
+	GroupMemoryBarrierWithGroupSync();
+	// from here on temp[i] is used as the output slot of instance i
+	int index = 2 * tID;
+
+	//scatter results: every visible instance copies four consecutive vec4s to its output slot
+	if (instancePredicates[index] != 0)
+	{	
+		instanceDataOut[ 4 * temp[index] ] = instanceDataIn[ 4 * index ];
+		instanceDataOut[ 4 * temp[index] + 1 ] = instanceDataIn[ 4 * index + 1 ];
+		instanceDataOut[ 4 * temp[index] + 2 ] = instanceDataIn[ 4 * index + 2 ];
+		instanceDataOut[ 4 * temp[index] + 3 ] = instanceDataIn[ 4 * index + 3 ];
+	}
+
+	index = 2 * tID + 1;
+
+	if (instancePredicates[index] != 0)
+	{
+		instanceDataOut[ 4 * temp[index] ] = instanceDataIn[ 4 * index ];			
+		instanceDataOut[ 4 * temp[index] + 1 ] = instanceDataIn[ 4 * index + 1 ];
+		instanceDataOut[ 4 * temp[index] + 2 ] = instanceDataIn[ 4 * index + 2 ];
+		instanceDataOut[ 4 * temp[index] + 3 ] = instanceDataIn[ 4 * index + 3 ];	
+	}
+  
+	if (tID == 0)
+	{
+		uint startInstance = 0;
+		
+		//copy data to indirect buffer, could possibly be done in a different compute shader
+		for (int k = 0; k < NoofDrawcalls; k++)
+		{				
+			drawIndexedIndirect(
+				drawcallData, 
+				k, 
+				drawcallConstData[ k * 3 ], 			//number of indices
+				drawcallInstanceCount[k], 				//number of instances
+				drawcallConstData[ k * 3 + 1 ],			//offset into the index buffer 
+				drawcallConstData[ k * 3 + 2 ],			//offset into the vertex buffer 
+				startInstance							//offset into the instance buffer
+				);
+
+			startInstance += drawcallInstanceCount[k];
+			// reset the per-drawcall counter once it has been consumed
+			drawcallInstanceCount[k] = 0;
+		}
+	}
+ 
+}
\ No newline at end of file
diff --git a/examples/37-gpudrivenrendering/fs_instancedIndirectRendering.sc b/examples/37-gpudrivenrendering/fs_instancedIndirectRendering.sc
new file mode 100644
index 000000000..08033ac2f
--- /dev/null
+++ b/examples/37-gpudrivenrendering/fs_instancedIndirectRendering.sc
@@ -0,0 +1,24 @@
+$input v_materialID
+
+/*
+ * Copyright 2018 Kostas Anagnostou. All rights reserved.
+ * License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
+ */
+
+#include "../common/common.sh"
+
+uniform vec4 u_colour[50];
+
+void main()
+{
+	vec4 colour = u_colour[v_materialID.x];
+	// any alpha below 1 is rendered as a fixed checkerboard discard pattern, whatever its value
+	if ( colour.w < 1.0f )
+	{
+		//render dithered alpha: drop fragments whose x and y coordinates have the same value modulo 2
+		if ( (gl_FragCoord.x % 2) == (gl_FragCoord.y % 2) )
+			discard;
+	}
+	// surviving fragments are always written fully opaque
+	gl_FragColor = vec4( colour.xyz,1 );
+}
diff --git a/examples/37-gpudrivenrendering/fs_renderOcclusion.sc b/examples/37-gpudrivenrendering/fs_renderOcclusion.sc
new file mode 100644
index 000000000..0620d2454
--- /dev/null
+++ b/examples/37-gpudrivenrendering/fs_renderOcclusion.sc
@@ -0,0 +1,13 @@
+
+/*
+ * Copyright 2018 Kostas Anagnostou. All rights reserved.
+ * License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
+ */
+
+#include "../common/common.sh"
+
+//dummy fragment shader for the occlusion buffer pass (outputs a constant zero colour) until bgfx supports rendering with a null shader
+void main()
+{
+	gl_FragColor = vec4(0, 0, 0, 0);
+}
diff --git a/examples/37-gpudrivenrendering/gpudrivenrendering.cpp b/examples/37-gpudrivenrendering/gpudrivenrendering.cpp
new file mode 100644
index 000000000..ab746810f
--- /dev/null
+++ b/examples/37-gpudrivenrendering/gpudrivenrendering.cpp
@@ -0,0 +1,1229 @@
+/*
+ * Copyright 2018 Kostas Anagnostou. All rights reserved.
+ * License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
+ */
+
+#include "common.h"
+#include "bgfx_utils.h"
+#include "imgui/imgui.h"
+#include <time.h>
+
+namespace
+{
+
+#define RENDER_PASS_HIZ_ID						0 // view IDs, one per pass; this one is cleared as depth-only in init()
+#define RENDER_PASS_HIZ_DOWNSCALE_ID			1
+#define RENDER_PASS_OCCLUDE_PROPS_ID			2
+#define RENDER_PASS_COMPACT_STREAM_ID			3
+#define RENDER_PASS_MAIN_ID						4 // cleared as colour + depth in init()
+
+struct Camera // Orbiting camera whose position and target are smoothed from curr towards dest.
+{
+	Camera()
+	{
+		reset();
+	}
+	// Snap the target to the origin and the position to (55, 20, 65); clear pending orbit input.
+	void reset()
+	{
+		m_target.curr[0] = 0.0f;
+		m_target.curr[1] = 0.0f;
+		m_target.curr[2] = 0.0f;
+		m_target.dest[0] = 0.0f;
+		m_target.dest[1] = 0.0f;
+		m_target.dest[2] = 0.0f;
+
+		m_pos.curr[0] = 55.0f;
+		m_pos.curr[1] = 20.0f;
+		m_pos.curr[2] = 65.0f;
+		m_pos.dest[0] = 55.0f;
+		m_pos.dest[1] = 20.0f;
+		m_pos.dest[2] = 65.0f;
+
+		m_orbit[0] = 0.0f;
+		m_orbit[1] = 0.0f;
+	}
+	// Build a look-at view matrix from the current (smoothed) position and target.
+	void mtxLookAt(float* _outViewMtx)
+	{
+		bx::mtxLookAt(_outViewMtx, m_pos.curr, m_target.curr);
+	}
+	// Accumulate orbit input; it is applied gradually by consumeOrbit().
+	void orbit(float _dx, float _dy)
+	{
+		m_orbit[0] += _dx;
+		m_orbit[1] += _dy;
+	}
+	// Move the destination position along the direction to the target by _dz times the current distance, subject to the cnear/cfar checks below.
+	void dolly(float _dz)
+	{
+		const float cnear = 1.0f;
+		const float cfar = 100.0f;
+
+		const float toTarget[3] =
+		{
+			m_target.dest[0] - m_pos.dest[0],
+			m_target.dest[1] - m_pos.dest[1],
+			m_target.dest[2] - m_pos.dest[2],
+		};
+		const float toTargetLen = bx::vec3Length(toTarget);
+		const float invToTargetLen = 1.0f / (toTargetLen + FLT_MIN); // FLT_MIN avoids a division by zero
+		const float toTargetNorm[3] =
+		{
+			toTarget[0] * invToTargetLen,
+			toTarget[1] * invToTargetLen,
+			toTarget[2] * invToTargetLen,
+		};
+
+		float delta = toTargetLen*_dz;
+		float newLen = toTargetLen + delta;
+		if ((cnear < newLen || _dz < 0.0f)
+			&& (newLen < cfar || _dz > 0.0f))
+		{
+			m_pos.dest[0] += toTargetNorm[0] * delta;
+			m_pos.dest[1] += toTargetNorm[1] * delta;
+			m_pos.dest[2] += toTargetNorm[2] * delta;
+		}
+	}
+	// Apply the fraction _amount of the pending orbit as a lat/long rotation of the position around the target.
+	void consumeOrbit(float _amount)
+	{
+		float consume[2];
+		consume[0] = m_orbit[0] * _amount;
+		consume[1] = m_orbit[1] * _amount;
+		m_orbit[0] -= consume[0];
+		m_orbit[1] -= consume[1];
+
+		const float toPos[3] =
+		{
+			m_pos.curr[0] - m_target.curr[0],
+			m_pos.curr[1] - m_target.curr[1],
+			m_pos.curr[2] - m_target.curr[2],
+		};
+		const float toPosLen = bx::vec3Length(toPos);
+		const float invToPosLen = 1.0f / (toPosLen + FLT_MIN);
+		const float toPosNorm[3] =
+		{
+			toPos[0] * invToPosLen,
+			toPos[1] * invToPosLen,
+			toPos[2] * invToPosLen,
+		};
+
+		float ll[2];
+		latLongFromVec(ll[0], ll[1], toPosNorm);
+		ll[0] += consume[0];
+		ll[1] -= consume[1];
+		ll[1] = bx::clamp(ll[1], 0.02f, 0.98f); // keep the latitude away from both poles
+
+		float tmp[3];
+		vecFromLatLong(tmp, ll[0], ll[1]);
+
+		float diff[3];
+		diff[0] = (tmp[0] - toPosNorm[0])*toPosLen;
+		diff[1] = (tmp[1] - toPosNorm[1])*toPosLen;
+		diff[2] = (tmp[2] - toPosNorm[2])*toPosLen;
+		// shift both curr and dest so the smoothing in update() does not undo the rotation
+		m_pos.curr[0] += diff[0];
+		m_pos.curr[1] += diff[1];
+		m_pos.curr[2] += diff[2];
+		m_pos.dest[0] += diff[0];
+		m_pos.dest[1] += diff[1];
+		m_pos.dest[2] += diff[2];
+	}
+	// Advance the smoothing by _dt: consume orbit input and lerp curr towards dest (blend factor _dt / 0.12, capped at 1).
+	void update(float _dt)
+	{
+		const float amount = bx::min(_dt / 0.12f, 1.0f);
+
+		consumeOrbit(amount);
+
+		m_target.curr[0] = bx::lerp(m_target.curr[0], m_target.dest[0], amount);
+		m_target.curr[1] = bx::lerp(m_target.curr[1], m_target.dest[1], amount);
+		m_target.curr[2] = bx::lerp(m_target.curr[2], m_target.dest[2], amount);
+		m_pos.curr[0] = bx::lerp(m_pos.curr[0], m_pos.dest[0], amount);
+		m_pos.curr[1] = bx::lerp(m_pos.curr[1], m_pos.dest[1], amount);
+		m_pos.curr[2] = bx::lerp(m_pos.curr[2], m_pos.dest[2], amount);
+	}
+	// Write a rotation-only 4x4 matrix into _mtx: right in elements 0-2, up in 4-6, view direction in 8-10, no translation.
+	void envViewMtx(float* _mtx)
+	{
+		const float toTarget[3] =
+		{
+			m_target.curr[0] - m_pos.curr[0],
+			m_target.curr[1] - m_pos.curr[1],
+			m_target.curr[2] - m_pos.curr[2],
+		};
+
+		const float toTargetLen = bx::vec3Length(toTarget);
+		const float invToTargetLen = 1.0f / (toTargetLen + FLT_MIN);
+		const float toTargetNorm[3] =
+		{
+			toTarget[0] * invToTargetLen,
+			toTarget[1] * invToTargetLen,
+			toTarget[2] * invToTargetLen,
+		};
+
+		float tmp[3];
+		const float fakeUp[3] = { 0.0f, 1.0f, 0.0f };
+
+		float right[3];
+		bx::vec3Cross(tmp, fakeUp, toTargetNorm);
+		bx::vec3Norm(right, tmp);
+
+		float up[3];
+		bx::vec3Cross(tmp, toTargetNorm, right);
+		bx::vec3Norm(up, tmp);
+
+		_mtx[0] = right[0];
+		_mtx[1] = right[1];
+		_mtx[2] = right[2];
+		_mtx[3] = 0.0f;
+		_mtx[4] = up[0];
+		_mtx[5] = up[1];
+		_mtx[6] = up[2];
+		_mtx[7] = 0.0f;
+		_mtx[8] = toTargetNorm[0];
+		_mtx[9] = toTargetNorm[1];
+		_mtx[10] = toTargetNorm[2];
+		_mtx[11] = 0.0f;
+		_mtx[12] = 0.0f;
+		_mtx[13] = 0.0f;
+		_mtx[14] = 0.0f;
+		_mtx[15] = 1.0f;
+	}
+	// Convert lat/long parameters to a direction vector: _u is scaled by 2*pi (longitude), _v by pi (latitude).
+	static inline void vecFromLatLong(float _vec[3], float _u, float _v)
+	{
+		const float phi = _u * 2.0f*bx::kPi;
+		const float theta = _v * bx::kPi;
+
+		const float st = bx::sin(theta);
+		const float sp = bx::sin(phi);
+		const float ct = bx::cos(theta);
+		const float cp = bx::cos(phi);
+
+		_vec[0] = -st*sp;
+		_vec[1] = ct;
+		_vec[2] = -st*cp;
+	}
+	// Counterpart of vecFromLatLong: recover the (_u, _v) parameters from a unit direction vector.
+	static inline void latLongFromVec(float& _u, float& _v, const float _vec[3])
+	{
+		const float phi = bx::atan2(_vec[0], _vec[2]);
+		const float theta = bx::acos(_vec[1]);
+
+		_u = (bx::kPi + phi)*bx::kInvPi*0.5f;
+		_v = theta*bx::kInvPi;
+	}
+	// A 3D value that update() moves from curr towards dest.
+	struct Interp3f
+	{
+		float curr[3];
+		float dest[3];
+	};
+
+	Interp3f m_target;
+	Interp3f m_pos;
+	float m_orbit[2]; // pending orbit input not yet consumed by consumeOrbit()
+};
+
+struct Mouse // Tracks per-update mouse movement and scroll deltas.
+{
+	Mouse()
+		: m_dx(0.0f)
+		, m_dy(0.0f)
+		, m_prevMx(0.0f)
+		, m_prevMy(0.0f)
+		, m_scroll(0)
+		, m_scrollPrev(0)
+	{
+	}
+	// Compute deltas against the previous call; the x/y deltas are divided by _width/_height.
+	void update(float _mx, float _my, int32_t _mz, uint32_t _width, uint32_t _height)
+	{
+		const float widthf = float(int32_t(_width));
+		const float heightf = float(int32_t(_height));
+
+		// Delta movement.
+		m_dx = float(_mx - m_prevMx) / widthf;
+		m_dy = float(_my - m_prevMy) / heightf;
+
+		m_prevMx = _mx;
+		m_prevMy = _my;
+
+		// Scroll.
+		m_scroll = _mz - m_scrollPrev;
+		m_scrollPrev = _mz;
+	}
+
+	float m_dx; // Screen space.
+	float m_dy;
+	float m_prevMx;
+	float m_prevMy;
+	int32_t m_scroll; // scroll change since the previous update()
+	int32_t m_scrollPrev;
+};
+
+struct PosVertex // Position-only vertex (three floats).
+{
+	float m_x;
+	float m_y;
+	float m_z;
+	// Fill ms_decl with a single 3-float Position attribute; must run before ms_decl is used.
+	static void init()
+	{
+		ms_decl
+			.begin()
+			.add(bgfx::Attrib::Position, 3, bgfx::AttribType::Float)
+			.end();
+	};
+
+	static bgfx::VertexDecl ms_decl;
+};
+
+bgfx::VertexDecl PosVertex::ms_decl;
+
+static PosVertex s_cubeVertices[8] = // corners of a unit cube centred on the origin
+{
+	{-0.5f,  0.5f,  0.5f},
+	{ 0.5f,  0.5f,  0.5f},
+	{-0.5f, -0.5f,  0.5f},
+	{ 0.5f, -0.5f,  0.5f},
+	{-0.5f,  0.5f, -0.5f},
+	{ 0.5f,  0.5f, -0.5f},
+	{-0.5f, -0.5f, -0.5f},
+	{ 0.5f, -0.5f, -0.5f},
+};
+
+static const uint16_t s_cubeIndices[36] = // 12 triangles indexing s_cubeVertices; trailing numbers are triangle indices
+{
+	0, 1, 2, // 0
+	1, 3, 2,
+	4, 6, 5, // 2
+	5, 6, 7,
+	0, 2, 4, // 4
+	4, 2, 6,
+	1, 5, 3, // 6
+	5, 7, 3,
+	0, 4, 1, // 8
+	4, 5, 1,
+	2, 3, 6, // 10
+	6, 3, 7,
+};
+
+struct RenderPass // Bit flags naming the passes a prop can be assigned to.
+{
+	enum Enum
+	{
+		Occlusion = 1 << 0,
+		MainPass = 1 << 1,
+		All = Occlusion | MainPass // both flags set
+	};
+};
+
+// All the per-instance data we store: a 4x4 world matrix and the two opposite corners of the bounding box
+struct InstanceData
+{
+	float m_world[16];
+	float m_bboxMin[4];
+	float m_bboxMax[4];
+};
+
+//A description of each prop: its mesh (CPU arrays plus GPU buffers), its instances, its material and its passes
+struct Prop
+{
+	PosVertex*	m_vertices; // heap arrays, allocated in createCubeMesh() and init()
+	uint16_t*	m_indices;
+	InstanceData* m_instances;
+	bgfx::VertexBufferHandle m_vertexbufferHandle;
+	bgfx::IndexBufferHandle  m_indexbufferHandle;
+	uint16_t	m_noofVertices;
+	uint16_t	m_noofIndices;
+	uint16_t	m_noofInstances;
+	uint16_t	m_materialID; // index into the materials array
+	RenderPass::Enum m_renderPass;
+};
+
+//A simplistic material, comprised of a four-component colour only
+struct Material
+{
+	float m_colour[4];
+};
+
+//helper to make setting vectors a bit tidier: writes x, y, z into dest[0..2]
+inline void setVector3(float* dest, float x, float y, float z)
+{
+	dest[0] = x;
+	dest[1] = y;
+	dest[2] = z;
+}
+
+inline void setVector4(float* dest, float x, float y, float z, float w) // writes x, y, z, w into dest[0..3]
+{
+	dest[0] = x;
+	dest[1] = y;
+	dest[2] = z;
+	dest[3] = w;
+}
+
+//Sets up a prop's mesh as a copy of the unit cube and creates its vertex/index buffers. PosVertex::init() must have run first.
+void createCubeMesh(Prop& prop)
+{
+	prop.m_noofVertices = 8;
+	prop.m_noofIndices = 36;
+	prop.m_vertices = new PosVertex[prop.m_noofVertices];
+	prop.m_indices = new uint16_t[prop.m_noofIndices];
+	// private copies of the static cube data, owned by the prop (freed in shutdown())
+	memcpy(prop.m_vertices, s_cubeVertices, prop.m_noofVertices * PosVertex::ms_decl.getStride());
+	memcpy(prop.m_indices, s_cubeIndices, prop.m_noofIndices * sizeof(uint16_t));
+
+	prop.m_vertexbufferHandle = bgfx::createVertexBuffer(
+		bgfx::makeRef(prop.m_vertices, prop.m_noofVertices * PosVertex::ms_decl.getStride()),
+		PosVertex::ms_decl);
+
+	prop.m_indexbufferHandle = bgfx::createIndexBuffer(bgfx::makeRef(prop.m_indices, prop.m_noofIndices * sizeof(uint16_t)));
+}
+
+//returns a pseudo-random number between 0 and 1 (both inclusive), drawn from rand()
+float rand01()
+{
+	return rand() / (float)RAND_MAX;
+}
+
+class GPUDrivenRendering : public entry::AppI
+{
+public:
+	GPUDrivenRendering(const char* _name, const char* _description) // forwards both strings to entry::AppI
+		: entry::AppI(_name, _description)
+	{
+	}
+
+	void init(int32_t _argc, const char* const* _argv, uint32_t _width, uint32_t _height) override
+	{ // Initialises bgfx, builds the props and creates every GPU resource used by the occlusion and main passes.
+		Args args(_argc, _argv);
+
+		m_width  = _width;
+		m_height = _height;
+
+		//find the largest power of two dims less than or equal to the backbuffer size
+		m_hiZwidth = (uint32_t)pow(2, floor(log2(m_width)));
+		m_hiZheight = (uint32_t)pow(2, floor(log2(m_height)));
+
+		m_debug  = BGFX_DEBUG_TEXT;
+		m_reset  = BGFX_RESET_VSYNC;
+
+		bgfx::init(args.m_type, args.m_pciId);
+		bgfx::reset(m_width, m_height, m_reset);
+
+		// Enable debug text.
+		bgfx::setDebug(m_debug);
+
+		//create props
+		{
+			m_totalInstancesCount = 0;
+
+			// Create vertex stream declaration.
+			PosVertex::init();
+
+			m_noofProps = 0;
+
+			m_props = new Prop[s_maxNoofProps];
+
+			//first create space for some materials (one per prop at most)
+			m_materials = new Material[s_maxNoofProps];
+			m_noofMaterials = 0;
+
+			//add a ground plane
+			{
+				Prop& prop = m_props[m_noofProps++];
+
+				prop.m_renderPass = RenderPass::MainPass;
+
+				createCubeMesh(prop);
+
+				prop.m_noofInstances = 1;
+				prop.m_instances = new InstanceData[prop.m_noofInstances];
+
+				bx::mtxSRT(prop.m_instances->m_world
+					, 100.0f, 0.1f, 100.0f
+					, 0.0f, 0.0f, 0.0f
+					, 0.0f, 0.0f, 0.0f
+				);
+
+				float temp[4];
+				setVector4(temp, -0.5f, -0.5f, -0.5f, 1.0f);
+				bx::vec4MulMtx(prop.m_instances->m_bboxMin, temp, prop.m_instances->m_world);
+
+				setVector4(temp, 0.5f, 0.5f, 0.5f, 1.0f);
+				bx::vec4MulMtx(prop.m_instances->m_bboxMax, temp, prop.m_instances->m_world);
+
+				prop.m_materialID = m_noofMaterials;
+				setVector4(m_materials[prop.m_materialID].m_colour, 0.0f, 0.6f, 0.0f, 1.0f);
+				m_noofMaterials++;
+
+				m_totalInstancesCount += prop.m_noofInstances;
+			}
+
+			//add a few instances of the occluding mesh
+			{
+				Prop& prop = m_props[m_noofProps++];
+
+				prop.m_renderPass = RenderPass::All;
+
+				//create prop
+				createCubeMesh(prop);
+
+				//add a few instances of the wall mesh
+				prop.m_noofInstances = 25;
+				prop.m_instances = new InstanceData[prop.m_noofInstances];
+				for (int i = 0; i < prop.m_noofInstances; i++)
+				{
+					//calculate world position; the yaw is a random angle in [-60, 60] degrees converted to radians
+					bx::mtxSRT(prop.m_instances[i].m_world
+						, 40.0f, 10.0f, 0.1f
+						, 0.0f, ( rand01() * 120.0f - 60.0f) * bx::kPi / 180.0f, 0.0f
+						, rand01() * 100.0f - 50.0f, 5.0f, rand01() * 100.0f - 50.0f
+					);
+
+					//transform two opposite corners of the unit cube to world space to get the bounding box
+					float temp[4];
+					setVector4(temp, -0.5f, -0.5f, -0.5f, 1.0f);
+					bx::vec4MulMtx(prop.m_instances[i].m_bboxMin, temp, prop.m_instances[i].m_world );
+
+					setVector4(temp, 0.5f, 0.5f, 0.5f, 1.0f);
+					bx::vec4MulMtx(prop.m_instances[i].m_bboxMax, temp, prop.m_instances[i].m_world );
+				}
+
+				//set the material ID. Will be used in the shader to select the material
+				prop.m_materialID = m_noofMaterials;
+
+				//add a "material" for this prop; alpha below 1 makes the fragment shader render it dithered
+				setVector4(m_materials[prop.m_materialID].m_colour, 0.0f, 0.0f, 1.0f, 0.0f);
+				m_noofMaterials++;
+
+				m_totalInstancesCount += prop.m_noofInstances;
+			}
+
+			//add a few "regular" props
+			{
+				//add cubes
+				{
+					Prop& prop = m_props[m_noofProps++];
+
+					prop.m_renderPass = RenderPass::MainPass;
+
+					createCubeMesh(prop);
+
+					prop.m_noofInstances = 200;
+					prop.m_instances = new InstanceData[prop.m_noofInstances];
+					for (int i = 0; i < prop.m_noofInstances; i++)
+					{
+						bx::mtxSRT(prop.m_instances[i].m_world
+							, 2.0f, 2.0f, 2.0f
+							, 0.0f, 0.0f, 0.0f
+							, rand01() * 100.0f - 50.0f, 1.0f, rand01() * 100.0f - 50.0f
+						);
+
+						float temp[4];
+						setVector4(temp, -0.5f, -0.5f, -0.5f, 1.0f);
+						bx::vec4MulMtx(prop.m_instances[i].m_bboxMin, temp, prop.m_instances[i].m_world);
+
+						setVector4(temp, 0.5f, 0.5f, 0.5f, 1.0f);
+						bx::vec4MulMtx(prop.m_instances[i].m_bboxMax, temp, prop.m_instances[i].m_world);
+					}
+
+					prop.m_materialID = m_noofMaterials;
+					setVector4(m_materials[prop.m_materialID].m_colour, 1.0f, 1.0f, 0.0f, 1.0f);
+					m_noofMaterials++;
+
+					m_totalInstancesCount += prop.m_noofInstances;
+				}
+
+				//add some more cubes
+				{
+					Prop& prop = m_props[m_noofProps++];
+
+					prop.m_renderPass = RenderPass::MainPass;
+
+					createCubeMesh(prop);
+
+					prop.m_noofInstances = 300;
+					prop.m_instances = new InstanceData[prop.m_noofInstances];
+					for (int i = 0; i < prop.m_noofInstances; i++)
+					{
+						bx::mtxSRT(prop.m_instances[i].m_world
+							, 2.0f, 4.0f, 2.0f
+							, 0.0f, 0.0f, 0.0f
+							, rand01() * 100.0f - 50.0f, 2.0f, rand01() * 100.0f - 50.0f
+						);
+
+						float temp[4];
+						setVector4(temp, -0.5f, -0.5f, -0.5f, 1.0f);
+						bx::vec4MulMtx(prop.m_instances[i].m_bboxMin, temp, prop.m_instances[i].m_world );
+
+						setVector4(temp, 0.5f, 0.5f, 0.5f, 1.0f);
+						bx::vec4MulMtx(prop.m_instances[i].m_bboxMax, temp, prop.m_instances[i].m_world);
+					}
+
+					prop.m_materialID = m_noofMaterials;
+					setVector4(m_materials[prop.m_materialID].m_colour, 1.0f, 0.0f, 0.0f, 1.0f);
+					m_noofMaterials++;
+
+					m_totalInstancesCount += prop.m_noofInstances;
+				}
+			}
+		}
+
+		//Setup Occlusion pass
+		{
+			const uint32_t samplerFlags = 0
+				| BGFX_TEXTURE_RT
+				| BGFX_TEXTURE_MIN_POINT
+				| BGFX_TEXTURE_MAG_POINT
+				| BGFX_TEXTURE_MIP_POINT
+				| BGFX_TEXTURE_U_CLAMP
+				| BGFX_TEXTURE_V_CLAMP;
+
+			// Create buffers for the HiZ pass
+			m_hiZDepthBuffer = bgfx::createFrameBuffer(uint16_t(m_hiZwidth), uint16_t(m_hiZheight), bgfx::TextureFormat::D32, samplerFlags);
+
+			bgfx::TextureHandle buffer = bgfx::createTexture2D(uint16_t(m_hiZwidth), uint16_t(m_hiZheight), true, 1, bgfx::TextureFormat::R32F, BGFX_TEXTURE_COMPUTE_WRITE | samplerFlags);
+			m_hiZBuffer = bgfx::createFrameBuffer(1, &buffer, true);
+
+			//how many mips will the Hi Z buffer have?
+			m_noofHiZMips = (uint8_t)(1 + floor(log2(bx::uint32_max(m_hiZwidth, m_hiZheight))));
+
+			// Setup compute shader buffers
+
+			//The compute shader will write how many unoccluded instances per drawcall there are here
+			m_drawcallInstanceCounts = bgfx::createDynamicIndexBuffer(s_maxNoofProps, BGFX_BUFFER_INDEX32 | BGFX_BUFFER_COMPUTE_READ_WRITE);
+
+			//the compute shader will write the result of the occlusion test for each instance here
+			m_instancePredicates = bgfx::createDynamicIndexBuffer(s_maxNoofInstances, BGFX_BUFFER_COMPUTE_READ_WRITE);
+
+			//bounding box for each instance, will be fed to the compute shader to calculate occlusion
+			{
+				bgfx::VertexDecl computeVertexDecl;
+				computeVertexDecl.begin()
+					.add(bgfx::Attrib::TexCoord0, 4, bgfx::AttribType::Float)
+					.end();
+
+				//initialise the buffer with the bounding boxes of all instances (two vec4s each: min then max)
+				const int sizeOfBuffer = 2 * 4 * m_totalInstancesCount;
+				const bgfx::Memory* mem = bgfx::alloc(sizeof(float) * sizeOfBuffer);
+
+				float* data = (float*)mem->data;
+				for (uint16_t i = 0; i < m_noofProps; i++)
+				{
+					Prop& prop = m_props[i];
+
+					const uint32_t numInstances = prop.m_noofInstances;
+
+					for (uint32_t j = 0; j < numInstances; j++)
+					{
+						memcpy(data, prop.m_instances[j].m_bboxMin, 3 * sizeof(float));
+						data[3] = (float)i; // store the drawcall ID here to avoid creating a separate buffer
+						data += 4;
+
+						memcpy(data, prop.m_instances[j].m_bboxMax, 3 * sizeof(float));
+						data += 4;
+					}
+				}
+
+				// mem was allocated by bgfx, which owns and releases it; nothing to free here
+
+				m_instanceBoundingBoxes = bgfx::createDynamicVertexBuffer(mem, computeVertexDecl, BGFX_BUFFER_COMPUTE_READ);
+			}
+
+			//pre and post occlusion culling instance data buffers
+			{
+				bgfx::VertexDecl instanceBufferVertexDecl;
+				instanceBufferVertexDecl.begin()
+					.add(bgfx::Attrib::TexCoord0, 4, bgfx::AttribType::Float)
+					.add(bgfx::Attrib::TexCoord1, 4, bgfx::AttribType::Float)
+					.add(bgfx::Attrib::TexCoord2, 4, bgfx::AttribType::Float)
+					.add(bgfx::Attrib::TexCoord3, 4, bgfx::AttribType::Float)
+					.end();
+
+				//initialise the buffer with data for all instances
+				//Currently we only store a world matrix (16 floats)
+				const int sizeOfBuffer = 16 * m_totalInstancesCount;
+				const bgfx::Memory* mem = bgfx::alloc(sizeof(float) * sizeOfBuffer);
+
+				float* data = (float*)mem->data;
+				for (uint16_t i = 0; i < m_noofProps; i++)
+				{
+					Prop& prop = m_props[i];
+
+					const uint32_t numInstances = prop.m_noofInstances;
+
+					for (uint32_t j = 0; j < numInstances; j++)
+					{
+						memcpy(data, prop.m_instances[j].m_world, 16 * sizeof(float));
+						data[3] = (float)i; // overwrite element 3 of the matrix with the drawcall ID to avoid creating a separate buffer
+						data += 16;
+					}
+				}
+
+				// mem was allocated by bgfx, which owns and releases it; nothing to free here
+
+				//pre occlusion buffer
+				m_instanceBuffer = bgfx::createVertexBuffer(mem, instanceBufferVertexDecl, BGFX_BUFFER_COMPUTE_READ);
+
+				//post occlusion buffer
+				m_culledInstanceBuffer = bgfx::createDynamicVertexBuffer(4 * m_totalInstancesCount, instanceBufferVertexDecl, BGFX_BUFFER_COMPUTE_WRITE);
+			}
+
+			//we use one "drawcall" per prop to render all its instances
+			m_indirectBuffer = bgfx::createIndirectBuffer(m_noofProps);
+
+			// Create programs from shaders for occlusion pass.
+			m_programOcclusionPass = loadProgram("vs_renderOcclusion", "fs_renderOcclusion");
+			m_programDownscaleHiZ = loadProgram("cs_downscaleHiZ", nullptr);
+			m_programOccludeProps = loadProgram("cs_occludeProps", nullptr);
+			m_programStreamCompaction = loadProgram("cs_streamCompaction", nullptr);
+
+			// Set view RENDER_PASS_HIZ_ID clear state.
+			bgfx::setViewClear(RENDER_PASS_HIZ_ID
+				, BGFX_CLEAR_DEPTH
+				, 0x0
+				, 1.0f
+				, 0
+			);
+		}
+
+		// Setup Main pass
+		{
+			// Set view RENDER_PASS_MAIN_ID clear state.
+			bgfx::setViewClear(RENDER_PASS_MAIN_ID
+				, BGFX_CLEAR_COLOR | BGFX_CLEAR_DEPTH
+				, 0x303030ff
+				, 1.0f
+				, 0
+			);
+
+			// Create program from shaders.
+			m_programMainPass = loadProgram("vs_instancedIndirectRendering", "fs_instancedIndirectRendering");
+		}
+
+		// Create static vertex buffer for all props.
+
+		// Calculate how many vertices/indices the master buffers will need.
+		uint16_t totalNoofVertices = 0;
+		uint16_t totalNoofIndices = 0;
+		for (uint16_t i = 0; i < m_noofProps; i++)
+		{
+			Prop& prop = m_props[i];
+
+			totalNoofVertices += prop.m_noofVertices;
+			totalNoofIndices += prop.m_noofIndices;
+		}
+
+		//CPU data to fill the master buffers (kept in members because the GPU buffers reference it via makeRef)
+		m_allPropVerticesDataCPU = new PosVertex[totalNoofVertices];
+		m_allPropIndicesDataCPU = new uint16_t[totalNoofIndices];
+		m_indirectBufferDataCPU = new uint32_t[m_noofProps * 3];
+
+		// Copy data over to the master buffers
+		PosVertex* propVerticesData = m_allPropVerticesDataCPU;
+		uint16_t* propIndicesData = m_allPropIndicesDataCPU;
+
+		uint16_t vertexBufferOffset = 0;
+		uint16_t indexBufferOffset = 0;
+
+		for (uint16_t i = 0; i < m_noofProps; i++)
+		{
+			Prop& prop = m_props[i];
+
+			memcpy(propVerticesData, prop.m_vertices, prop.m_noofVertices * sizeof(PosVertex));
+			memcpy(propIndicesData, prop.m_indices, prop.m_noofIndices * sizeof(uint16_t));
+
+			propVerticesData += prop.m_noofVertices;
+			propIndicesData += prop.m_noofIndices;
+			// three uints per prop: index count, index buffer offset, vertex buffer offset
+			m_indirectBufferDataCPU[ i * 3 ] = prop.m_noofIndices;
+			m_indirectBufferDataCPU[ i * 3 + 1] = indexBufferOffset;
+			m_indirectBufferDataCPU[ i * 3 + 2] = vertexBufferOffset;
+
+			indexBufferOffset += prop.m_noofIndices;
+			vertexBufferOffset += prop.m_noofVertices;
+		}
+
+		// Create master vertex buffer
+		m_allPropsVertexbufferHandle = bgfx::createVertexBuffer(
+					  bgfx::makeRef(m_allPropVerticesDataCPU, totalNoofVertices * PosVertex::ms_decl.getStride())
+					, PosVertex::ms_decl
+					);
+
+		// Create master index buffer.
+		m_allPropsIndexbufferHandle = bgfx::createIndexBuffer(
+					bgfx::makeRef(m_allPropIndicesDataCPU, totalNoofIndices * sizeof(uint16_t) )
+					);
+
+		// Create buffer with const drawcall data which will be copied to the indirect buffer later.  
+		m_indirectBufferData = bgfx::createIndexBuffer(
+			bgfx::makeRef(m_indirectBufferDataCPU, m_noofProps * 3 * sizeof(uint32_t)),
+			BGFX_BUFFER_COMPUTE_READ | BGFX_BUFFER_INDEX32
+		);
+
+		//create samplers
+		s_texOcclusionDepthIn = bgfx::createUniform("s_texOcclusionDepthIn", bgfx::UniformType::Int1);
+
+		//create uniforms
+		u_inputRTSize = bgfx::createUniform("u_inputRTSize", bgfx::UniformType::Vec4);
+		u_cullingConfig = bgfx::createUniform("u_cullingConfig", bgfx::UniformType::Vec4);
+		u_colour = bgfx::createUniform("u_colour", bgfx::UniformType::Vec4);
+
+		m_timeOffset = bx::getHPCounter();
+
+		m_useIndirect = true;
+
+		imguiCreate();
+	}
+
+	// Tears down ImGui, every bgfx handle destroyed below, the CPU-side arrays and finally bgfx. Returns 0.
+	int shutdown() override
+	{
+		imguiDestroy();
+		// Shader programs.
+		const bgfx::ProgramHandle programs[] =
+		{
+			m_programMainPass, m_programOcclusionPass, m_programDownscaleHiZ,
+			m_programOccludeProps, m_programStreamCompaction,
+		};
+		for (const bgfx::ProgramHandle& program : programs)
+		{
+			bgfx::destroy(program);
+		}
+
+		// Per-prop GPU buffers, followed by the CPU arrays each prop owns.
+		for (uint16_t propIdx = 0; propIdx < m_noofProps; ++propIdx)
+		{
+			Prop& current = m_props[propIdx];
+			bgfx::destroy(current.m_indexbufferHandle);
+			bgfx::destroy(current.m_vertexbufferHandle);
+			delete[] current.m_indices;
+			delete[] current.m_vertices;
+			delete[] current.m_instances;
+		}
+		delete[] m_props;
+		// NOTE(review): m_materials is not released here; confirm where it is owned.
+
+		// Frame buffers, compute/indirect buffers, master geometry buffers, then sampler and uniforms.
+		bgfx::destroy(m_hiZDepthBuffer);
+		bgfx::destroy(m_hiZBuffer);
+		bgfx::destroy(m_indirectBuffer);
+		bgfx::destroy(m_indirectBufferData);
+		bgfx::destroy(m_instanceBoundingBoxes);
+		bgfx::destroy(m_drawcallInstanceCounts);
+		bgfx::destroy(m_instancePredicates);
+		bgfx::destroy(m_instanceBuffer);
+		bgfx::destroy(m_culledInstanceBuffer);
+		bgfx::destroy(m_allPropsVertexbufferHandle);
+		bgfx::destroy(m_allPropsIndexbufferHandle);
+		bgfx::destroy(s_texOcclusionDepthIn);
+		bgfx::destroy(u_inputRTSize);
+		bgfx::destroy(u_cullingConfig);
+		bgfx::destroy(u_colour);
+
+		// CPU arrays that were passed to bgfx::makeRef; freed after their buffers are destroyed.
+		delete[] m_allPropVerticesDataCPU;
+		delete[] m_allPropIndicesDataCPU;
+		delete[] m_indirectBufferDataCPU;
+
+		bgfx::shutdown();
+		return 0;
+	}
+
+	// Draws every prop flagged RenderPass::Occlusion, instanced, into m_hiZDepthBuffer through
+	// view RENDER_PASS_HIZ_ID. Also recomputes m_occlusionProj, which renderOccludePropsPass()
+	// passes on to its compute view.
+	void renderOcclusionBufferPass()
+	{
+		// The occlusion projection is built from the Hi-Z target's aspect ratio.
+		const float aspect = float(m_hiZwidth) / float(m_hiZheight);
+		bx::mtxProj(m_occlusionProj, 60.0f, aspect, 0.1f, 500.0f, bgfx::getCaps()->homogeneousDepth);
+
+		bgfx::setViewTransform(RENDER_PASS_HIZ_ID, m_mainView, m_occlusionProj);
+
+		bgfx::setViewFrameBuffer(RENDER_PASS_HIZ_ID, m_hiZDepthBuffer);
+		bgfx::setViewRect(RENDER_PASS_HIZ_ID, 0, 0, uint16_t(m_hiZwidth), uint16_t(m_hiZheight));
+
+		const uint16_t stride = sizeof(InstanceData);
+
+		// One instanced draw call per occluder prop.
+		for (uint16_t propIdx = 0; propIdx < m_noofProps; ++propIdx)
+		{
+			Prop& occluder = m_props[propIdx];
+
+			// Props that are not tagged for the occlusion pass are not drawn here.
+			if (!(occluder.m_renderPass & RenderPass::Occlusion))
+			{
+				continue;
+			}
+
+			// Skip the prop when bgfx reports fewer instance slots available than requested.
+			const uint32_t count = occluder.m_noofInstances;
+			if (count != bgfx::getAvailInstanceDataBuffer(count, stride))
+			{
+				continue;
+			}
+
+			bgfx::InstanceDataBuffer instances;
+			bgfx::allocInstanceDataBuffer(&instances, count, stride);
+
+			// Only the world matrix is written; the occlusion pass needs nothing else.
+			InstanceData* dst = (InstanceData*)instances.data;
+			for (uint32_t inst = 0; inst < count; ++inst, ++dst)
+			{
+				memcpy(dst->m_world, occluder.m_instances[inst].m_world, sizeof(dst->m_world));
+			}
+
+			// Geometry and per-instance data for this prop.
+			bgfx::setVertexBuffer(0, occluder.m_vertexbufferHandle);
+			bgfx::setIndexBuffer(occluder.m_indexbufferHandle);
+			bgfx::setInstanceDataBuffer(&instances);
+
+			// Default render state, then queue the draw on the Hi-Z view.
+			bgfx::setState(BGFX_STATE_DEFAULT);
+			bgfx::submit(RENDER_PASS_HIZ_ID, m_programOcclusionPass);
+		}
+	}
+
+	// Fills the mip chain of m_hiZBuffer with one compute dispatch per level: mip 0 is copied
+	// from m_hiZDepthBuffer, every later mip is produced from the level before it.
+	void renderDownscalePass()
+	{
+		uint32_t mipWidth = m_hiZwidth;
+		uint32_t mipHeight = m_hiZheight;
+
+		for (uint8_t mip = 0; mip < m_noofHiZMips; ++mip)
+		{
+			const bool isCopy = (0 == mip);
+			// xy: size of the level being read (set before the halving below), zw: coordinate scale.
+			const float scale = isCopy ? 1.0f : 2.0f;
+			const float inputSize[4] = { (float)mipWidth, (float)mipHeight, scale, scale };
+			bgfx::setUniform(u_inputRTSize, inputSize);
+
+			if (isCopy)
+			{
+				// Level 0 is copied by the shader: blit requires the same format and CopyResource is not exposed.
+				bgfx::setImage(0, getTexture(m_hiZDepthBuffer, 0), 0, bgfx::Access::Read);
+				bgfx::setImage(1, getTexture(m_hiZBuffer, 0), 0, bgfx::Access::Write);
+			}
+			else
+			{
+				// Read level mip-1 and write level mip at half the size.
+				mipWidth /= 2;
+				mipHeight /= 2;
+				bgfx::setImage(0, getTexture(m_hiZBuffer, 0), mip - 1, bgfx::Access::Read);
+				bgfx::setImage(1, getTexture(m_hiZBuffer, 0), mip, bgfx::Access::Write);
+			}
+			// NOTE(review): /16 presumably matches a 16x16 thread group in cs_downscaleHiZ - confirm.
+			bgfx::dispatch(RENDER_PASS_HIZ_DOWNSCALE_ID, m_programDownscaleHiZ, mipWidth/16, mipHeight/16);
+		}
+	}
+
+	// Culls instances against the Hi-Z mip chain (first dispatch), then compacts the survivors
+	// and fills the buffer that drives the indirect draw (second dispatch).
+	void renderOccludePropsPass()
+	{
+		//run the compute shader to determine visibility of each instance
+		bgfx::setTexture(0, s_texOcclusionDepthIn, bgfx::getTexture(m_hiZBuffer));
+		bgfx::setBuffer(1, m_instanceBoundingBoxes, bgfx::Access::Read);
+		bgfx::setBuffer(2, m_drawcallInstanceCounts, bgfx::Access::ReadWrite);
+		bgfx::setBuffer(3, m_instancePredicates, bgfx::Access::Write);
+		// xy: Hi-Z buffer size, zw: reciprocal of that size
+		float inputRendertargetSize[4] = { (float)m_hiZwidth, (float)m_hiZheight, 1.0f/ m_hiZwidth, 1.0f/ m_hiZheight };
+		bgfx::setUniform(u_inputRTSize, inputRendertargetSize);
+
+		//store the next power of two above the instance count, 2^(floor(log2(n)) + 1), for the stream compaction step
+		float noofInstancesPowOf2 = (float)pow(2, floor(log(m_totalInstancesCount) / log(2)) + 1);
+		// x: instance count, y: power-of-two padded count, z: number of Hi-Z mips, w: number of props
+		float cullingConfig[4] = { (float)m_totalInstancesCount, noofInstancesPowOf2 , (float)m_noofHiZMips, (float)m_noofProps };
+		bgfx::setUniform(u_cullingConfig, cullingConfig);
+
+		//set the view/projection transforms so that the compute shader can receive the viewProjection matrix automagically
+		bgfx::setViewTransform(RENDER_PASS_OCCLUDE_PROPS_ID, m_mainView, m_occlusionProj);
+		// count/64 + 1 groups; NOTE(review): 64 presumably matches the thread-group size of cs_occludeProps - confirm.
+		uint16_t groupX = bx::uint16_max(m_totalInstancesCount / 64 + 1, 1);
+
+		bgfx::dispatch(RENDER_PASS_OCCLUDE_PROPS_ID, m_programOccludeProps, groupX, 1, 1);
+
+		//perform stream compaction to remove occluded instances
+
+		//the per drawcall data that is constant (noof indices/vertices and offsets to vertex/index buffers)
+	 	bgfx::setBuffer(0, m_indirectBufferData, bgfx::Access::Read);
+		//instance data for all instances (pre culling)
+		bgfx::setBuffer(1, m_instanceBuffer, bgfx::Access::Read);
+		//per instance visibility (output of culling pass)
+		bgfx::setBuffer(2, m_instancePredicates, bgfx::Access::Read);
+
+		//how many instances per drawcall
+		bgfx::setBuffer(3, m_drawcallInstanceCounts, bgfx::Access::ReadWrite);
+		//drawcall data that will drive drawIndirect
+		bgfx::setBuffer(4, m_indirectBuffer, bgfx::Access::ReadWrite);
+		//culled instance data
+		bgfx::setBuffer(5, m_culledInstanceBuffer, bgfx::Access::Write);
+		// NOTE(review): set again for the second dispatch; presumably uniforms do not carry over between submissions - confirm.
+		bgfx::setUniform(u_cullingConfig, cullingConfig);
+		// A single 1x1x1 group; NOTE(review): presumably the shader covers all instances inside one group - confirm.
+		bgfx::dispatch(RENDER_PASS_COMPACT_STREAM_ID, m_programStreamCompaction, 1, 1, 1);
+
+	}
+
+	// Draws the props to the screen through view RENDER_PASS_MAIN_ID: either one indirect submit
+	// fed by m_culledInstanceBuffer and m_indirectBuffer (m_useIndirect), or one regular
+	// instanced draw per prop flagged RenderPass::MainPass.
+	void renderMainPass()
+	{
+		// View transform and viewport; an HMD that is rendering overrides both.
+		const bgfx::HMD* hmd = bgfx::getHMD();
+		const bool hmdRendering = (NULL != hmd) && (0 != (hmd->flags & BGFX_HMD_RENDERING));
+
+		if (!hmdRendering)
+		{
+			bgfx::setViewTransform(RENDER_PASS_MAIN_ID, m_mainView, m_mainProj);
+			bgfx::setViewRect(RENDER_PASS_MAIN_ID, 0, 0, uint16_t(m_width), uint16_t(m_height));
+		}
+		else
+		{
+			// Stereo rendering: one projection per eye.
+			bgfx::setViewTransform(RENDER_PASS_MAIN_ID, m_mainView, hmd->eye[0].projection, BGFX_VIEW_STEREO, hmd->eye[1].projection);
+
+			// Use the HMD's width/height since its internal frame buffer
+			// might be much larger than the window.
+			bgfx::setViewRect(RENDER_PASS_MAIN_ID, 0, 0, hmd->width, hmd->height);
+		}
+
+		bgfx::setState(BGFX_STATE_DEFAULT);
+
+		const uint16_t stride = sizeof(InstanceData);
+
+		// "Material" data (currently a colour only), m_noofMaterials entries.
+		// NOTE(review): u_colour is created without an explicit element count in init(); confirm
+		// that more than one material is actually uploaded.
+		bgfx::setUniform(u_colour, &m_materials[0].m_colour, m_noofMaterials);
+
+		if (m_useIndirect)
+		{
+			// GPU-driven path: master vertex/index buffers shared by all props.
+			bgfx::setVertexBuffer(0, m_allPropsVertexbufferHandle);
+			bgfx::setIndexBuffer( m_allPropsIndexbufferHandle);
+
+			// Instance data written by the stream compaction dispatch.
+			bgfx::setInstanceDataBuffer(m_culledInstanceBuffer,  0,  m_totalInstancesCount );
+
+			// m_noofProps indirect draws read from m_indirectBuffer, starting at entry 0.
+			bgfx::submit(RENDER_PASS_MAIN_ID, m_programMainPass, m_indirectBuffer, 0, m_noofProps);
+			return;
+		}
+
+		// Regular instancing: every instance is drawn, one draw call per prop.
+		for (uint16_t propIdx = 0; propIdx < m_noofProps; ++propIdx)
+		{
+			Prop& current = m_props[propIdx];
+
+			if (!(current.m_renderPass & RenderPass::MainPass))
+			{
+				continue;
+			}
+
+			// Skip the prop when bgfx reports fewer instance slots available than requested.
+			const uint32_t count = current.m_noofInstances;
+			if (count != bgfx::getAvailInstanceDataBuffer(count, stride))
+			{
+				continue;
+			}
+
+			bgfx::InstanceDataBuffer instances;
+			bgfx::allocInstanceDataBuffer(&instances, count, stride);
+
+			InstanceData* dst = (InstanceData*)instances.data;
+			for (uint32_t inst = 0; inst < count; ++inst, ++dst)
+			{
+				memcpy(dst->m_world, current.m_instances[inst].m_world, sizeof(dst->m_world));
+
+				// The material ID is packed into element 3 of the world matrix.
+				dst->m_world[3] = current.m_materialID;
+			}
+
+			bgfx::setVertexBuffer(0, current.m_vertexbufferHandle);
+			bgfx::setIndexBuffer(current.m_indexbufferHandle);
+			bgfx::setInstanceDataBuffer(&instances);
+
+			bgfx::submit(RENDER_PASS_MAIN_ID, m_programMainPass);
+		}
+	}
+
+	// Runs one frame: ImGui settings window, camera input, then either the four render passes
+	// or an on-screen warning when instancing is unsupported. Returns true after bgfx::frame(),
+	// or false, without submitting anything, when entry::processEvents() returns true.
+	bool update() override
+	{
+		if (entry::processEvents(m_width, m_height, m_debug, m_reset, &m_mouseState) )
+		{
+			return false;
+		}
+
+		imguiBeginFrame(m_mouseState.m_mx
+			,  m_mouseState.m_my
+			, (m_mouseState.m_buttons[entry::MouseButton::Left  ] ? IMGUI_MBUT_LEFT   : 0)
+			| (m_mouseState.m_buttons[entry::MouseButton::Right ] ? IMGUI_MBUT_RIGHT  : 0)
+			| (m_mouseState.m_buttons[entry::MouseButton::Middle] ? IMGUI_MBUT_MIDDLE : 0)
+			,  m_mouseState.m_mz
+			, uint16_t(m_width)
+			, uint16_t(m_height)
+			);
+
+		showExampleDialog(this);
+
+		// Settings window: placed near the top-right corner on first use, with one checkbox
+		// that switches between the indirect and the regular instanced path.
+		ImGui::SetNextWindowPos(ImVec2(m_width - m_width / 5.0f - 10.0f, 10.0f), ImGuiCond_FirstUseEver);
+		ImGui::SetNextWindowSize(ImVec2(m_width / 5.0f, m_height / 6.0f), ImGuiCond_FirstUseEver);
+		ImGui::Begin("Settings", NULL, 0);
+		ImGui::Checkbox("Use Draw Indirect", &m_useIndirect);
+
+		ImGui::End();
+
+		imguiEndFrame();
+
+		// Dummy draw call so that view 0 is cleared even if nothing else is submitted to it.
+		bgfx::touch(0);
+
+		// Time since the previous call; `last` is a function-local static, so the first call yields 0.
+		const int64_t now = bx::getHPCounter();
+		static int64_t last = now;
+		const int64_t frameTime = now - last;
+		last = now;
+		const double freq = double(bx::getHPFrequency());
+		const float deltaTimeSec = float(double(frameTime) / freq);
+
+		// Camera: left button orbits, right button or scroll dollies; ignored while over the GUI.
+		const bool mouseOverGui = ImGui::MouseOverArea();
+		m_mouse.update(float(m_mouseState.m_mx), float(m_mouseState.m_my), m_mouseState.m_mz, m_width, m_height);
+
+		if (!mouseOverGui)
+		{
+			const bool leftDown = 0 != m_mouseState.m_buttons[entry::MouseButton::Left];
+			const bool rightDown = 0 != m_mouseState.m_buttons[entry::MouseButton::Right];
+
+			if (leftDown)
+			{
+				m_camera.orbit(m_mouse.m_dx, m_mouse.m_dy);
+			}
+			else if (rightDown)
+			{
+				m_camera.dolly(m_mouse.m_dx + m_mouse.m_dy);
+			}
+			else if (0 != m_mouse.m_scroll)
+			{
+				m_camera.dolly(float(m_mouse.m_scroll)*0.05f);
+			}
+		}
+
+		m_camera.update(deltaTimeSec);
+
+		// Renderer capabilities decide whether the passes are submitted at all.
+		const bgfx::Caps* caps = bgfx::getCaps();
+		const bool instancingSupported = 0 != (BGFX_CAPS_INSTANCING & caps->supported);
+
+		if (instancingSupported)
+		{
+			// Main view and projection matrices, computed once and reused by the passes.
+			m_camera.mtxLookAt(m_mainView);
+			bx::mtxProj(m_mainProj, 60.0f, float(m_width) / float(m_height), 0.1f, 500.0f, caps->homogeneousDepth);
+
+			// Submit draw calls / dispatches for every pass.
+			// 1. occluders -> depth buffer
+			renderOcclusionBufferPass();
+			// 2. depth buffer -> Hi-Z mip chain
+			renderDownscalePass();
+			// 3. per-instance culling + stream compaction
+			renderOccludePropsPass();
+			// 4. props -> screen
+			renderMainPass();
+		}
+		else
+		{
+			// There is no non-instanced fallback: only a blinking message in the debug text overlay.
+			const float time = (float)((bx::getHPCounter() - m_timeOffset) / double(bx::getHPFrequency()));
+			const bool blink = uint32_t(time*3.0f)&1;
+			bgfx::dbgTextPrintf(0, 0, blink ? 0x1f : 0x01, " Instancing is not supported by GPU. ");
+		}
+
+		// Advance to next frame. Rendering thread will be kicked to
+		// process submitted rendering primitives.
+		bgfx::frame();
+
+		return true;
+	}
+
+	entry::MouseState m_mouseState; // filled in by entry::processEvents() every update()
+
+	uint32_t m_width; // window size, updated by entry::processEvents()
+	uint32_t m_height;
+	uint32_t m_hiZwidth; // size of the Hi-Z render target (view rect of the occlusion pass)
+	uint32_t m_hiZheight;
+	uint32_t m_debug;
+	uint32_t m_reset;
+
+	float m_mainView[16]; // camera look-at matrix, rebuilt in update()
+	float m_mainProj[16]; // projection using the window aspect ratio
+	float m_occlusionProj[16]; // projection using the Hi-Z aspect ratio
+
+	bgfx::ProgramHandle m_programMainPass;
+	bgfx::ProgramHandle m_programOcclusionPass;
+	bgfx::ProgramHandle m_programDownscaleHiZ;
+	bgfx::ProgramHandle m_programOccludeProps;
+	bgfx::ProgramHandle m_programStreamCompaction;
+
+	bgfx::FrameBufferHandle m_hiZDepthBuffer; // target of renderOcclusionBufferPass()
+	bgfx::FrameBufferHandle m_hiZBuffer; // mip chain written by renderDownscalePass()
+	bgfx::IndirectBufferHandle m_indirectBuffer; // draw data written by the compaction dispatch, consumed by submit()
+
+	bgfx::VertexBufferHandle m_allPropsVertexbufferHandle; // master vertex buffer holding all props
+	bgfx::IndexBufferHandle  m_allPropsIndexbufferHandle; // master index buffer holding all props
+	bgfx::IndexBufferHandle m_indirectBufferData; // constant per-draw data: index count, index offset, vertex offset
+
+	PosVertex* m_allPropVerticesDataCPU; // CPU memory referenced by the master vertex buffer (makeRef)
+	uint16_t* m_allPropIndicesDataCPU; // CPU memory referenced by the master index buffer (makeRef)
+	uint32_t* m_indirectBufferDataCPU; // CPU memory referenced by m_indirectBufferData (makeRef)
+
+	bgfx::DynamicVertexBufferHandle m_instanceBoundingBoxes; // read by the culling dispatch
+	bgfx::DynamicIndexBufferHandle m_drawcallInstanceCounts; // read/written by both compute dispatches
+	bgfx::DynamicIndexBufferHandle m_instancePredicates; // written by culling, read by compaction
+	bgfx::VertexBufferHandle m_instanceBuffer; // instance data for all instances (pre culling)
+	bgfx::DynamicVertexBufferHandle m_culledInstanceBuffer; // written by compaction, used as instance data in the main pass
+
+	bgfx::UniformHandle s_texOcclusionDepthIn;
+	bgfx::UniformHandle u_inputRTSize;
+	bgfx::UniformHandle u_cullingConfig;
+	bgfx::UniformHandle u_colour;
+
+	Prop*	m_props; // array of m_noofProps entries, released in shutdown()
+	Material* m_materials; // m_noofMaterials entries; NOTE(review): not released in shutdown() - confirm ownership
+	uint16_t m_noofProps;
+	uint16_t m_noofMaterials;
+	uint16_t m_totalInstancesCount; // instance count handed to the culling/compaction shaders
+
+	static const uint16_t s_maxNoofProps = 10;
+	static const uint16_t s_maxNoofPerPropInstances = 200;
+
+	static const uint16_t s_maxNoofInstances = 2048;
+
+	int64_t m_timeOffset; // bx::getHPCounter() value captured in init()
+
+	uint8_t m_noofHiZMips; // number of mip levels processed by renderDownscalePass()
+
+	bool m_useIndirect; // toggled by the "Use Draw Indirect" checkbox
+
+	Camera m_camera;
+	Mouse m_mouse;
+
+};
+
+} // namespace
+
+ENTRY_IMPLEMENT_MAIN(GPUDrivenRendering, "37-gpudrivenrendering", "GPU-Driven Rendering."); // hands the app class, its example name and its description to the entry framework
diff --git a/examples/37-gpudrivenrendering/makefile b/examples/37-gpudrivenrendering/makefile
new file mode 100644
index 000000000..171709170
--- /dev/null
+++ b/examples/37-gpudrivenrendering/makefile
@@ -0,0 +1,10 @@
+#
+# Copyright 2011-2018 Branimir Karadzic. All rights reserved.
+# License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
+#
+
+BGFX_DIR=../..
+RUNTIME_DIR=$(BGFX_DIR)/examples/runtime
+BUILD_DIR=../../.build
+# Shared rules from the bgfx tree; presumably they build this example's shaders - confirm.
+include $(BGFX_DIR)/scripts/shader.mk
diff --git a/examples/37-gpudrivenrendering/varying.def.sc b/examples/37-gpudrivenrendering/varying.def.sc
new file mode 100644
index 000000000..b55648db6
--- /dev/null
+++ b/examples/37-gpudrivenrendering/varying.def.sc
@@ -0,0 +1,7 @@
+uint v_materialID : TEXCOORD0;
+
+vec3 a_position  : POSITION;
+vec4 i_data0     : TEXCOORD7;
+vec4 i_data1     : TEXCOORD6;
+vec4 i_data2     : TEXCOORD5;
+vec4 i_data3     : TEXCOORD4;
diff --git a/examples/37-gpudrivenrendering/varying_pos_tex0.def.sc b/examples/37-gpudrivenrendering/varying_pos_tex0.def.sc
new file mode 100644
index 000000000..ece512702
--- /dev/null
+++ b/examples/37-gpudrivenrendering/varying_pos_tex0.def.sc
@@ -0,0 +1,4 @@
+vec2 v_texcoord0 : TEXCOORD0;
+
+vec3 a_position  : POSITION;
+vec2 a_texcoord0 : TEXCOORD0;
diff --git a/examples/37-gpudrivenrendering/vs_instancedIndirectRendering.sc b/examples/37-gpudrivenrendering/vs_instancedIndirectRendering.sc
new file mode 100644
index 000000000..a29e24214
--- /dev/null
+++ b/examples/37-gpudrivenrendering/vs_instancedIndirectRendering.sc
@@ -0,0 +1,24 @@
+$input a_position, i_data0, i_data1, i_data2, i_data3
+$output v_materialID
+
+/*
+ * Copyright 2018 Kostas Anagnostou. All rights reserved.
+ * License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
+ */
+
+#include "../common/common.sh"
+
+void main()
+{
+	mat4 model;
+	model[0] = i_data0;
+	model[1] = i_data1;
+	model[2] = i_data2;
+	model[3] = i_data3;
+	// i_data0.w carries the material ID: convert it explicitly to uint, then clear the slot in the matrix.
+	v_materialID = uint(i_data0.w);
+	model[0][3] = 0.0;
+
+	vec4 worldPos = instMul(model, vec4(a_position, 1.0) );
+	gl_Position = mul(u_viewProj, worldPos);
+}
diff --git a/examples/37-gpudrivenrendering/vs_renderOcclusion.sc b/examples/37-gpudrivenrendering/vs_renderOcclusion.sc
new file mode 100644
index 000000000..dcfe4b606
--- /dev/null
+++ b/examples/37-gpudrivenrendering/vs_renderOcclusion.sc
@@ -0,0 +1,20 @@
+$input a_position, i_data0, i_data1, i_data2, i_data3
+
+/*
+ * Copyright 2018 Kostas Anagnostou. All rights reserved.
+ * License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
+ */
+
+#include "../common/common.sh"
+// Occluder vertex shader: rebuilds the instance matrix from i_data0..3 and projects the vertex with u_viewProj.
+void main()
+{
+	mat4 model;
+	model[0] = i_data0;
+	model[1] = i_data1;
+	model[2] = i_data2;
+	model[3] = i_data3;
+
+	vec4 worldPos = instMul(model, vec4(a_position, 1.0) );
+	gl_Position = mul(u_viewProj, worldPos);
+}