From 90aadf835b0c3c1c3b110cb4ecc8361eb25d0803 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Branimir=20Karad=C5=BEi=C4=87?= Date: Sun, 4 Mar 2018 16:11:05 -0800 Subject: [PATCH] 37-gpudrivenrendering: Fixed GL shaders. --- .../37-gpudrivenrendering/cs_downscaleHiZ.sc | 46 ++++----- .../37-gpudrivenrendering/cs_occludeProps.sc | 53 +++++------ .../cs_streamCompaction.sc | 93 ++++++++++--------- .../fs_instancedIndirectRendering.sc | 8 +- .../gpudrivenrendering.cpp | 10 +- examples/37-gpudrivenrendering/varying.def.sc | 6 +- .../varying_pos_tex0.def.sc | 4 - scripts/genie.lua | 1 + src/bgfx_compute.sh | 41 ++------ 9 files changed, 123 insertions(+), 139 deletions(-) delete mode 100644 examples/37-gpudrivenrendering/varying_pos_tex0.def.sc diff --git a/examples/37-gpudrivenrendering/cs_downscaleHiZ.sc b/examples/37-gpudrivenrendering/cs_downscaleHiZ.sc index 06d42efc4..8be37bf0a 100644 --- a/examples/37-gpudrivenrendering/cs_downscaleHiZ.sc +++ b/examples/37-gpudrivenrendering/cs_downscaleHiZ.sc @@ -14,31 +14,35 @@ uniform vec4 u_inputRTSize; NUM_THREADS(16, 16, 1) void main() { - //this shader can be used to both copy a mip over to the output and downscale it. - - ivec2 coord = gl_GlobalInvocationID.xy; - - if (all(coord.xy < u_inputRTSize.xy)) - { - float maxDepth = 1.0; - - if ( u_inputRTSize.z > 1) - { - vec4 depths = vec4( imageLoad(s_texOcclusionDepthIn, u_inputRTSize.zw * coord.xy ).r, - imageLoad(s_texOcclusionDepthIn, u_inputRTSize.zw * coord.xy + ivec2(1,0) ).r, - imageLoad(s_texOcclusionDepthIn, u_inputRTSize.zw * coord.xy + ivec2(0,1)).r, - imageLoad(s_texOcclusionDepthIn, u_inputRTSize.zw * coord.xy + ivec2(1,1)).r - ); + // this shader can be used to both copy a mip over to the output and downscale it. - //find and return max depth - maxDepth = max(max(depths.x, depths.y), max(depths.z, depths.w)); + ivec2 coord = ivec2(gl_GlobalInvocationID.xy); + + if (all(lessThan(coord.xy, u_inputRTSize.xy) ) ) + { + float maxDepth = 1.0; + + if (u_inputRTSize.z > 1) + { + vec4 depths = vec4( + imageLoad(s_texOcclusionDepthIn, ivec2(u_inputRTSize.zw * coord.xy ) ).x + , imageLoad(s_texOcclusionDepthIn, ivec2(u_inputRTSize.zw * coord.xy + ivec2(1.0, 0.0) ) ).x + , imageLoad(s_texOcclusionDepthIn, ivec2(u_inputRTSize.zw * coord.xy + ivec2(0.0, 1.0) ) ).x + , imageLoad(s_texOcclusionDepthIn, ivec2(u_inputRTSize.zw * coord.xy + ivec2(1.0, 1.0) ) ).x + ); + + // find and return max depth + maxDepth = max( + max(depths.x, depths.y) + , max(depths.z, depths.w) + ); } else { - //do not downscale, just copy the value over to the output rendertarget - maxDepth = imageLoad(s_texOcclusionDepthIn, coord.xy ).r; + // do not downscale, just copy the value over to the output rendertarget + maxDepth = imageLoad(s_texOcclusionDepthIn, coord.xy).x; } - + imageStore(s_texOcclusionDepthOut, coord, vec4(maxDepth,0,0,1) ); } -} \ No newline at end of file +} diff --git a/examples/37-gpudrivenrendering/cs_occludeProps.sc b/examples/37-gpudrivenrendering/cs_occludeProps.sc index b9fa40171..0183514f8 100644 --- a/examples/37-gpudrivenrendering/cs_occludeProps.sc +++ b/examples/37-gpudrivenrendering/cs_occludeProps.sc @@ -14,38 +14,39 @@ BUFFER_WR(instancePredicates, bool, 3); uniform vec4 u_inputRTSize; uniform vec4 u_cullingConfig; - + NUM_THREADS(64, 1, 1) void main() { bool predicate = false; - + //make sure that we not processing more instances than available - if (gl_GlobalInvocationID.x < (int)u_cullingConfig.x) + if (gl_GlobalInvocationID.x < uint(u_cullingConfig.x) ) { //get the bounding box for this instance vec4 bboxMin = instanceDataIn[2 * gl_GlobalInvocationID.x] ; vec3 bboxMax = instanceDataIn[2 * gl_GlobalInvocationID.x + 1].xyz; - - int drawcallID = bboxMin.w; - + + int drawcallID = int(bboxMin.w); + //Adapted from http://blog.selfshadow.com/publications/practical-visibility/ vec3 bboxSize = bboxMax.xyz - bboxMin.xyz; - vec3 boxCorners[] = { bboxMin.xyz, - bboxMin.xyz + vec3(bboxSize.x,0,0), - bboxMin.xyz + vec3(0, bboxSize.y,0), - bboxMin.xyz + vec3(0, 0, bboxSize.z), - bboxMin.xyz + vec3(bboxSize.xy,0), - bboxMin.xyz + vec3(0, bboxSize.yz), - bboxMin.xyz + vec3(bboxSize.x, 0, bboxSize.z), - bboxMin.xyz + bboxSize.xyz - }; - float minZ = 1; - vec2 minXY = vec2(1,1); - vec2 maxXY = vec2(0,0); + vec3 boxCorners[] = { + bboxMin.xyz, + bboxMin.xyz + vec3(bboxSize.x,0,0), + bboxMin.xyz + vec3(0, bboxSize.y,0), + bboxMin.xyz + vec3(0, 0, bboxSize.z), + bboxMin.xyz + vec3(bboxSize.xy,0), + bboxMin.xyz + vec3(0, bboxSize.yz), + bboxMin.xyz + vec3(bboxSize.x, 0, bboxSize.z), + bboxMin.xyz + bboxSize.xyz + }; + float minZ = 1.0; + vec2 minXY = vec2(1.0, 1.0); + vec2 maxXY = vec2(0.0, 0.0); - [unroll] + UNROLL for (int i = 0; i < 8; i++) { //transform World space aaBox to NDC @@ -61,20 +62,20 @@ void main() minXY = min(clipPos.xy, minXY); maxXY = max(clipPos.xy, maxXY); - minZ = saturate(min(minZ, clipPos.z)); + minZ = saturate(min(minZ, clipPos.z)); } vec4 boxUVs = vec4(minXY, maxXY); // Calculate hi-Z buffer mip - ivec2 size = (maxXY - minXY) * u_inputRTSize.xy; + ivec2 size = ivec2( (maxXY - minXY) * u_inputRTSize.xy); float mip = ceil(log2(max(size.x, size.y))); mip = clamp(mip, 0, u_cullingConfig.z); // Texel footprint for the lower (finer-grained) level - float level_lower = max(mip - 1, 0); - vec2 scale = exp2(-level_lower); + float level_lower = max(mip - 1, 0); + vec2 scale = vec2_splat(exp2(-level_lower) ); vec2 a = floor(boxUVs.xy*scale); vec2 b = ceil(boxUVs.zw*scale); vec2 dims = b - a; @@ -88,8 +89,8 @@ void main() texture2DLod(s_texOcclusionDepth, boxUVs.zy, mip).x, texture2DLod(s_texOcclusionDepth, boxUVs.xw, mip).x, texture2DLod(s_texOcclusionDepth, boxUVs.zw, mip).x, - }; - + }; + //find the max depth float maxDepth = max( max(depth.x, depth.y), max(depth.z, depth.w) ); @@ -98,7 +99,7 @@ void main() predicate = true; //increase instance count for this particular prop type - InterlockedAdd( drawcallInstanceCount[ drawcallID ], 1); + atomicAdd(drawcallInstanceCount[ drawcallID ], 1); } } diff --git a/examples/37-gpudrivenrendering/cs_streamCompaction.sc b/examples/37-gpudrivenrendering/cs_streamCompaction.sc index b1791cca1..f857e603e 100644 --- a/examples/37-gpudrivenrendering/cs_streamCompaction.sc +++ b/examples/37-gpudrivenrendering/cs_streamCompaction.sc @@ -21,102 +21,105 @@ BUFFER_RW(drawcallData, uvec4, 4); BUFFER_WR(instanceDataOut, vec4, 5); uniform vec4 u_cullingConfig; - + // Based on Parallel Prefix Sum (Scan) with CUDA by Mark Harris -groupshared uint temp[2048]; +SHARED uint temp[2048]; NUM_THREADS(1024, 1, 1) void main() { - int tID = gl_GlobalInvocationID.x; - int NoofInstancesPowOf2 = u_cullingConfig.y; - int NoofDrawcalls = u_cullingConfig.w; + uint tID = gl_GlobalInvocationID.x; + int NoofInstancesPowOf2 = int(u_cullingConfig.y); + int NoofDrawcalls = int(u_cullingConfig.w); int offset = 1; - temp[2 * tID] = instancePredicates[2 * tID]; // load input into shared memory - temp[2 * tID + 1] = instancePredicates[2 * tID + 1]; + temp[2 * tID ] = uint(instancePredicates[2 * tID ]); // load input into shared memory + temp[2 * tID + 1] = uint(instancePredicates[2 * tID + 1]); int d; - + //perform reduction - for (d = NoofInstancesPowOf2 >> 1; d > 0; d >>= 1) + for (d = NoofInstancesPowOf2 >> 1; d > 0; d >>= 1) { - GroupMemoryBarrierWithGroupSync(); + barrier(); if (tID < d) { - int ai = offset * (2 * tID + 1) - 1; - int bi = offset * (2 * tID + 2) - 1; + int ai = int(offset * (2 * tID + 1) - 1); + int bi = int(offset * (2 * tID + 2) - 1); temp[bi] += temp[ai]; } + offset *= 2; } // clear the last element if (tID == 0) + { temp[NoofInstancesPowOf2 - 1] = 0; + } - //perform downsweep and build scan + // perform downsweep and build scan for ( d = 1; d < NoofInstancesPowOf2; d *= 2) { offset >>= 1; - GroupMemoryBarrierWithGroupSync(); + barrier(); if (tID < d) { - int ai = offset * (2 * tID + 1) - 1; - int bi = offset * (2 * tID + 2) - 1; - int t = temp[ai]; + int ai = int(offset * (2 * tID + 1) - 1); + int bi = int(offset * (2 * tID + 2) - 1); + int t = int(temp[ai]); temp[ai] = temp[bi]; temp[bi] += t; } } - GroupMemoryBarrierWithGroupSync(); + barrier(); - int index = 2 * tID; + int index = int(2 * tID); - //scatter results - if (instancePredicates[index] != 0) - { - instanceDataOut[ 4 * temp[index] ] = instanceDataIn[ 4 * index ]; - instanceDataOut[ 4 * temp[index] + 1 ] = instanceDataIn[ 4 * index + 1 ]; - instanceDataOut[ 4 * temp[index] + 2 ] = instanceDataIn[ 4 * index + 2 ]; - instanceDataOut[ 4 * temp[index] + 3 ] = instanceDataIn[ 4 * index + 3 ]; - } - - index = 2 * tID + 1; - - if (instancePredicates[index] != 0) + // scatter results + if (instancePredicates[index]) { - instanceDataOut[ 4 * temp[index] ] = instanceDataIn[ 4 * index ]; - instanceDataOut[ 4 * temp[index] + 1 ] = instanceDataIn[ 4 * index + 1 ]; - instanceDataOut[ 4 * temp[index] + 2 ] = instanceDataIn[ 4 * index + 2 ]; - instanceDataOut[ 4 * temp[index] + 3 ] = instanceDataIn[ 4 * index + 3 ]; + instanceDataOut[4 * temp[index] ] = instanceDataIn[4 * index ]; + instanceDataOut[4 * temp[index] + 1] = instanceDataIn[4 * index + 1]; + instanceDataOut[4 * temp[index] + 2] = instanceDataIn[4 * index + 2]; + instanceDataOut[4 * temp[index] + 3] = instanceDataIn[4 * index + 3]; } - + + index = int(2 * tID + 1); + + if (instancePredicates[index]) + { + instanceDataOut[4 * temp[index] ] = instanceDataIn[4 * index ]; + instanceDataOut[4 * temp[index] + 1] = instanceDataIn[4 * index + 1]; + instanceDataOut[4 * temp[index] + 2] = instanceDataIn[4 * index + 2]; + instanceDataOut[4 * temp[index] + 3] = instanceDataIn[4 * index + 3]; + } + if (tID == 0) { uint startInstance = 0; - + //copy data to indirect buffer, could possible be done in a different compute shader for (int k = 0; k < NoofDrawcalls; k++) - { + { drawIndexedIndirect( - drawcallData, - k, + drawcallData, + k, drawcallConstData[ k * 3 ], //number of indices drawcallInstanceCount[k], //number of instances - drawcallConstData[ k * 3 + 1 ], //offset into the index buffer - drawcallConstData[ k * 3 + 2 ], //offset into the vertex buffer + drawcallConstData[ k * 3 + 1 ], //offset into the index buffer + drawcallConstData[ k * 3 + 2 ], //offset into the vertex buffer startInstance //offset into the instance buffer ); startInstance += drawcallInstanceCount[k]; - + drawcallInstanceCount[k] = 0; } } - -} \ No newline at end of file + +} diff --git a/examples/37-gpudrivenrendering/fs_instancedIndirectRendering.sc b/examples/37-gpudrivenrendering/fs_instancedIndirectRendering.sc index 08033ac2f..b638de1b0 100644 --- a/examples/37-gpudrivenrendering/fs_instancedIndirectRendering.sc +++ b/examples/37-gpudrivenrendering/fs_instancedIndirectRendering.sc @@ -11,14 +11,14 @@ uniform vec4 u_colour[50]; void main() { - vec4 colour = u_colour[v_materialID.x]; - + vec4 colour = u_colour[uint(v_materialID)]; + if ( colour.w < 1.0f ) { //render dithered alpha - if ( (gl_FragCoord.x % 2) == (gl_FragCoord.y % 2) ) + if ( (int(gl_FragCoord.x) % 2) == (int(gl_FragCoord.y) % 2) ) discard; } - + gl_FragColor = vec4( colour.xyz,1 ); } diff --git a/examples/37-gpudrivenrendering/gpudrivenrendering.cpp b/examples/37-gpudrivenrendering/gpudrivenrendering.cpp index ab746810f..ac0a8c978 100644 --- a/examples/37-gpudrivenrendering/gpudrivenrendering.cpp +++ b/examples/37-gpudrivenrendering/gpudrivenrendering.cpp @@ -418,6 +418,11 @@ public: // Enable debug text. bgfx::setDebug(m_debug); + //create uniforms + u_inputRTSize = bgfx::createUniform("u_inputRTSize", bgfx::UniformType::Vec4); + u_cullingConfig = bgfx::createUniform("u_cullingConfig", bgfx::UniformType::Vec4); + u_colour = bgfx::createUniform("u_colour", bgfx::UniformType::Vec4); + //create props { m_totalInstancesCount = 0; @@ -769,11 +774,6 @@ public: //create samplers s_texOcclusionDepthIn = bgfx::createUniform("s_texOcclusionDepthIn", bgfx::UniformType::Int1); - //create uniforms - u_inputRTSize = bgfx::createUniform("u_inputRTSize", bgfx::UniformType::Vec4); - u_cullingConfig = bgfx::createUniform("u_cullingConfig", bgfx::UniformType::Vec4); - u_colour = bgfx::createUniform("u_colour", bgfx::UniformType::Vec4); - m_timeOffset = bx::getHPCounter(); m_useIndirect = true; diff --git a/examples/37-gpudrivenrendering/varying.def.sc b/examples/37-gpudrivenrendering/varying.def.sc index b55648db6..f0615767d 100644 --- a/examples/37-gpudrivenrendering/varying.def.sc +++ b/examples/37-gpudrivenrendering/varying.def.sc @@ -1,7 +1,9 @@ -uint v_materialID : TEXCOORD0; - vec3 a_position : POSITION; +vec2 a_texcoord0 : TEXCOORD0; vec4 i_data0 : TEXCOORD7; vec4 i_data1 : TEXCOORD6; vec4 i_data2 : TEXCOORD5; vec4 i_data3 : TEXCOORD4; + +vec2 v_texcoord0 : TEXCOORD0; +float v_materialID : TEXCOORD0; diff --git a/examples/37-gpudrivenrendering/varying_pos_tex0.def.sc b/examples/37-gpudrivenrendering/varying_pos_tex0.def.sc deleted file mode 100644 index ece512702..000000000 --- a/examples/37-gpudrivenrendering/varying_pos_tex0.def.sc +++ /dev/null @@ -1,4 +0,0 @@ -vec2 v_texcoord0 : TEXCOORD0; - -vec3 a_position : POSITION; -vec2 a_texcoord0 : TEXCOORD0; diff --git a/scripts/genie.lua b/scripts/genie.lua index daae2f8ca..fd6f21f53 100644 --- a/scripts/genie.lua +++ b/scripts/genie.lua @@ -464,6 +464,7 @@ or _OPTIONS["with-combined-examples"] then , "34-mvs" , "35-dynamic" , "36-sky" + , "37-gpudrivenrendering" ) -- C99 source doesn't compile under WinRT settings diff --git a/src/bgfx_compute.sh b/src/bgfx_compute.sh index a997fc3e7..d1422e8ab 100644 --- a/src/bgfx_compute.sh +++ b/src/bgfx_compute.sh @@ -251,39 +251,16 @@ __IMAGE_IMPL_A(r32ui, x, uvec4, xxxx) __IMAGE_IMPL_A(rg32ui, xy, uvec4, xyyy) __IMAGE_IMPL_A(rgba32ui, xyzw, uvec4, xyzw) -#define __ATOMIC_IMPL_TYPE(_genType, _glFunc, _dxFunc) \ - _genType _glFunc(inout _genType _mem, _genType _data) \ - { \ - _genType result; \ - _dxFunc(_mem, _data, result); \ - return result; \ - } +#define atomicAdd(_mem, _data) InterlockedAdd(_mem, _data) +#define atomicAnd(_mem, _data) InterlockedAnd(_mem, _data) +#define atomicExchange(_mem, _data) InterlockedExchange(_mem, _data) +#define atomicMax(_mem, _data) InterlockedMax(_mem, _data) +#define atomicMin(_mem, _data) InterlockedMin(_mem, _data) +#define atomicOr(_mem, _data) InterlockedOr(_mem, _data) +#define atomicXor(_mem, _data) InterlockedXor(_mem, _data) -#define __ATOMIC_IMPL(_glFunc, _dxFunc) \ - __ATOMIC_IMPL_TYPE(int, _glFunc, _dxFunc) \ - __ATOMIC_IMPL_TYPE(uint, _glFunc, _dxFunc) - -__ATOMIC_IMPL(atomicAdd, InterlockedAdd); -__ATOMIC_IMPL(atomicAnd, InterlockedAnd); -__ATOMIC_IMPL(atomicExchange, InterlockedExchange); -__ATOMIC_IMPL(atomicMax, InterlockedMax); -__ATOMIC_IMPL(atomicMin, InterlockedMin); -__ATOMIC_IMPL(atomicOr, InterlockedOr); -__ATOMIC_IMPL(atomicXor, InterlockedXor); - -int atomicCompSwap(inout int _mem, int _compare, int _data) -{ - int result; - InterlockedCompareExchange(_mem, _compare, _data, result); - return result; -} - -uint atomicCompSwap(inout uint _mem, uint _compare, uint _data) -{ - uint result; - InterlockedCompareExchange(_mem, _compare, _data, result); - return result; -} +#define atomicCompSwap(_mem, _compare, _data) \ + InterlockedCompareExchange(_mem,_compare, _data) // InterlockedCompareStore