mirror of
https://github.com/bkaradzic/bgfx.git
synced 2026-02-20 22:03:12 +01:00
122 lines
3.2 KiB
Python
122 lines
3.2 KiB
Python
|
|
/*
|
|
* Copyright 2018 Kostas Anagnostou. All rights reserved.
|
|
* License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
|
|
*/
|
|
|
|
#include "bgfx_compute.sh"
|
|
|
|
// Constant per-drawcall data: number of indices plus the offsets into the
// index and vertex buffers (main() reads three uints per drawcall).
BUFFER_RO(drawcallConstData, uint, 0);

// Instance data for every instance, before culling.
BUFFER_RO(instanceDataIn, vec4, 1);

// Per-instance visibility flags (output of the culling pass).
BUFFER_RO(instancePredicates, bool, 2);

// Number of instances for each drawcall.
BUFFER_RW(drawcallInstanceCount, uint, 3);

// Drawcall arguments that will drive drawIndirect.
BUFFER_RW(drawcallData, uvec4, 4);

// Instance data that survived culling.
BUFFER_WR(instanceDataOut, vec4, 5);

// Culling configuration; main() reads .y (instance count, power of two)
// and .w (number of drawcalls).
uniform vec4 u_cullingConfig;

// Scratch storage for the prefix sum.
// Based on Parallel Prefix Sum (Scan) with CUDA by Mark Harris
groupshared uint temp[2048];
|
|
|
|
// Stream compaction of the instance data.
//
// 1. Every thread loads two visibility predicates into the shared `temp`
//    array as 0/1 values.
// 2. An up-sweep (reduction) followed by a down-sweep turns `temp` into an
//    exclusive prefix sum, so temp[i] is the output slot of instance i.
// 3. Every visible instance copies its four vec4s from instanceDataIn to
//    its compacted slot in instanceDataOut.
// 4. Thread 0 writes the indirect draw arguments for each drawcall and
//    resets the per-drawcall instance counters.
//
// u_cullingConfig.y is used as the element count of the scan and
// u_cullingConfig.w as the number of drawcalls.
//
// NOTE(review): tID comes from gl_GlobalInvocationID and indexes the
// 2048-entry shared array directly, so this presumably expects a single
// 1024-thread group to be dispatched - confirm against the caller.
//
// All float -> int, uint -> int and bool -> uint conversions are spelled out
// explicitly so the shader does not depend on implicit conversions.
NUM_THREADS(1024, 1, 1)
void main()
{
	int tID = int(gl_GlobalInvocationID.x);
	int NoofInstancesPowOf2 = int(u_cullingConfig.y);
	int NoofDrawcalls = int(u_cullingConfig.w);

	int offset = 1;

	// load input into shared memory
	temp[2 * tID    ] = uint(instancePredicates[2 * tID    ] ? 1 : 0);
	temp[2 * tID + 1] = uint(instancePredicates[2 * tID + 1] ? 1 : 0);

	int d;

	// perform reduction (up-sweep): build partial sums in place
	for (d = NoofInstancesPowOf2 >> 1; d > 0; d >>= 1)
	{
		// make the previous level's writes visible before reading them
		GroupMemoryBarrierWithGroupSync();

		if (tID < d)
		{
			int ai = offset * (2 * tID + 1) - 1;
			int bi = offset * (2 * tID + 2) - 1;
			temp[bi] += temp[ai];
		}

		offset *= 2;
	}

	// clear the last element; skipped when the count is zero so that
	// temp[-1] is never written
	if (tID == 0 && NoofInstancesPowOf2 > 0)
	{
		temp[NoofInstancesPowOf2 - 1] = uint(0);
	}

	// perform downsweep and build scan
	for (d = 1; d < NoofInstancesPowOf2; d *= 2)
	{
		offset >>= 1;

		GroupMemoryBarrierWithGroupSync();

		if (tID < d)
		{
			int ai = offset * (2 * tID + 1) - 1;
			int bi = offset * (2 * tID + 2) - 1;

			// swap-and-accumulate step of the down-sweep
			uint t = temp[ai];
			temp[ai] = temp[bi];
			temp[bi] += t;
		}
	}

	// the scan must be complete before any thread reads its output slot
	GroupMemoryBarrierWithGroupSync();

	// scatter results: each thread handles elements 2*tID and 2*tID + 1
	for (int element = 0; element < 2; ++element)
	{
		int index = 2 * tID + element;

		if (instancePredicates[index])
		{
			// four vec4s per instance on both the input and the output side
			int src = 4 * index;
			int dst = 4 * int(temp[index]);

			instanceDataOut[dst    ] = instanceDataIn[src    ];
			instanceDataOut[dst + 1] = instanceDataIn[src + 1];
			instanceDataOut[dst + 2] = instanceDataIn[src + 2];
			instanceDataOut[dst + 3] = instanceDataIn[src + 3];
		}
	}

	if (tID == 0)
	{
		uint startInstance = 0;

		// copy data to indirect buffer, could possibly be done in a different compute shader
		for (int k = 0; k < NoofDrawcalls; k++)
		{
			// read the counter once; it is reset at the end of the iteration
			uint instanceCount = drawcallInstanceCount[k];

			drawIndexedIndirect(
				drawcallData,
				k,
				drawcallConstData[ k * 3 ],     //number of indices
				instanceCount,                  //number of instances
				drawcallConstData[ k * 3 + 1 ], //offset into the index buffer
				drawcallConstData[ k * 3 + 2 ], //offset into the vertex buffer
				startInstance                   //offset into the instance buffer
				);

			startInstance += instanceCount;

			drawcallInstanceCount[k] = 0;
		}
	}
}