From 7f8d42259923986c39515c28a4a04e91c16a6d7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=91=D1=80=D0=B0=D0=BD=D0=B8=D0=BC=D0=B8=D1=80=20=D0=9A?= =?UTF-8?q?=D0=B0=D1=80=D0=B0=D1=9F=D0=B8=D1=9B?= Date: Sat, 3 May 2025 10:29:20 -0700 Subject: [PATCH] Updated meshoptimizer. --- 3rdparty/meshoptimizer/src/clusterizer.cpp | 467 +++++++++++++++++- .../{vcacheanalyzer.cpp => indexanalyzer.cpp} | 53 ++ 3rdparty/meshoptimizer/src/indexcodec.cpp | 28 +- 3rdparty/meshoptimizer/src/indexgenerator.cpp | 240 ++++----- 3rdparty/meshoptimizer/src/meshoptimizer.h | 124 ++++- .../{overdrawanalyzer.cpp => rasterizer.cpp} | 166 +++++-- 3rdparty/meshoptimizer/src/simplifier.cpp | 60 ++- 3rdparty/meshoptimizer/src/spatialorder.cpp | 68 +-- 3rdparty/meshoptimizer/src/vertexcodec.cpp | 9 +- 3rdparty/meshoptimizer/src/vertexfilter.cpp | 21 +- 3rdparty/meshoptimizer/src/vfetchanalyzer.cpp | 58 --- 11 files changed, 975 insertions(+), 319 deletions(-) rename 3rdparty/meshoptimizer/src/{vcacheanalyzer.cpp => indexanalyzer.cpp} (58%) rename 3rdparty/meshoptimizer/src/{overdrawanalyzer.cpp => rasterizer.cpp} (62%) delete mode 100644 3rdparty/meshoptimizer/src/vfetchanalyzer.cpp diff --git a/3rdparty/meshoptimizer/src/clusterizer.cpp b/3rdparty/meshoptimizer/src/clusterizer.cpp index 26d2fb11c..f3bbbd790 100644 --- a/3rdparty/meshoptimizer/src/clusterizer.cpp +++ b/3rdparty/meshoptimizer/src/clusterizer.cpp @@ -10,6 +10,7 @@ // Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016 // Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016 // Jack Ritter. An Efficient Bounding Sphere. 1990 +// Thomas Larsson. Fast and Tight Fitting Bounding Spheres. 2008 namespace meshopt { @@ -23,6 +24,9 @@ const size_t kMeshletMaxTriangles = 512; const size_t kMeshletMaxSeeds = 256; const size_t kMeshletAddSeeds = 4; +// To avoid excessive recursion for malformed inputs, we limit the maximum depth of the tree +const int kMeshletMaxTreeDepth = 50; + struct TriangleAdjacency2 { unsigned int* counts; @@ -144,37 +148,62 @@ static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const un } } -static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride) +static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride, size_t axis_count) { + static const float kAxes[7][3] = { + // X, Y, Z + {1, 0, 0}, + {0, 1, 0}, + {0, 0, 1}, + + // XYZ, -XYZ, X-YZ, XY-Z; normalized to unit length + {0.57735026f, 0.57735026f, 0.57735026f}, + {-0.57735026f, 0.57735026f, 0.57735026f}, + {0.57735026f, -0.57735026f, 0.57735026f}, + {0.57735026f, 0.57735026f, -0.57735026f}, + }; + assert(count > 0); + assert(axis_count <= sizeof(kAxes) / sizeof(kAxes[0])); size_t points_stride_float = points_stride / sizeof(float); size_t radii_stride_float = radii_stride / sizeof(float); - // find extremum points along all 3 axes; for each axis we get a pair of points with min/max coordinates - size_t pmin[3] = {0, 0, 0}; - size_t pmax[3] = {0, 0, 0}; + // find extremum points along all axes; for each axis we get a pair of points with min/max coordinates + size_t pmin[7], pmax[7]; + float tmin[7], tmax[7]; + + for (size_t axis = 0; axis < axis_count; ++axis) + { + pmin[axis] = pmax[axis] = 0; + tmin[axis] = FLT_MAX; + tmax[axis] = -FLT_MAX; + } for (size_t i = 0; i < count; ++i) { const float* p = points + i * points_stride_float; float r = radii[i * radii_stride_float]; - for (int axis = 0; axis < 3; ++axis) + for (size_t axis = 0; axis < axis_count; ++axis) { - float bmin = points[pmin[axis] * points_stride_float + axis] - radii[pmin[axis] * radii_stride_float]; - float bmax = points[pmax[axis] * points_stride_float + axis] + radii[pmax[axis] * radii_stride_float]; + const float* ax = kAxes[axis]; - pmin[axis] = (p[axis] - r < bmin) ? i : pmin[axis]; - pmax[axis] = (p[axis] + r > bmax) ? i : pmax[axis]; + float tp = ax[0] * p[0] + ax[1] * p[1] + ax[2] * p[2]; + float tpmin = tp - r, tpmax = tp + r; + + pmin[axis] = (tpmin < tmin[axis]) ? i : pmin[axis]; + pmax[axis] = (tpmax > tmax[axis]) ? i : pmax[axis]; + tmin[axis] = (tpmin < tmin[axis]) ? tpmin : tmin[axis]; + tmax[axis] = (tpmax > tmax[axis]) ? tpmax : tmax[axis]; } } // find the pair of points with largest distance - int paxis = 0; + size_t paxis = 0; float paxisdr = 0; - for (int axis = 0; axis < 3; ++axis) + for (size_t axis = 0; axis < axis_count; ++axis) { const float* p1 = points + pmin[axis] * points_stride_float; const float* p2 = points + pmax[axis] * points_stride_float; @@ -698,6 +727,314 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, } } +struct BVHBox +{ + float min[3]; + float max[3]; +}; + +static void boxMerge(BVHBox& box, const BVHBox& other) +{ + for (int k = 0; k < 3; ++k) + { + box.min[k] = other.min[k] < box.min[k] ? other.min[k] : box.min[k]; + box.max[k] = other.max[k] > box.max[k] ? other.max[k] : box.max[k]; + } +} + +inline float boxSurface(const BVHBox& box) +{ + float sx = box.max[0] - box.min[0], sy = box.max[1] - box.min[1], sz = box.max[2] - box.min[2]; + return sx * sy + sx * sz + sy * sz; +} + +inline unsigned int radixFloat(unsigned int v) +{ + // if sign bit is 0, flip sign bit + // if sign bit is 1, flip everything + unsigned int mask = (int(v) >> 31) | 0x80000000; + return v ^ mask; +} + +static void computeHistogram(unsigned int (&hist)[1024][3], const float* data, size_t count) +{ + memset(hist, 0, sizeof(hist)); + + const unsigned int* bits = reinterpret_cast(data); + + // compute 3 10-bit histograms in parallel (dropping 2 LSB) + for (size_t i = 0; i < count; ++i) + { + unsigned int id = radixFloat(bits[i]); + + hist[(id >> 2) & 1023][0]++; + hist[(id >> 12) & 1023][1]++; + hist[(id >> 22) & 1023][2]++; + } + + unsigned int sum0 = 0, sum1 = 0, sum2 = 0; + + // replace histogram data with prefix histogram sums in-place + for (int i = 0; i < 1024; ++i) + { + unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2]; + + hist[i][0] = sum0; + hist[i][1] = sum1; + hist[i][2] = sum2; + + sum0 += hx; + sum1 += hy; + sum2 += hz; + } + + assert(sum0 == count && sum1 == count && sum2 == count); +} + +static void radixPass(unsigned int* destination, const unsigned int* source, const float* keys, size_t count, unsigned int (&hist)[1024][3], int pass) +{ + const unsigned int* bits = reinterpret_cast(keys); + int bitoff = pass * 10 + 2; // drop 2 LSB to be able to use 3 10-bit passes + + for (size_t i = 0; i < count; ++i) + { + unsigned int id = (radixFloat(bits[source[i]]) >> bitoff) & 1023; + + destination[hist[id][pass]++] = source[i]; + } +} + +static void bvhPrepare(BVHBox* boxes, float* centroids, const unsigned int* indices, size_t face_count, const float* vertex_positions, size_t vertex_count, size_t vertex_stride_float) +{ + (void)vertex_count; + + for (size_t i = 0; i < face_count; ++i) + { + unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2]; + assert(a < vertex_count && b < vertex_count && c < vertex_count); + + const float* va = vertex_positions + vertex_stride_float * a; + const float* vb = vertex_positions + vertex_stride_float * b; + const float* vc = vertex_positions + vertex_stride_float * c; + + BVHBox& box = boxes[i]; + + for (int k = 0; k < 3; ++k) + { + box.min[k] = va[k] < vb[k] ? va[k] : vb[k]; + box.min[k] = vc[k] < box.min[k] ? vc[k] : box.min[k]; + + box.max[k] = va[k] > vb[k] ? va[k] : vb[k]; + box.max[k] = vc[k] > box.max[k] ? vc[k] : box.max[k]; + + centroids[i + face_count * k] = (box.min[k] + box.max[k]) / 2.f; + } + } +} + +static bool bvhPackLeaf(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices) +{ + // count number of unique vertices + size_t used_vertices = 0; + for (size_t i = 0; i < count; ++i) + { + unsigned int index = order[i]; + unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2]; + + used_vertices += (used[a] < 0) + (used[b] < 0) + (used[c] < 0); + used[a] = used[b] = used[c] = 1; + } + + // reset used[] for future invocations + for (size_t i = 0; i < count; ++i) + { + unsigned int index = order[i]; + unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2]; + + used[a] = used[b] = used[c] = -1; + } + + if (used_vertices > max_vertices) + return false; + + // mark meshlet boundary for future reassembly + assert(count > 0); + + boundary[0] = 1; + memset(boundary + 1, 0, count - 1); + + return true; +} + +static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices, size_t max_triangles) +{ + for (size_t i = 0; i < count;) + { + size_t chunk = i + max_triangles <= count ? max_triangles : count - i; + + if (bvhPackLeaf(boundary + i, order + i, chunk, used, indices, max_vertices)) + { + i += chunk; + continue; + } + + // chunk is vertex bound, split it into smaller meshlets + assert(chunk > max_vertices / 3); + + bvhPackLeaf(boundary + i, order + i, max_vertices / 3, used, indices, max_vertices); + i += max_vertices / 3; + } +} + +static bool bvhDivisible(size_t count, size_t min, size_t max) +{ + // count is representable as a sum of values in [min..max] if if it in range of [k*min..k*min+k*(max-min)] + // equivalent to ceil(count / max) <= floor(count / min), but the form below allows using idiv + // we avoid expensive integer divisions in the common case where min is <= max/2 + return min * 2 <= max ? count >= min : count % min <= (count / min) * (max - min); +} + +static size_t bvhPivot(const BVHBox* boxes, const unsigned int* order, size_t count, void* scratch, size_t step, size_t min, size_t max, float fill, float* out_cost) +{ + BVHBox accuml = boxes[order[0]], accumr = boxes[order[count - 1]]; + float* costs = static_cast(scratch); + + // accumulate SAH cost in forward and backward directions + for (size_t i = 0; i < count; ++i) + { + boxMerge(accuml, boxes[order[i]]); + boxMerge(accumr, boxes[order[count - 1 - i]]); + + costs[i] = boxSurface(accuml); + costs[i + count] = boxSurface(accumr); + } + + bool aligned = count >= min * 2 && bvhDivisible(count, min, max); + size_t end = aligned ? count - min : count - 1; + + float rmaxf = 1.f / float(int(max)); + + // find best split that minimizes SAH + size_t bestsplit = 0; + float bestcost = FLT_MAX; + + for (size_t i = min - 1; i < end; i += step) + { + size_t lsplit = i + 1, rsplit = count - (i + 1); + + if (!bvhDivisible(lsplit, min, max)) + continue; + if (aligned && !bvhDivisible(rsplit, min, max)) + continue; + + // costs[x] = inclusive surface area of boxes[0..x] + // costs[count-1-x] = inclusive surface area of boxes[x..count-1] + float larea = costs[i], rarea = costs[(count - 1 - (i + 1)) + count]; + float cost = larea * float(int(lsplit)) + rarea * float(int(rsplit)); + + if (cost > bestcost) + continue; + + // fill cost; use floating point math to avoid expensive integer modulo + int lrest = int(float(int(lsplit + max - 1)) * rmaxf) * int(max) - int(lsplit); + int rrest = int(float(int(rsplit + max - 1)) * rmaxf) * int(max) - int(rsplit); + + cost += fill * (float(lrest) * larea + float(rrest) * rarea); + + if (cost < bestcost) + { + bestcost = cost; + bestsplit = i + 1; + } + } + + *out_cost = bestcost; + return bestsplit; +} + +static void bvhPartition(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count) +{ + size_t l = 0, r = split; + + for (size_t i = 0; i < count; ++i) + { + unsigned char side = sides[order[i]]; + target[side ? r : l] = order[i]; + l += 1; + l -= side; + r += side; + } + + assert(l == split && r == count); +} + +static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, unsigned char* boundary, size_t count, int depth, void* scratch, short* used, const unsigned int* indices, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight) +{ + if (depth >= kMeshletMaxTreeDepth) + return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles); + + if (count <= max_triangles && bvhPackLeaf(boundary, orderx, count, used, indices, max_vertices)) + return; + + unsigned int* axes[3] = {orderx, ordery, orderz}; + + // we can use step=1 unconditionally but to reduce the cost for min=max case we use step=max + size_t step = min_triangles == max_triangles && count > max_triangles ? max_triangles : 1; + + // if we could not pack the meshlet, we must be vertex bound + size_t mint = count <= max_triangles && max_vertices / 3 < min_triangles ? max_vertices / 3 : min_triangles; + + // only use fill weight if we are optimizing for triangle count + float fill = count <= max_triangles ? 0.f : fill_weight; + + // find best split that minimizes SAH + int bestk = -1; + size_t bestsplit = 0; + float bestcost = FLT_MAX; + + for (int k = 0; k < 3; ++k) + { + float axiscost = FLT_MAX; + size_t axissplit = bvhPivot(boxes, axes[k], count, scratch, step, mint, max_triangles, fill, &axiscost); + + if (axissplit && axiscost < bestcost) + { + bestk = k; + bestcost = axiscost; + bestsplit = axissplit; + } + } + + // this may happen if SAH costs along the admissible splits are NaN + if (bestk < 0) + return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles); + + // mark sides of split for partitioning + unsigned char* sides = static_cast(scratch) + count * sizeof(unsigned int); + + for (size_t i = 0; i < bestsplit; ++i) + sides[axes[bestk][i]] = 0; + + for (size_t i = bestsplit; i < count; ++i) + sides[axes[bestk][i]] = 1; + + // partition all axes into two sides, maintaining order + unsigned int* temp = static_cast(scratch); + + for (int k = 0; k < 3; ++k) + { + if (k == bestk) + continue; + + unsigned int* axis = axes[k]; + memcpy(temp, axis, sizeof(unsigned int) * count); + bvhPartition(axis, temp, sides, bestsplit, count); + } + + bvhSplit(boxes, orderx, ordery, orderz, boundary, bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight); + bvhSplit(boxes, orderx + bestsplit, ordery + bestsplit, orderz + bestsplit, boundary + bestsplit, count - bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight); +} + } // namespace meshopt size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles) @@ -962,6 +1299,108 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle return meshlet_offset; } +size_t meshopt_buildMeshletsSplit(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + + assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); + assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles); + assert(min_triangles % 4 == 0 && max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned + + if (index_count == 0) + return 0; + + size_t face_count = index_count / 3; + size_t vertex_stride_float = vertex_positions_stride / sizeof(float); + + meshopt_Allocator allocator; + + // 3 floats plus 1 uint for sorting, or + // 2 floats for SAH costs, or + // 1 uint plus 1 byte for partitioning + float* scratch = allocator.allocate(face_count * 4); + + // compute bounding boxes and centroids for sorting + BVHBox* boxes = allocator.allocate(face_count); + bvhPrepare(boxes, scratch, indices, face_count, vertex_positions, vertex_count, vertex_stride_float); + + unsigned int* axes = allocator.allocate(face_count * 3); + unsigned int* temp = reinterpret_cast(scratch) + face_count * 3; + + for (int k = 0; k < 3; ++k) + { + unsigned int* order = axes + k * face_count; + const float* keys = scratch + k * face_count; + + unsigned int hist[1024][3]; + computeHistogram(hist, keys, face_count); + + // 3-pass radix sort computes the resulting order into axes + for (size_t i = 0; i < face_count; ++i) + temp[i] = unsigned(i); + + radixPass(order, temp, keys, face_count, hist, 0); + radixPass(temp, order, keys, face_count, hist, 1); + radixPass(order, temp, keys, face_count, hist, 2); + } + + // index of the vertex in the meshlet, -1 if the vertex isn't used + short* used = allocator.allocate(vertex_count); + memset(used, -1, vertex_count * sizeof(short)); + + unsigned char* boundary = allocator.allocate(face_count); + + bvhSplit(boxes, &axes[0], &axes[face_count], &axes[face_count * 2], boundary, face_count, 0, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight); + + // compute the desired number of meshlets; note that on some meshes with a lot of vertex bound clusters this might go over the bound + size_t meshlet_count = 0; + for (size_t i = 0; i < face_count; ++i) + { + assert(boundary[i] <= 1); + meshlet_count += boundary[i]; + } + + size_t meshlet_bound = meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles); + + // pack triangles into meshlets according to the order and boundaries marked by bvhSplit + meshopt_Meshlet meshlet = {}; + size_t meshlet_offset = 0; + size_t meshlet_pending = meshlet_count; + + for (size_t i = 0; i < face_count; ++i) + { + assert(boundary[i] <= 1); + bool split = i > 0 && boundary[i] == 1; + + // while we are over the limit, we ignore boundary[] data and disable splits until we free up enough space + if (split && meshlet_count > meshlet_bound && meshlet_offset + meshlet_pending >= meshlet_bound) + split = false; + + unsigned int index = axes[i]; + assert(index < face_count); + + unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2]; + + // appends triangle to the meshlet and writes previous meshlet to the output if full + meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split); + meshlet_pending -= boundary[i]; + } + + if (meshlet.triangle_count) + { + finishMeshlet(meshlet, meshlet_triangles); + + meshlets[meshlet_offset++] = meshlet; + } + + assert(meshlet_offset <= meshlet_bound); + return meshlet_offset; +} + meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) { using namespace meshopt; @@ -1022,13 +1461,13 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t // compute cluster bounding sphere; we'll use the center to determine normal cone apex as well float psphere[4] = {}; - computeBoundingSphere(psphere, corners[0][0], triangles * 3, sizeof(float) * 3, &rzero, 0); + computeBoundingSphere(psphere, corners[0][0], triangles * 3, sizeof(float) * 3, &rzero, 0, 7); float center[3] = {psphere[0], psphere[1], psphere[2]}; // treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis float nsphere[4] = {}; - computeBoundingSphere(nsphere, normals[0], triangles, sizeof(float) * 3, &rzero, 0); + computeBoundingSphere(nsphere, normals[0], triangles, sizeof(float) * 3, &rzero, 0, 3); float axis[3] = {nsphere[0], nsphere[1], nsphere[2]}; float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]); @@ -1155,7 +1594,7 @@ meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, const float rzero = 0.f; float psphere[4] = {}; - computeBoundingSphere(psphere, positions, count, positions_stride, radii ? radii : &rzero, radii ? radii_stride : 0); + computeBoundingSphere(psphere, positions, count, positions_stride, radii ? radii : &rzero, radii ? radii_stride : 0, 7); bounds.center[0] = psphere[0]; bounds.center[1] = psphere[1]; diff --git a/3rdparty/meshoptimizer/src/vcacheanalyzer.cpp b/3rdparty/meshoptimizer/src/indexanalyzer.cpp similarity index 58% rename from 3rdparty/meshoptimizer/src/vcacheanalyzer.cpp rename to 3rdparty/meshoptimizer/src/indexanalyzer.cpp index 368274382..87ceeae66 100644 --- a/3rdparty/meshoptimizer/src/vcacheanalyzer.cpp +++ b/3rdparty/meshoptimizer/src/indexanalyzer.cpp @@ -71,3 +71,56 @@ meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* ind return result; } + +meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size) +{ + assert(index_count % 3 == 0); + assert(vertex_size > 0 && vertex_size <= 256); + + meshopt_Allocator allocator; + + meshopt_VertexFetchStatistics result = {}; + + unsigned char* vertex_visited = allocator.allocate(vertex_count); + memset(vertex_visited, 0, vertex_count); + + const size_t kCacheLine = 64; + const size_t kCacheSize = 128 * 1024; + + // simple direct mapped cache; on typical mesh data this is close to 4-way cache, and this model is a gross approximation anyway + size_t cache[kCacheSize / kCacheLine] = {}; + + for (size_t i = 0; i < index_count; ++i) + { + unsigned int index = indices[i]; + assert(index < vertex_count); + + vertex_visited[index] = 1; + + size_t start_address = index * vertex_size; + size_t end_address = start_address + vertex_size; + + size_t start_tag = start_address / kCacheLine; + size_t end_tag = (end_address + kCacheLine - 1) / kCacheLine; + + assert(start_tag < end_tag); + + for (size_t tag = start_tag; tag < end_tag; ++tag) + { + size_t line = tag % (sizeof(cache) / sizeof(cache[0])); + + // we store +1 since cache is filled with 0 by default + result.bytes_fetched += (cache[line] != tag + 1) * kCacheLine; + cache[line] = tag + 1; + } + } + + size_t unique_vertex_count = 0; + + for (size_t i = 0; i < vertex_count; ++i) + unique_vertex_count += vertex_visited[i]; + + result.overfetch = unique_vertex_count == 0 ? 0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size); + + return result; +} diff --git a/3rdparty/meshoptimizer/src/indexcodec.cpp b/3rdparty/meshoptimizer/src/indexcodec.cpp index b4fdfe16d..7a8fd6867 100644 --- a/3rdparty/meshoptimizer/src/indexcodec.cpp +++ b/3rdparty/meshoptimizer/src/indexcodec.cpp @@ -210,6 +210,7 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons if (fer >= 0 && (fer >> 2) < 15) { + // note: getEdgeFifo implicitly rotates triangles by matching a/b to existing edge const unsigned int* order = kTriangleIndexOrder[fer & 3]; unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]]; @@ -267,6 +268,7 @@ size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, cons int fc = getVertexFifo(vertexfifo, c, vertexfifooffset); // after rotation, a is almost always equal to next, so we don't waste bits on FIFO encoding for a + // note: decoder implicitly assumes that if feb=fec=0, then fea=0 (reset code); this is enforced by rotation int fea = (a == next) ? (next++, 0) : 15; int feb = (fb >= 0 && fb < 14) ? fb + 1 : (b == next ? (next++, 0) : 15); int fec = (fc >= 0 && fc < 14) ? fc + 1 : (c == next ? (next++, 0) : 15); @@ -433,6 +435,7 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde // fifo reads are wrapped around 16 entry buffer unsigned int a = edgefifo[(edgefifooffset - 1 - fe) & 15][0]; unsigned int b = edgefifo[(edgefifooffset - 1 - fe) & 15][1]; + unsigned int c = 0; int fec = codetri & 15; @@ -442,37 +445,30 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde { // fifo reads are wrapped around 16 entry buffer unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15]; - unsigned int c = (fec == 0) ? next : cf; + c = (fec == 0) ? next : cf; int fec0 = fec == 0; next += fec0; - // output triangle - writeTriangle(destination, i, index_size, a, b, c); - - // push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly + // push vertex fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0); - - pushEdgeFifo(edgefifo, c, b, edgefifooffset); - pushEdgeFifo(edgefifo, a, c, edgefifooffset); } else { - unsigned int c = 0; - // fec - (fec ^ 3) decodes 13, 14 into -1, 1 // note that we need to update the last index since free indices are delta-encoded last = c = (fec != 15) ? last + (fec - (fec ^ 3)) : decodeIndex(data, last); - // output triangle - writeTriangle(destination, i, index_size, a, b, c); - // push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly pushVertexFifo(vertexfifo, c, vertexfifooffset); - - pushEdgeFifo(edgefifo, c, b, edgefifooffset); - pushEdgeFifo(edgefifo, a, c, edgefifooffset); } + + // push edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly + pushEdgeFifo(edgefifo, c, b, edgefifooffset); + pushEdgeFifo(edgefifo, a, c, edgefifooffset); + + // output triangle + writeTriangle(destination, i, index_size, a, b, c); } else { diff --git a/3rdparty/meshoptimizer/src/indexgenerator.cpp b/3rdparty/meshoptimizer/src/indexgenerator.cpp index 0d53020e3..5f36d2463 100644 --- a/3rdparty/meshoptimizer/src/indexgenerator.cpp +++ b/3rdparty/meshoptimizer/src/indexgenerator.cpp @@ -5,6 +5,7 @@ #include // This work is based on: +// Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003 // John McDonald, Mark Kilgard. Crack-Free Point-Normal Triangles using Adjacent Edge Normals. 2010 // John Hable. Variable Rate Shading with Visibility Buffer Rendering. 2024 namespace meshopt @@ -86,6 +87,46 @@ struct VertexStreamHasher } }; +struct VertexCustomHasher +{ + const float* vertex_positions; + size_t vertex_stride_float; + + int (*callback)(void*, unsigned int, unsigned int); + void* context; + + size_t hash(unsigned int index) const + { + const unsigned int* key = reinterpret_cast(vertex_positions + index * vertex_stride_float); + + unsigned int x = key[0], y = key[1], z = key[2]; + + // replace negative zero with zero + x = (x == 0x80000000) ? 0 : x; + y = (y == 0x80000000) ? 0 : y; + z = (z == 0x80000000) ? 0 : z; + + // scramble bits to make sure that integer coordinates have entropy in lower bits + x ^= x >> 17; + y ^= y >> 17; + z ^= z >> 17; + + // Optimized Spatial Hashing for Collision Detection of Deformable Objects + return (x * 73856093) ^ (y * 19349663) ^ (z * 83492791); + } + + bool equal(unsigned int lhs, unsigned int rhs) const + { + const float* lp = vertex_positions + lhs * vertex_stride_float; + const float* rp = vertex_positions + rhs * vertex_stride_float; + + if (lp[0] != rp[0] || lp[1] != rp[1] || lp[2] != rp[2]) + return false; + + return callback ? callback(context, lhs, rhs) : true; + } +}; + struct EdgeHasher { const unsigned int* remap; @@ -183,6 +224,43 @@ static void buildPositionRemap(unsigned int* remap, const float* vertex_position allocator.deallocate(vertex_table); } +template +static size_t generateVertexRemap(unsigned int* remap, const unsigned int* indices, size_t index_count, size_t vertex_count, const Hash& hash, meshopt_Allocator& allocator) +{ + memset(remap, -1, vertex_count * sizeof(unsigned int)); + + size_t table_size = hashBuckets(vertex_count); + unsigned int* table = allocator.allocate(table_size); + memset(table, -1, table_size * sizeof(unsigned int)); + + unsigned int next_vertex = 0; + + for (size_t i = 0; i < index_count; ++i) + { + unsigned int index = indices ? indices[i] : unsigned(i); + assert(index < vertex_count); + + if (remap[index] != ~0u) + continue; + + unsigned int* entry = hashLookup(table, table_size, hash, index, ~0u); + + if (*entry == ~0u) + { + *entry = index; + remap[index] = next_vertex++; + } + else + { + assert(remap[*entry] != ~0u); + remap[index] = remap[*entry]; + } + } + + assert(next_vertex <= vertex_count); + return next_vertex; +} + template static void remapVertices(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap) { @@ -197,6 +275,35 @@ static void remapVertices(void* destination, const void* vertices, size_t vertex } } +template +static void generateShadowBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const Hash& hash, meshopt_Allocator& allocator) +{ + unsigned int* remap = allocator.allocate(vertex_count); + memset(remap, -1, vertex_count * sizeof(unsigned int)); + + size_t table_size = hashBuckets(vertex_count); + unsigned int* table = allocator.allocate(table_size); + memset(table, -1, table_size * sizeof(unsigned int)); + + for (size_t i = 0; i < index_count; ++i) + { + unsigned int index = indices[i]; + assert(index < vertex_count); + + if (remap[index] == ~0u) + { + unsigned int* entry = hashLookup(table, table_size, hash, index, ~0u); + + if (*entry == ~0u) + *entry = index; + + remap[index] = *entry; + } + + destination[i] = remap[index]; + } +} + } // namespace meshopt size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size) @@ -208,44 +315,9 @@ size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int assert(vertex_size > 0 && vertex_size <= 256); meshopt_Allocator allocator; - - memset(destination, -1, vertex_count * sizeof(unsigned int)); - VertexHasher hasher = {static_cast(vertices), vertex_size, vertex_size}; - size_t table_size = hashBuckets(vertex_count); - unsigned int* table = allocator.allocate(table_size); - memset(table, -1, table_size * sizeof(unsigned int)); - - unsigned int next_vertex = 0; - - for (size_t i = 0; i < index_count; ++i) - { - unsigned int index = indices ? indices[i] : unsigned(i); - assert(index < vertex_count); - - if (destination[index] == ~0u) - { - unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u); - - if (*entry == ~0u) - { - *entry = index; - - destination[index] = next_vertex++; - } - else - { - assert(destination[*entry] != ~0u); - - destination[index] = destination[*entry]; - } - } - } - - assert(next_vertex <= vertex_count); - - return next_vertex; + return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator); } size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count) @@ -263,44 +335,24 @@ size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigne } meshopt_Allocator allocator; - - memset(destination, -1, vertex_count * sizeof(unsigned int)); - VertexStreamHasher hasher = {streams, stream_count}; - size_t table_size = hashBuckets(vertex_count); - unsigned int* table = allocator.allocate(table_size); - memset(table, -1, table_size * sizeof(unsigned int)); + return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator); +} - unsigned int next_vertex = 0; +size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context) +{ + using namespace meshopt; - for (size_t i = 0; i < index_count; ++i) - { - unsigned int index = indices ? indices[i] : unsigned(i); - assert(index < vertex_count); + assert(indices || index_count == vertex_count); + assert(!indices || index_count % 3 == 0); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); - if (destination[index] == ~0u) - { - unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u); + meshopt_Allocator allocator; + VertexCustomHasher hasher = {vertex_positions, vertex_positions_stride / sizeof(float), callback, context}; - if (*entry == ~0u) - { - *entry = index; - - destination[index] = next_vertex++; - } - else - { - assert(destination[*entry] != ~0u); - - destination[index] = destination[*entry]; - } - } - } - - assert(next_vertex <= vertex_count); - - return next_vertex; + return generateVertexRemap(destination, indices, index_count, vertex_count, hasher, allocator); } void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap) @@ -362,33 +414,9 @@ void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned assert(vertex_size <= vertex_stride); meshopt_Allocator allocator; - - unsigned int* remap = allocator.allocate(vertex_count); - memset(remap, -1, vertex_count * sizeof(unsigned int)); - VertexHasher hasher = {static_cast(vertices), vertex_size, vertex_stride}; - size_t table_size = hashBuckets(vertex_count); - unsigned int* table = allocator.allocate(table_size); - memset(table, -1, table_size * sizeof(unsigned int)); - - for (size_t i = 0; i < index_count; ++i) - { - unsigned int index = indices[i]; - assert(index < vertex_count); - - if (remap[index] == ~0u) - { - unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u); - - if (*entry == ~0u) - *entry = index; - - remap[index] = *entry; - } - - destination[i] = remap[index]; - } + generateShadowBuffer(destination, indices, index_count, vertex_count, hasher, allocator); } void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count) @@ -406,33 +434,9 @@ void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const uns } meshopt_Allocator allocator; - - unsigned int* remap = allocator.allocate(vertex_count); - memset(remap, -1, vertex_count * sizeof(unsigned int)); - VertexStreamHasher hasher = {streams, stream_count}; - size_t table_size = hashBuckets(vertex_count); - unsigned int* table = allocator.allocate(table_size); - memset(table, -1, table_size * sizeof(unsigned int)); - - for (size_t i = 0; i < index_count; ++i) - { - unsigned int index = indices[i]; - assert(index < vertex_count); - - if (remap[index] == ~0u) - { - unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u); - - if (*entry == ~0u) - *entry = index; - - remap[index] = *entry; - } - - destination[i] = remap[index]; - } + generateShadowBuffer(destination, indices, index_count, vertex_count, hasher, allocator); } void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) diff --git a/3rdparty/meshoptimizer/src/meshoptimizer.h b/3rdparty/meshoptimizer/src/meshoptimizer.h index 295324c78..1808604ce 100644 --- a/3rdparty/meshoptimizer/src/meshoptimizer.h +++ b/3rdparty/meshoptimizer/src/meshoptimizer.h @@ -74,6 +74,19 @@ MESHOPTIMIZER_API size_t meshopt_generateVertexRemap(unsigned int* destination, */ MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count); +/** + * Experimental: Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices + * As a result, all vertices that are equivalent map to the same (new) location, with no gaps in the resulting sequence. + * Equivalence is checked in two steps: vertex positions are compared for equality, and then the user-specified equality function is called (if provided). + * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer. + * + * destination must contain enough space for the resulting remap table (vertex_count elements) + * indices can be NULL if the input is unindexed + * vertex_positions should have float3 position in the first 12 bytes of each vertex + * callback can be NULL if no additional equality check is needed; otherwise, it should return 1 if vertices with specified indices are equivalent and 0 if they are not + */ +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context); + /** * Generates vertex buffer from the source vertex buffer and remap table generated by meshopt_generateVertexRemap * @@ -291,8 +304,9 @@ MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, si * The default compression level implied by meshopt_encodeVertexBuffer is 2. * * level should be in the range [0, 3] with 0 being the fastest and 3 being the slowest and producing the best compression ratio. + * version should be -1 to use the default version (specified via meshopt_encodeVertexVersion), or 0/1 to override the version; per above, level won't take effect if version is 0. */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level); +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version); /** * Set vertex encoder format version @@ -425,6 +439,19 @@ MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destinatio */ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error); +/** + * Experimental: Mesh simplifier (pruner) + * Reduces the number of triangles in the mesh by removing small isolated parts of the mesh + * Returns the number of indices after simplification, with destination containing new index data + * The resulting index buffer references vertices from the original vertex buffer. + * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. + * + * destination must contain enough space for the target index buffer, worst case is index_count elements + * vertex_positions should have float3 position in the first 12 bytes of each vertex + * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1] + */ +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPrune(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error); + /** * Point cloud simplifier * Reduces the number of points in the cloud to reach the given target @@ -485,6 +512,19 @@ struct meshopt_VertexCacheStatistics */ MESHOPTIMIZER_API struct meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size); +struct meshopt_VertexFetchStatistics +{ + unsigned int bytes_fetched; + float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */ +}; + +/** + * Vertex fetch cache analyzer + * Returns cache hit statistics using a simplified direct mapped model + * Results may not match actual GPU performance + */ +MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size); + struct meshopt_OverdrawStatistics { unsigned int pixels_covered; @@ -501,18 +541,19 @@ struct meshopt_OverdrawStatistics */ MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); -struct meshopt_VertexFetchStatistics +struct meshopt_CoverageStatistics { - unsigned int bytes_fetched; - float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */ + float coverage[3]; + float extent; /* viewport size in mesh coordinates */ }; /** - * Vertex fetch cache analyzer - * Returns cache hit statistics using a simplified direct mapped model - * Results may not match actual GPU performance + * Experimental: Coverage analyzer + * Returns coverage statistics (ratio of viewport pixels covered from each axis) using a software rasterizer + * + * vertex_positions should have float3 position in the first 12 bytes of each vertex */ -MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size); +MESHOPTIMIZER_EXPERIMENTAL struct meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); /** * Meshlet is a small mesh cluster (subset) that consists of: @@ -567,6 +608,19 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t m */ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor); +/** + * Experimental: Meshlet builder that produces clusters optimized for raytracing + * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but optimizes cluster subdivision for raytracing and allows to specify minimum and maximum number of triangles per meshlet. + * + * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (not max!) + * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices + * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3 + * vertex_positions should have float3 position in the first 12 bytes of each vertex + * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles; both min_triangles and max_triangles must be divisible by 4) + * fill_weight allows to prioritize clusters that are closer to maximum size at some cost to SAH quality; 0.5 is a safe default + */ +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsSplit(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight); + /** * Meshlet optimizer * Reorders meshlet vertices and triangles to maximize locality to improve rasterizer throughput @@ -722,6 +776,10 @@ template inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size); template inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count); +template +inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback); +template +inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback); template inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap); template @@ -767,9 +825,11 @@ inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_ template inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size); template +inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size); +template inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); template -inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size); +inline meshopt_CoverageStatistics meshopt_analyzeCoverage(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); template inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight); template @@ -777,6 +837,8 @@ inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* template inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor); template +inline size_t meshopt_buildMeshletsSplit(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight); +template inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); template inline size_t meshopt_partitionClusters(unsigned int* destination, const T* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, size_t vertex_count, size_t target_partition_size); @@ -930,6 +992,30 @@ inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const return meshopt_generateVertexRemapMulti(destination, indices ? in.data : NULL, index_count, vertex_count, streams, stream_count); } +template +inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback) +{ + struct Call + { + static int compare(void* context, unsigned int lhs, unsigned int rhs) { return (*static_cast(context))(lhs, rhs) ? 1 : 0; } + }; + + return meshopt_generateVertexRemapCustom(destination, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, &Call::compare, &callback); +} + +template +inline size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, F callback) +{ + struct Call + { + static int compare(void* context, unsigned int lhs, unsigned int rhs) { return (*static_cast(context))(lhs, rhs) ? 1 : 0; } + }; + + meshopt_IndexAdapter in(NULL, indices, indices ? index_count : 0); + + return meshopt_generateVertexRemapCustom(destination, indices ? in.data : NULL, index_count, vertex_positions, vertex_count, vertex_positions_stride, &Call::compare, &callback); +} + template inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap) { @@ -1127,6 +1213,14 @@ inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, buffer_size); } +template +inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size) +{ + meshopt_IndexAdapter in(NULL, indices, index_count); + + return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size); +} + template inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) { @@ -1136,11 +1230,11 @@ inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size } template -inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size) +inline meshopt_CoverageStatistics meshopt_analyzeCoverage(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) { meshopt_IndexAdapter in(NULL, indices, index_count); - return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size); + return meshopt_analyzeCoverage(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride); } template @@ -1167,6 +1261,14 @@ inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, cone_weight, split_factor); } +template +inline size_t meshopt_buildMeshletsSplit(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight) +{ + meshopt_IndexAdapter in(NULL, indices, index_count); + + return meshopt_buildMeshletsSplit(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, fill_weight); +} + template inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) { diff --git a/3rdparty/meshoptimizer/src/overdrawanalyzer.cpp b/3rdparty/meshoptimizer/src/rasterizer.cpp similarity index 62% rename from 3rdparty/meshoptimizer/src/overdrawanalyzer.cpp rename to 3rdparty/meshoptimizer/src/rasterizer.cpp index 31cf6f146..bd788ffdb 100644 --- a/3rdparty/meshoptimizer/src/overdrawanalyzer.cpp +++ b/3rdparty/meshoptimizer/src/rasterizer.cpp @@ -18,14 +18,6 @@ struct OverdrawBuffer unsigned int overdraw[kViewport][kViewport][2]; }; -#ifndef min -#define min(a, b) ((a) < (b) ? (a) : (b)) -#endif - -#ifndef max -#define max(a, b) ((a) > (b) ? (a) : (b)) -#endif - static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1, float z1, float x2, float y2, float z2, float x3, float y3, float z3) { // z2 = z1 + dzdx * (x2 - x1) + dzdy * (y2 - y1) @@ -36,8 +28,8 @@ static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1, float det = (x2 - x1) * (y3 - y1) - (y2 - y1) * (x3 - x1); float invdet = (det == 0) ? 0 : 1 / det; - dzdx = (z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1) * invdet; - dzdy = (x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1) * invdet; + dzdx = ((z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1)) * invdet; + dzdy = ((x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1)) * invdet; return det; } @@ -76,11 +68,26 @@ static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, f // bounding rectangle, clipped against viewport // since we rasterize pixels with covered centers, min >0.5 should round up // as for max, due to top-left filling convention we will never rasterize right/bottom edges - // so max >= 0.5 should round down - int minx = max((min(X1, min(X2, X3)) + 7) >> 4, 0); - int maxx = min((max(X1, max(X2, X3)) + 7) >> 4, kViewport); - int miny = max((min(Y1, min(Y2, Y3)) + 7) >> 4, 0); - int maxy = min((max(Y1, max(Y2, Y3)) + 7) >> 4, kViewport); + // so max >= 0.5 should round down for inclusive bounds, and up for exclusive (in our case) + int minx = X1 < X2 ? X1 : X2; + minx = minx < X3 ? minx : X3; + minx = (minx + 7) >> 4; + minx = minx < 0 ? 0 : minx; + + int miny = Y1 < Y2 ? Y1 : Y2; + miny = miny < Y3 ? miny : Y3; + miny = (miny + 7) >> 4; + miny = miny < 0 ? 0 : miny; + + int maxx = X1 > X2 ? X1 : X2; + maxx = maxx > X3 ? maxx : X3; + maxx = (maxx + 7) >> 4; + maxx = maxx > kViewport ? kViewport : maxx; + + int maxy = Y1 > Y2 ? Y1 : Y2; + maxy = maxy > Y3 ? maxy : Y3; + maxy = (maxy + 7) >> 4; + maxy = maxy > kViewport ? kViewport : maxy; // deltas, 28.4 fixed point int DX12 = X1 - X2; @@ -139,22 +146,10 @@ static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, f } } -} // namespace meshopt - -meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +static float transformTriangles(float* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) { - using namespace meshopt; - - assert(index_count % 3 == 0); - assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); - assert(vertex_positions_stride % sizeof(float) == 0); - - meshopt_Allocator allocator; - size_t vertex_stride_float = vertex_positions_stride / sizeof(float); - meshopt_OverdrawStatistics result = {}; - float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX}; float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX}; @@ -164,15 +159,20 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, for (int j = 0; j < 3; ++j) { - minv[j] = min(minv[j], v[j]); - maxv[j] = max(maxv[j], v[j]); + float vj = v[j]; + + minv[j] = minv[j] > vj ? vj : minv[j]; + maxv[j] = maxv[j] < vj ? vj : maxv[j]; } } - float extent = max(maxv[0] - minv[0], max(maxv[1] - minv[1], maxv[2] - minv[2])); - float scale = kViewport / extent; + float extent = 0.f; - float* triangles = allocator.allocate(index_count * 3); + extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]); + extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]); + extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]); + + float scale = kViewport / extent; for (size_t i = 0; i < index_count; ++i) { @@ -186,31 +186,55 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, triangles[i * 3 + 2] = (v[2] - minv[2]) * scale; } + return extent; +} + +static void rasterizeTriangles(OverdrawBuffer* buffer, const float* triangles, size_t index_count, int axis) +{ + for (size_t i = 0; i < index_count; i += 3) + { + const float* vn0 = &triangles[3 * (i + 0)]; + const float* vn1 = &triangles[3 * (i + 1)]; + const float* vn2 = &triangles[3 * (i + 2)]; + + switch (axis) + { + case 0: + rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]); + break; + case 1: + rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], vn2[2], vn2[1]); + break; + case 2: + rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]); + break; + } + } +} + +} // namespace meshopt + +meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + + meshopt_Allocator allocator; + + meshopt_OverdrawStatistics result = {}; + + float* triangles = allocator.allocate(index_count * 3); + transformTriangles(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride); + OverdrawBuffer* buffer = allocator.allocate(1); for (int axis = 0; axis < 3; ++axis) { memset(buffer, 0, sizeof(OverdrawBuffer)); - - for (size_t i = 0; i < index_count; i += 3) - { - const float* vn0 = &triangles[3 * (i + 0)]; - const float* vn1 = &triangles[3 * (i + 1)]; - const float* vn2 = &triangles[3 * (i + 2)]; - - switch (axis) - { - case 0: - rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]); - break; - case 1: - rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], vn2[2], vn2[1]); - break; - case 2: - rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]); - break; - } - } + rasterizeTriangles(buffer, triangles, index_count, axis); for (int y = 0; y < kViewport; ++y) for (int x = 0; x < kViewport; ++x) @@ -227,3 +251,39 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, return result; } + +meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + + meshopt_Allocator allocator; + + meshopt_CoverageStatistics result = {}; + + float* triangles = allocator.allocate(index_count * 3); + float extent = transformTriangles(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride); + + OverdrawBuffer* buffer = allocator.allocate(1); + + for (int axis = 0; axis < 3; ++axis) + { + memset(buffer, 0, sizeof(OverdrawBuffer)); + rasterizeTriangles(buffer, triangles, index_count, axis); + + unsigned int covered = 0; + + for (int y = 0; y < kViewport; ++y) + for (int x = 0; x < kViewport; ++x) + covered += (buffer->overdraw[y][x][0] | buffer->overdraw[y][x][1]) > 0; + + result.coverage[axis] = float(covered) / float(kViewport * kViewport); + } + + result.extent = extent; + + return result; +} diff --git a/3rdparty/meshoptimizer/src/simplifier.cpp b/3rdparty/meshoptimizer/src/simplifier.cpp index cf0a8a187..1abcd51f5 100644 --- a/3rdparty/meshoptimizer/src/simplifier.cpp +++ b/3rdparty/meshoptimizer/src/simplifier.cpp @@ -118,10 +118,17 @@ struct PositionHasher unsigned int ri = sparse_remap ? sparse_remap[index] : index; const unsigned int* key = reinterpret_cast(vertex_positions + ri * vertex_stride_float); + unsigned int x = key[0], y = key[1], z = key[2]; + + // replace negative zero with zero + x = (x == 0x80000000) ? 0 : x; + y = (y == 0x80000000) ? 0 : y; + z = (z == 0x80000000) ? 0 : z; + // scramble bits to make sure that integer coordinates have entropy in lower bits - unsigned int x = key[0] ^ (key[0] >> 17); - unsigned int y = key[1] ^ (key[1] >> 17); - unsigned int z = key[2] ^ (key[2] >> 17); + x ^= x >> 17; + y ^= y >> 17; + z ^= z >> 17; // Optimized Spatial Hashing for Collision Detection of Deformable Objects return (x * 73856093) ^ (y * 19349663) ^ (z * 83492791); @@ -132,7 +139,10 @@ struct PositionHasher unsigned int li = sparse_remap ? sparse_remap[lhs] : lhs; unsigned int ri = sparse_remap ? sparse_remap[rhs] : rhs; - return memcmp(vertex_positions + li * vertex_stride_float, vertex_positions + ri * vertex_stride_float, sizeof(float) * 3) == 0; + const float* lv = vertex_positions + li * vertex_stride_float; + const float* rv = vertex_positions + ri * vertex_stride_float; + + return lv[0] == rv[0] && lv[1] == rv[1] && lv[2] == rv[2]; } }; @@ -208,6 +218,11 @@ static void buildPositionRemap(unsigned int* remap, unsigned int* wedge, const f remap[index] = *entry; } + allocator.deallocate(table); + + if (!wedge) + return; + // build wedge table: for each vertex, which other vertex is the next wedge that also maps to the same vertex? // entries in table form a (cyclic) wedge loop per vertex; for manifold vertices, wedge[i] == remap[i] == i for (size_t i = 0; i < vertex_count; ++i) @@ -221,8 +236,6 @@ static void buildPositionRemap(unsigned int* remap, unsigned int* wedge, const f wedge[i] = wedge[r]; wedge[r] = unsigned(i); } - - allocator.deallocate(table); } static unsigned int* buildSparseRemap(unsigned int* indices, size_t index_count, size_t vertex_count, size_t* out_vertex_count, meshopt_Allocator& allocator) @@ -1862,6 +1875,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic updateEdgeAdjacency(adjacency, result, index_count, vertex_count, NULL); // build position remap that maps each vertex to the one with identical position + // wedge table stores next vertex with identical position for each vertex unsigned int* remap = allocator.allocate(vertex_count); unsigned int* wedge = allocator.allocate(vertex_count); buildPositionRemap(remap, wedge, vertex_positions_data, vertex_count, vertex_positions_stride, sparse_remap, allocator); @@ -2216,6 +2230,40 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind return write; } +size_t meshopt_simplifyPrune(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, float target_error) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + assert(target_error >= 0); + + meshopt_Allocator allocator; + + unsigned int* result = destination; + if (result != indices) + memcpy(result, indices, index_count * sizeof(unsigned int)); + + // build position remap that maps each vertex to the one with identical position + unsigned int* remap = allocator.allocate(vertex_count); + buildPositionRemap(remap, NULL, vertex_positions_data, vertex_count, vertex_positions_stride, NULL, allocator); + + Vector3* vertex_positions = allocator.allocate(vertex_count); + rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride, NULL); + + unsigned int* components = allocator.allocate(vertex_count); + size_t component_count = buildComponents(components, vertex_count, indices, index_count, remap); + + float* component_errors = allocator.allocate(component_count * 4); // overallocate for temporary use inside measureComponents + measureComponents(component_errors, component_count, components, vertex_positions, vertex_count); + + float component_nexterror = 0; + size_t result_count = pruneComponents(result, index_count, components, component_errors, component_count, target_error * target_error, component_nexterror); + + return result_count; +} + size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count) { using namespace meshopt; diff --git a/3rdparty/meshoptimizer/src/spatialorder.cpp b/3rdparty/meshoptimizer/src/spatialorder.cpp index 7b1a06945..43aa7bcaf 100644 --- a/3rdparty/meshoptimizer/src/spatialorder.cpp +++ b/3rdparty/meshoptimizer/src/spatialorder.cpp @@ -10,18 +10,19 @@ namespace meshopt { -// "Insert" two 0 bits after each of the 10 low bits of x -inline unsigned int part1By2(unsigned int x) +// "Insert" two 0 bits after each of the 20 low bits of x +inline unsigned long long part1By2(unsigned long long x) { - x &= 0x000003ff; // x = ---- ---- ---- ---- ---- --98 7654 3210 - x = (x ^ (x << 16)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210 - x = (x ^ (x << 8)) & 0x0300f00f; // x = ---- --98 ---- ---- 7654 ---- ---- 3210 - x = (x ^ (x << 4)) & 0x030c30c3; // x = ---- --98 ---- 76-- --54 ---- 32-- --10 - x = (x ^ (x << 2)) & 0x09249249; // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0 + x &= 0x000fffffull; // x = ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- jihg fedc ba98 7654 3210 + x = (x ^ (x << 32)) & 0x000f00000000ffffull; // x = ---- ---- ---- jihg ---- ---- ---- ---- ---- ---- ---- ---- fedc ba98 7654 3210 + x = (x ^ (x << 16)) & 0x000f0000ff0000ffull; // x = ---- ---- ---- jihg ---- ---- ---- ---- fedc ba98 ---- ---- ---- ---- 7654 3210 + x = (x ^ (x << 8)) & 0x000f00f00f00f00full; // x = ---- ---- ---- jihg ---- ---- fedc ---- ---- ba98 ---- ---- 7654 ---- ---- 3210 + x = (x ^ (x << 4)) & 0x00c30c30c30c30c3ull; // x = ---- ---- ji-- --hg ---- fe-- --dc ---- ba-- --98 ---- 76-- --54 ---- 32-- --10 + x = (x ^ (x << 2)) & 0x0249249249249249ull; // x = ---- --j- -i-- h--g --f- -e-- d--c --b- -a-- 9--8 --7- -6-- 5--4 --3- -2-- 1--0 return x; } -static void computeOrder(unsigned int* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride) +static void computeOrder(unsigned long long* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride) { size_t vertex_stride_float = vertex_positions_stride / sizeof(float); @@ -47,61 +48,68 @@ static void computeOrder(unsigned int* result, const float* vertex_positions_dat extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]); extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]); - float scale = extent == 0 ? 0.f : 1.f / extent; + // rescale each axis to 16 bits to get 48-bit Morton codes + float scale = extent == 0 ? 0.f : 65535.f / extent; // generate Morton order based on the position inside a unit cube for (size_t i = 0; i < vertex_count; ++i) { const float* v = vertex_positions_data + i * vertex_stride_float; - int x = int((v[0] - minv[0]) * scale * 1023.f + 0.5f); - int y = int((v[1] - minv[1]) * scale * 1023.f + 0.5f); - int z = int((v[2] - minv[2]) * scale * 1023.f + 0.5f); + int x = int((v[0] - minv[0]) * scale + 0.5f); + int y = int((v[1] - minv[1]) * scale + 0.5f); + int z = int((v[2] - minv[2]) * scale + 0.5f); result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2); } } -static void computeHistogram(unsigned int (&hist)[1024][3], const unsigned int* data, size_t count) +static void computeHistogram(unsigned int (&hist)[1024][5], const unsigned long long* data, size_t count) { memset(hist, 0, sizeof(hist)); - // compute 3 10-bit histograms in parallel + // compute 5 10-bit histograms in parallel for (size_t i = 0; i < count; ++i) { - unsigned int id = data[i]; + unsigned long long id = data[i]; hist[(id >> 0) & 1023][0]++; hist[(id >> 10) & 1023][1]++; hist[(id >> 20) & 1023][2]++; + hist[(id >> 30) & 1023][3]++; + hist[(id >> 40) & 1023][4]++; } - unsigned int sumx = 0, sumy = 0, sumz = 0; + unsigned int sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0, sum4 = 0; // replace histogram data with prefix histogram sums in-place for (int i = 0; i < 1024; ++i) { - unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2]; + unsigned int h0 = hist[i][0], h1 = hist[i][1], h2 = hist[i][2], h3 = hist[i][3], h4 = hist[i][4]; - hist[i][0] = sumx; - hist[i][1] = sumy; - hist[i][2] = sumz; + hist[i][0] = sum0; + hist[i][1] = sum1; + hist[i][2] = sum2; + hist[i][3] = sum3; + hist[i][4] = sum4; - sumx += hx; - sumy += hy; - sumz += hz; + sum0 += h0; + sum1 += h1; + sum2 += h2; + sum3 += h3; + sum4 += h4; } - assert(sumx == count && sumy == count && sumz == count); + assert(sum0 == count && sum1 == count && sum2 == count && sum3 == count && sum4 == count); } -static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned int* keys, size_t count, unsigned int (&hist)[1024][3], int pass) +static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned long long* keys, size_t count, unsigned int (&hist)[1024][5], int pass) { int bitoff = pass * 10; for (size_t i = 0; i < count; ++i) { - unsigned int id = (keys[source[i]] >> bitoff) & 1023; + unsigned int id = unsigned(keys[source[i]] >> bitoff) & 1023; destination[hist[id][pass]++] = source[i]; } @@ -118,10 +126,10 @@ void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_pos meshopt_Allocator allocator; - unsigned int* keys = allocator.allocate(vertex_count); + unsigned long long* keys = allocator.allocate(vertex_count); computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride); - unsigned int hist[1024][3]; + unsigned int hist[1024][5]; computeHistogram(hist, keys, vertex_count); unsigned int* scratch = allocator.allocate(vertex_count); @@ -129,10 +137,12 @@ void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_pos for (size_t i = 0; i < vertex_count; ++i) destination[i] = unsigned(i); - // 3-pass radix sort computes the resulting order into scratch + // 5-pass radix sort computes the resulting order into scratch radixPass(scratch, destination, keys, vertex_count, hist, 0); radixPass(destination, scratch, keys, vertex_count, hist, 1); radixPass(scratch, destination, keys, vertex_count, hist, 2); + radixPass(destination, scratch, keys, vertex_count, hist, 3); + radixPass(scratch, destination, keys, vertex_count, hist, 4); // since our remap table is mapping old=>new, we need to reverse it for (size_t i = 0; i < vertex_count; ++i) diff --git a/3rdparty/meshoptimizer/src/vertexcodec.cpp b/3rdparty/meshoptimizer/src/vertexcodec.cpp index 53cf9d753..847589b3d 100644 --- a/3rdparty/meshoptimizer/src/vertexcodec.cpp +++ b/3rdparty/meshoptimizer/src/vertexcodec.cpp @@ -1643,13 +1643,16 @@ static unsigned int cpuid = getCpuFeatures(); } // namespace meshopt -size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level) +size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version) { using namespace meshopt; assert(vertex_size > 0 && vertex_size <= 256); assert(vertex_size % 4 == 0); assert(level >= 0 && level <= 9); // only a subset of this range is used right now + assert(version < 0 || unsigned(version) <= kDecodeVertexVersion); + + version = version < 0 ? gEncodeVertexVersion : version; #if TRACE memset(vertexstats, 0, sizeof(vertexstats)); @@ -1663,8 +1666,6 @@ size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size if (size_t(data_end - data) < 1) return 0; - int version = gEncodeVertexVersion; - *data++ = (unsigned char)(kVertexHeader | version); unsigned char first_vertex[256] = {}; @@ -1777,7 +1778,7 @@ size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size) { - return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, meshopt::kEncodeDefaultLevel); + return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, meshopt::kEncodeDefaultLevel, meshopt::gEncodeVertexVersion); } size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size) diff --git a/3rdparty/meshoptimizer/src/vertexfilter.cpp b/3rdparty/meshoptimizer/src/vertexfilter.cpp index 1f456bb9d..6be6b723f 100644 --- a/3rdparty/meshoptimizer/src/vertexfilter.cpp +++ b/3rdparty/meshoptimizer/src/vertexfilter.cpp @@ -201,7 +201,7 @@ inline uint64_t rotateleft64(uint64_t v, int x) #endif #ifdef SIMD_SSE -static void decodeFilterOctSimd(signed char* data, size_t count) +static void decodeFilterOctSimd8(signed char* data, size_t count) { const __m128 sign = _mm_set1_ps(-0.f); @@ -246,7 +246,7 @@ static void decodeFilterOctSimd(signed char* data, size_t count) } } -static void decodeFilterOctSimd(short* data, size_t count) +static void decodeFilterOctSimd16(short* data, size_t count) { const __m128 sign = _mm_set1_ps(-0.f); @@ -295,8 +295,9 @@ static void decodeFilterOctSimd(short* data, size_t count) __m128i res_1 = _mm_unpackhi_epi16(xzr, y0r); // patch in .w - res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), _mm_set1_epi64x(0xffff000000000000))); - res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), _mm_set1_epi64x(0xffff000000000000))); + __m128i maskw = _mm_set_epi32(0xffff0000, 0, 0xffff0000, 0); + res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), maskw)); + res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), maskw)); _mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0); _mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1); @@ -404,7 +405,7 @@ inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y) #endif #ifdef SIMD_NEON -static void decodeFilterOctSimd(signed char* data, size_t count) +static void decodeFilterOctSimd8(signed char* data, size_t count) { const int32x4_t sign = vdupq_n_s32(0x80000000); @@ -453,7 +454,7 @@ static void decodeFilterOctSimd(signed char* data, size_t count) } } -static void decodeFilterOctSimd(short* data, size_t count) +static void decodeFilterOctSimd16(short* data, size_t count) { const int32x4_t sign = vdupq_n_s32(0x80000000); @@ -598,7 +599,7 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count) #endif #ifdef SIMD_WASM -static void decodeFilterOctSimd(signed char* data, size_t count) +static void decodeFilterOctSimd8(signed char* data, size_t count) { const v128_t sign = wasm_f32x4_splat(-0.f); @@ -647,7 +648,7 @@ static void decodeFilterOctSimd(signed char* data, size_t count) } } -static void decodeFilterOctSimd(short* data, size_t count) +static void decodeFilterOctSimd16(short* data, size_t count) { const v128_t sign = wasm_f32x4_splat(-0.f); const v128_t zmask = wasm_i32x4_splat(0x7fff); @@ -833,9 +834,9 @@ void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride) #if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM) if (stride == 4) - dispatchSimd(decodeFilterOctSimd, static_cast(buffer), count, 4); + dispatchSimd(decodeFilterOctSimd8, static_cast(buffer), count, 4); else - dispatchSimd(decodeFilterOctSimd, static_cast(buffer), count, 4); + dispatchSimd(decodeFilterOctSimd16, static_cast(buffer), count, 4); #else if (stride == 4) decodeFilterOct(static_cast(buffer), count); diff --git a/3rdparty/meshoptimizer/src/vfetchanalyzer.cpp b/3rdparty/meshoptimizer/src/vfetchanalyzer.cpp deleted file mode 100644 index 51dca873f..000000000 --- a/3rdparty/meshoptimizer/src/vfetchanalyzer.cpp +++ /dev/null @@ -1,58 +0,0 @@ -// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details -#include "meshoptimizer.h" - -#include -#include - -meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size) -{ - assert(index_count % 3 == 0); - assert(vertex_size > 0 && vertex_size <= 256); - - meshopt_Allocator allocator; - - meshopt_VertexFetchStatistics result = {}; - - unsigned char* vertex_visited = allocator.allocate(vertex_count); - memset(vertex_visited, 0, vertex_count); - - const size_t kCacheLine = 64; - const size_t kCacheSize = 128 * 1024; - - // simple direct mapped cache; on typical mesh data this is close to 4-way cache, and this model is a gross approximation anyway - size_t cache[kCacheSize / kCacheLine] = {}; - - for (size_t i = 0; i < index_count; ++i) - { - unsigned int index = indices[i]; - assert(index < vertex_count); - - vertex_visited[index] = 1; - - size_t start_address = index * vertex_size; - size_t end_address = start_address + vertex_size; - - size_t start_tag = start_address / kCacheLine; - size_t end_tag = (end_address + kCacheLine - 1) / kCacheLine; - - assert(start_tag < end_tag); - - for (size_t tag = start_tag; tag < end_tag; ++tag) - { - size_t line = tag % (sizeof(cache) / sizeof(cache[0])); - - // we store +1 since cache is filled with 0 by default - result.bytes_fetched += (cache[line] != tag + 1) * kCacheLine; - cache[line] = tag + 1; - } - } - - size_t unique_vertex_count = 0; - - for (size_t i = 0; i < vertex_count; ++i) - unique_vertex_count += vertex_visited[i]; - - result.overfetch = unique_vertex_count == 0 ? 0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size); - - return result; -}