From c24efd27c940462eb651cbf4c166cfe1eb81dbc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=91=D1=80=D0=B0=D0=BD=D0=B8=D0=BC=D0=B8=D1=80=20=D0=9A?= =?UTF-8?q?=D0=B0=D1=80=D0=B0=D1=9F=D0=B8=D1=9B?= Date: Sun, 14 Sep 2025 09:01:40 -0700 Subject: [PATCH] Updated meshoptimizer. --- 3rdparty/meshoptimizer/src/allocator.cpp | 13 +- 3rdparty/meshoptimizer/src/clusterizer.cpp | 266 ++++--- 3rdparty/meshoptimizer/src/indexgenerator.cpp | 25 + 3rdparty/meshoptimizer/src/meshoptimizer.h | 246 +++++-- .../meshoptimizer/src/overdrawoptimizer.cpp | 14 +- 3rdparty/meshoptimizer/src/partition.cpp | 2 + 3rdparty/meshoptimizer/src/simplifier.cpp | 651 +++++++++++++++--- 3rdparty/meshoptimizer/src/spatialorder.cpp | 206 +++++- 3rdparty/meshoptimizer/src/vertexcodec.cpp | 2 +- 3rdparty/meshoptimizer/src/vertexfilter.cpp | 426 +++++++++++- 10 files changed, 1556 insertions(+), 295 deletions(-) diff --git a/3rdparty/meshoptimizer/src/allocator.cpp b/3rdparty/meshoptimizer/src/allocator.cpp index b8cb33c28..6b6083da2 100644 --- a/3rdparty/meshoptimizer/src/allocator.cpp +++ b/3rdparty/meshoptimizer/src/allocator.cpp @@ -1,8 +1,17 @@ // This file is part of meshoptimizer library; see meshoptimizer.h for version/license details #include "meshoptimizer.h" +#ifdef MESHOPTIMIZER_ALLOC_EXPORT +meshopt_Allocator::Storage& meshopt_Allocator::storage() +{ + static Storage s = {::operator new, ::operator delete }; + return s; +} +#endif + void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*)) { - meshopt_Allocator::Storage::allocate = allocate; - meshopt_Allocator::Storage::deallocate = deallocate; + meshopt_Allocator::Storage& s = meshopt_Allocator::storage(); + s.allocate = allocate; + s.deallocate = deallocate; } diff --git a/3rdparty/meshoptimizer/src/clusterizer.cpp b/3rdparty/meshoptimizer/src/clusterizer.cpp index f3bbbd790..8dd6fb54d 100644 --- a/3rdparty/meshoptimizer/src/clusterizer.cpp +++ b/3rdparty/meshoptimizer/src/clusterizer.cpp @@ -6,11 +6,23 @@ #include #include +// The block below auto-detects SIMD ISA that can be used on the target platform +#ifndef MESHOPTIMIZER_NO_SIMD +#if defined(__SSE2__) || (defined(_MSC_VER) && defined(_M_X64)) +#define SIMD_SSE +#include +#elif defined(__aarch64__) || (defined(_MSC_VER) && defined(_M_ARM64) && _MSC_VER >= 1922) +#define SIMD_NEON +#include +#endif +#endif // !MESHOPTIMIZER_NO_SIMD + // This work is based on: // Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016 // Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016 // Jack Ritter. An Efficient Bounding Sphere. 1990 // Thomas Larsson. Fast and Tight Fitting Bounding Spheres. 2008 +// Ingo Wald, Vlastimil Havran. On building fast kd-Trees for Ray Tracing, and on doing that in O(N log N). 
2006 namespace meshopt { @@ -148,6 +160,19 @@ static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const un } } +static void clearUsed(short* used, size_t vertex_count, const unsigned int* indices, size_t index_count) +{ + // for sparse inputs, it's faster to only clear vertices referenced by the index buffer + if (vertex_count <= index_count) + memset(used, -1, vertex_count * sizeof(short)); + else + for (size_t i = 0; i < index_count; ++i) + { + assert(indices[i] < vertex_count); + used[indices[i]] = -1; + } +} + static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride, size_t axis_count) { static const float kAxes[7][3] = { @@ -264,21 +289,8 @@ struct Cone float nx, ny, nz; }; -static float getDistance(float dx, float dy, float dz, bool aa) -{ - if (!aa) - return sqrtf(dx * dx + dy * dy + dz * dz); - - float rx = fabsf(dx), ry = fabsf(dy), rz = fabsf(dz); - float rxy = rx > ry ? rx : ry; - return rxy > rz ? rxy : rz; -} - static float getMeshletScore(float distance, float spread, float cone_weight, float expected_radius) { - if (cone_weight < 0) - return 1 + distance / expected_radius; - float cone = 1.f - spread * cone_weight; float cone_clamped = cone < 1e-3f ? 1e-3f : cone; @@ -347,15 +359,6 @@ static float computeTriangleCones(Cone* triangles, const unsigned int* indices, return mesh_area; } -static void finishMeshlet(meshopt_Meshlet& meshlet, unsigned char* meshlet_triangles) -{ - size_t offset = meshlet.triangle_offset + meshlet.triangle_count * 3; - - // fill 4b padding with 0 - while (offset & 3) - meshlet_triangles[offset++] = 0; -} - static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, short* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles, bool split = false) { short& av = used[a]; @@ -373,10 +376,8 @@ static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int for (size_t j = 0; j < meshlet.vertex_count; ++j) used[meshlet_vertices[meshlet.vertex_offset + j]] = -1; - finishMeshlet(meshlet, meshlet_triangles); - meshlet.vertex_offset += meshlet.vertex_count; - meshlet.triangle_offset += (meshlet.triangle_count * 3 + 3) & ~3; // 4b padding + meshlet.triangle_offset += meshlet.triangle_count * 3; meshlet.vertex_count = 0; meshlet.triangle_count = 0; @@ -452,7 +453,7 @@ static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Co const Cone& tri_cone = triangles[triangle]; float dx = tri_cone.px - meshlet_cone.px, dy = tri_cone.py - meshlet_cone.py, dz = tri_cone.pz - meshlet_cone.pz; - float distance = getDistance(dx, dy, dz, cone_weight < 0); + float distance = sqrtf(dx * dx + dy * dy + dz * dz); float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz; float score = getMeshletScore(distance, spread, cone_weight, meshlet_expected_radius); @@ -513,7 +514,8 @@ static size_t appendSeedTriangles(unsigned int* seeds, const meshopt_Meshlet& me if (best_neighbor == ~0u) continue; - float best_neighbor_score = getDistance(triangles[best_neighbor].px - cornerx, triangles[best_neighbor].py - cornery, triangles[best_neighbor].pz - cornerz, false); + float dx = triangles[best_neighbor].px - cornerx, dy = triangles[best_neighbor].py - cornery, dz = triangles[best_neighbor].pz - cornerz; + float best_neighbor_score = sqrtf(dx * dx + 
dy * dy + dz * dz); for (size_t j = 0; j < kMeshletAddSeeds; ++j) { @@ -565,7 +567,8 @@ static unsigned int selectSeedTriangle(const unsigned int* seeds, size_t seed_co unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2]; unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c]; - float score = getDistance(triangles[index].px - cornerx, triangles[index].py - cornery, triangles[index].pz - cornerz, false); + float dx = triangles[index].px - cornerx, dy = triangles[index].py - cornery, dz = triangles[index].pz - cornerz; + float score = sqrtf(dx * dx + dy * dy + dz * dz); if (live < best_live || (live == best_live && score < best_score)) { @@ -586,7 +589,7 @@ struct KDNode unsigned int index; }; - // leaves: axis = 3, children = number of extra points after this one (0 if 'index' is the only point) + // leaves: axis = 3, children = number of points including this one // branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children unsigned int axis : 2; unsigned int children : 30; @@ -622,7 +625,7 @@ static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, u result.index = indices[0]; result.axis = 3; - result.children = unsigned(count - 1); + result.children = unsigned(count); // all remaining points are stored in nodes immediately following the leaf for (size_t i = 1; i < count; ++i) @@ -681,29 +684,37 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size); // distance to the right subtree is represented explicitly + assert(next_offset - offset > 1); result.children = unsigned(next_offset - offset - 1); return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size); } -static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, bool aa, unsigned int& result, float& limit) +static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit) { const KDNode& node = nodes[root]; + if (node.children == 0) + return; + if (node.axis == 3) { // leaf - for (unsigned int i = 0; i <= node.children; ++i) + bool inactive = true; + + for (unsigned int i = 0; i < node.children; ++i) { unsigned int index = nodes[root + i].index; if (emitted_flags[index]) continue; + inactive = false; + const float* point = points + index * stride; float dx = point[0] - position[0], dy = point[1] - position[1], dz = point[2] - position[2]; - float distance = getDistance(dx, dy, dz, aa); + float distance = sqrtf(dx * dx + dy * dy + dz * dz); if (distance < limit) { @@ -711,6 +722,10 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, limit = distance; } } + + // deactivate leaves that no longer have items to emit + if (inactive) + nodes[root].children = 0; } else { @@ -719,34 +734,84 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, unsigned int first = (delta <= 0) ? 
0 : node.children; unsigned int second = first ^ node.children; - kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, aa, result, limit); + // deactivate branches that no longer have items to emit to accelerate traversal + // note that we do this *before* recursing which delays deactivation but keeps tail calls + if ((nodes[root + 1 + first].children | nodes[root + 1 + second].children) == 0) + nodes[root].children = 0; + + kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit); // only process the other node if it can have a match based on closest distance so far if (fabsf(delta) <= limit) - kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, aa, result, limit); + kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, result, limit); } } +struct BVHBoxT +{ + float min[4]; + float max[4]; +}; + struct BVHBox { float min[3]; float max[3]; }; -static void boxMerge(BVHBox& box, const BVHBox& other) +#if defined(SIMD_SSE) +static float boxMerge(BVHBoxT& box, const BVHBox& other) +{ + __m128 min = _mm_loadu_ps(box.min); + __m128 max = _mm_loadu_ps(box.max); + + min = _mm_min_ps(min, _mm_loadu_ps(other.min)); + max = _mm_max_ps(max, _mm_loadu_ps(other.max)); + + _mm_store_ps(box.min, min); + _mm_store_ps(box.max, max); + + __m128 size = _mm_sub_ps(max, min); + __m128 size_yzx = _mm_shuffle_ps(size, size, _MM_SHUFFLE(0, 0, 2, 1)); + __m128 mul = _mm_mul_ps(size, size_yzx); + __m128 sum_xy = _mm_add_ss(mul, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 1, 1, 1))); + __m128 sum_xyz = _mm_add_ss(sum_xy, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 2, 2, 2))); + + return _mm_cvtss_f32(sum_xyz); +} +#elif defined(SIMD_NEON) +static float boxMerge(BVHBoxT& box, const BVHBox& other) +{ + float32x4_t min = vld1q_f32(box.min); + float32x4_t max = vld1q_f32(box.max); + + min = vminq_f32(min, vld1q_f32(other.min)); + max = vmaxq_f32(max, vld1q_f32(other.max)); + + vst1q_f32(box.min, min); + vst1q_f32(box.max, max); + + float32x4_t size = vsubq_f32(max, min); + float32x4_t size_yzx = vextq_f32(vextq_f32(size, size, 3), size, 2); + float32x4_t mul = vmulq_f32(size, size_yzx); + float sum_xy = vgetq_lane_f32(mul, 0) + vgetq_lane_f32(mul, 1); + float sum_xyz = sum_xy + vgetq_lane_f32(mul, 2); + + return sum_xyz; +} +#else +static float boxMerge(BVHBoxT& box, const BVHBox& other) { for (int k = 0; k < 3; ++k) { box.min[k] = other.min[k] < box.min[k] ? other.min[k] : box.min[k]; box.max[k] = other.max[k] > box.max[k] ? 
other.max[k] : box.max[k];
 	}
-}
 
-inline float boxSurface(const BVHBox& box)
-{
 	float sx = box.max[0] - box.min[0], sy = box.max[1] - box.min[1], sz = box.max[2] - box.min[2];
 	return sx * sy + sx * sz + sy * sz;
 }
+#endif
 
 inline unsigned int radixFloat(unsigned int v)
 {
@@ -832,7 +897,7 @@ static void bvhPrepare(BVHBox* boxes, float* centroids, const unsigned int* indi
 	}
 }
 
-static bool bvhPackLeaf(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices)
+static size_t bvhCountVertices(const unsigned int* order, size_t count, short* used, const unsigned int* indices, unsigned int* out = NULL)
 {
 	// count number of unique vertices
 	size_t used_vertices = 0;
@@ -843,6 +908,9 @@
 		used_vertices += (used[a] < 0) + (used[b] < 0) + (used[c] < 0);
 		used[a] = used[b] = used[c] = 1;
+
+		if (out)
+			out[i] = unsigned(used_vertices);
 	}
 
 	// reset used[] for future invocations
@@ -854,16 +922,16 @@
 		used[a] = used[b] = used[c] = -1;
 	}
 
-	if (used_vertices > max_vertices)
-		return false;
+	return used_vertices;
+}
 
+static void bvhPackLeaf(unsigned char* boundary, size_t count)
+{
 	// mark meshlet boundary for future reassembly
 	assert(count > 0);
 	boundary[0] = 1;
 	memset(boundary + 1, 0, count - 1);
-
-	return true;
 }
 
 static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices, size_t max_triangles)
@@ -872,8 +940,9 @@
 	{
 		size_t chunk = i + max_triangles <= count ? max_triangles : count - i;
 
-		if (bvhPackLeaf(boundary + i, order + i, chunk, used, indices, max_vertices))
+		if (bvhCountVertices(order + i, chunk, used, indices) <= max_vertices)
 		{
+			bvhPackLeaf(boundary + i, chunk);
 			i += chunk;
 			continue;
 		}
@@ -881,7 +950,7 @@
 		// chunk is vertex bound, split it into smaller meshlets
 		assert(chunk > max_vertices / 3);
 
-		bvhPackLeaf(boundary + i, order + i, max_vertices / 3, used, indices, max_vertices);
+		bvhPackLeaf(boundary + i, max_vertices / 3);
 		i += max_vertices / 3;
 	}
 }
 
@@ -889,30 +958,32 @@
 static bool bvhDivisible(size_t count, size_t min, size_t max)
 {
 	// count is representable as a sum of values in [min..max] if it is in range of [k*min..k*min+k*(max-min)]
-	// equivalent to ceil(count / max) <= floor(count / min), but the form below allows using idiv
+	// equivalent to ceil(count / max) <= floor(count / min), but the form below allows using idiv (see nv_cluster_builder)
 	// we avoid expensive integer divisions in the common case where min is <= max/2
 	return min * 2 <= max ?
count >= min : count % min <= (count / min) * (max - min); } -static size_t bvhPivot(const BVHBox* boxes, const unsigned int* order, size_t count, void* scratch, size_t step, size_t min, size_t max, float fill, float* out_cost) +static void bvhComputeArea(float* areas, const BVHBox* boxes, const unsigned int* order, size_t count) { - BVHBox accuml = boxes[order[0]], accumr = boxes[order[count - 1]]; - float* costs = static_cast(scratch); + BVHBoxT accuml = {{FLT_MAX, FLT_MAX, FLT_MAX, 0}, {-FLT_MAX, -FLT_MAX, -FLT_MAX, 0}}; + BVHBoxT accumr = accuml; - // accumulate SAH cost in forward and backward directions for (size_t i = 0; i < count; ++i) { - boxMerge(accuml, boxes[order[i]]); - boxMerge(accumr, boxes[order[count - 1 - i]]); + float larea = boxMerge(accuml, boxes[order[i]]); + float rarea = boxMerge(accumr, boxes[order[count - 1 - i]]); - costs[i] = boxSurface(accuml); - costs[i + count] = boxSurface(accumr); + areas[i] = larea; + areas[i + count] = rarea; } +} +static size_t bvhPivot(const float* areas, const unsigned int* vertices, size_t count, size_t step, size_t min, size_t max, float fill, size_t maxfill, float* out_cost) +{ bool aligned = count >= min * 2 && bvhDivisible(count, min, max); size_t end = aligned ? count - min : count - 1; - float rmaxf = 1.f / float(int(max)); + float rmaxfill = 1.f / float(int(maxfill)); // find best split that minimizes SAH size_t bestsplit = 0; @@ -927,17 +998,22 @@ static size_t bvhPivot(const BVHBox* boxes, const unsigned int* order, size_t co if (aligned && !bvhDivisible(rsplit, min, max)) continue; - // costs[x] = inclusive surface area of boxes[0..x] - // costs[count-1-x] = inclusive surface area of boxes[x..count-1] - float larea = costs[i], rarea = costs[(count - 1 - (i + 1)) + count]; + // areas[x] = inclusive surface area of boxes[0..x] + // areas[count-1-x] = inclusive surface area of boxes[x..count-1] + float larea = areas[i], rarea = areas[(count - 1 - (i + 1)) + count]; float cost = larea * float(int(lsplit)) + rarea * float(int(rsplit)); if (cost > bestcost) continue; - // fill cost; use floating point math to avoid expensive integer modulo - int lrest = int(float(int(lsplit + max - 1)) * rmaxf) * int(max) - int(lsplit); - int rrest = int(float(int(rsplit + max - 1)) * rmaxf) * int(max) - int(rsplit); + // use vertex fill when splitting vertex limited clusters; note that we use the same (left->right) vertex count + // using bidirectional vertex counts is a little more expensive to compute and produces slightly worse results in practice + size_t lfill = vertices ? vertices[i] : lsplit; + size_t rfill = vertices ? 
vertices[i] : rsplit; + + // fill cost; use floating point math to round up to maxfill to avoid expensive integer modulo + int lrest = int(float(int(lfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(lfill); + int rrest = int(float(int(rfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(rfill); cost += fill * (float(lrest) * larea + float(rrest) * rarea); @@ -973,8 +1049,8 @@ static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* or if (depth >= kMeshletMaxTreeDepth) return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles); - if (count <= max_triangles && bvhPackLeaf(boundary, orderx, count, used, indices, max_vertices)) - return; + if (count <= max_triangles && bvhCountVertices(orderx, count, used, indices) <= max_vertices) + return bvhPackLeaf(boundary, count); unsigned int* axes[3] = {orderx, ordery, orderz}; @@ -983,9 +1059,7 @@ static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* or // if we could not pack the meshlet, we must be vertex bound size_t mint = count <= max_triangles && max_vertices / 3 < min_triangles ? max_vertices / 3 : min_triangles; - - // only use fill weight if we are optimizing for triangle count - float fill = count <= max_triangles ? 0.f : fill_weight; + size_t maxfill = count <= max_triangles ? max_vertices : max_triangles; // find best split that minimizes SAH int bestk = -1; @@ -994,8 +1068,20 @@ static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* or for (int k = 0; k < 3; ++k) { + float* areas = static_cast(scratch); + unsigned int* vertices = NULL; + + bvhComputeArea(areas, boxes, axes[k], count); + + if (count <= max_triangles) + { + // for vertex bound clusters, count number of unique vertices for each split + vertices = reinterpret_cast(areas + 2 * count); + bvhCountVertices(axes[k], count, used, indices, vertices); + } + float axiscost = FLT_MAX; - size_t axissplit = bvhPivot(boxes, axes[k], count, scratch, step, mint, max_triangles, fill, &axiscost); + size_t axissplit = bvhPivot(areas, vertices, count, step, mint, max_triangles, fill_weight, maxfill, &axiscost); if (axissplit && axiscost < bestcost) { @@ -1044,7 +1130,6 @@ size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_ assert(index_count % 3 == 0); assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles); - assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned (void)kMeshletMaxVertices; (void)kMeshletMaxTriangles; @@ -1069,9 +1154,8 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles); - assert(min_triangles % 4 == 0 && max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned - assert(cone_weight <= 1); // negative cone weight switches metric to optimize for axis-aligned meshlets + assert(cone_weight >= 0 && cone_weight <= 1); assert(split_factor >= 0); if (index_count == 0) @@ -1123,7 +1207,7 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle // index of the vertex in the meshlet, -1 if the vertex isn't used short* used = allocator.allocate(vertex_count); - memset(used, -1, vertex_count * sizeof(short)); + clearUsed(used, vertex_count, 
indices, index_count); // initial seed triangle is the one closest to the corner unsigned int initial_seed = ~0u; @@ -1133,7 +1217,8 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle { const Cone& tri = triangles[i]; - float score = getDistance(tri.px - cornerx, tri.py - cornery, tri.pz - cornerz, false); + float dx = tri.px - cornerx, dy = tri.py - cornery, dz = tri.pz - cornerz; + float score = sqrtf(dx * dx + dy * dy + dz * dz); if (initial_seed == ~0u || score < initial_score) { @@ -1173,7 +1258,7 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle unsigned int index = ~0u; float distance = FLT_MAX; - kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, cone_weight < 0.f, index, distance); + kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, distance); best_triangle = index; split = meshlet.triangle_count >= min_triangles && split_factor > 0 && distance > meshlet_expected_radius * split_factor; @@ -1243,20 +1328,15 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle } if (meshlet.triangle_count) - { - finishMeshlet(meshlet, meshlet_triangles); - meshlets[meshlet_offset++] = meshlet; - } assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles)); + assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count); return meshlet_offset; } size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight) { - assert(cone_weight >= 0); // to use negative cone weight, use meshopt_buildMeshletsFlex - return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, max_triangles, cone_weight, 0.0f); } @@ -1268,13 +1348,12 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles); - assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned meshopt_Allocator allocator; // index of the vertex in the meshlet, -1 if the vertex isn't used short* used = allocator.allocate(vertex_count); - memset(used, -1, vertex_count * sizeof(short)); + clearUsed(used, vertex_count, indices, index_count); meshopt_Meshlet meshlet = {}; size_t meshlet_offset = 0; @@ -1289,17 +1368,14 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle } if (meshlet.triangle_count) - { - finishMeshlet(meshlet, meshlet_triangles); - meshlets[meshlet_offset++] = meshlet; - } assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles)); + assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count); return meshlet_offset; } -size_t meshopt_buildMeshletsSplit(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* 
vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight) +size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight) { using namespace meshopt; @@ -1309,7 +1385,6 @@ size_t meshopt_buildMeshletsSplit(struct meshopt_Meshlet* meshlets, unsigned int assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles); - assert(min_triangles % 4 == 0 && max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned if (index_count == 0) return 0; @@ -1320,13 +1395,14 @@ size_t meshopt_buildMeshletsSplit(struct meshopt_Meshlet* meshlets, unsigned int meshopt_Allocator allocator; // 3 floats plus 1 uint for sorting, or - // 2 floats for SAH costs, or + // 2 floats plus 1 uint for pivoting, or // 1 uint plus 1 byte for partitioning float* scratch = allocator.allocate(face_count * 4); // compute bounding boxes and centroids for sorting - BVHBox* boxes = allocator.allocate(face_count); + BVHBox* boxes = allocator.allocate(face_count + 1); // padding for SIMD bvhPrepare(boxes, scratch, indices, face_count, vertex_positions, vertex_count, vertex_stride_float); + memset(boxes + face_count, 0, sizeof(BVHBox)); unsigned int* axes = allocator.allocate(face_count * 3); unsigned int* temp = reinterpret_cast(scratch) + face_count * 3; @@ -1350,7 +1426,7 @@ size_t meshopt_buildMeshletsSplit(struct meshopt_Meshlet* meshlets, unsigned int // index of the vertex in the meshlet, -1 if the vertex isn't used short* used = allocator.allocate(vertex_count); - memset(used, -1, vertex_count * sizeof(short)); + clearUsed(used, vertex_count, indices, index_count); unsigned char* boundary = allocator.allocate(face_count); @@ -1391,13 +1467,10 @@ size_t meshopt_buildMeshletsSplit(struct meshopt_Meshlet* meshlets, unsigned int } if (meshlet.triangle_count) - { - finishMeshlet(meshlet, meshlet_triangles); - meshlets[meshlet_offset++] = meshlet; - } assert(meshlet_offset <= meshlet_bound); + assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count); return meshlet_offset; } @@ -1693,3 +1766,6 @@ void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* mesh assert(vertex_offset <= vertex_count); memcpy(vertices, order, vertex_offset * sizeof(unsigned int)); } + +#undef SIMD_SSE +#undef SIMD_NEON diff --git a/3rdparty/meshoptimizer/src/indexgenerator.cpp b/3rdparty/meshoptimizer/src/indexgenerator.cpp index 5f36d2463..4bf9fccad 100644 --- a/3rdparty/meshoptimizer/src/indexgenerator.cpp +++ b/3rdparty/meshoptimizer/src/indexgenerator.cpp @@ -439,6 +439,31 @@ void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const uns generateShadowBuffer(destination, indices, index_count, vertex_count, hasher, allocator); } +void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + using namespace meshopt; + + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); + 
assert(vertex_positions_stride % sizeof(float) == 0); + + meshopt_Allocator allocator; + VertexCustomHasher hasher = {vertex_positions, vertex_positions_stride / sizeof(float), NULL, NULL}; + + size_t table_size = hashBuckets(vertex_count); + unsigned int* table = allocator.allocate(table_size); + memset(table, -1, table_size * sizeof(unsigned int)); + + for (size_t i = 0; i < vertex_count; ++i) + { + unsigned int* entry = hashLookup(table, table_size, hasher, unsigned(i), ~0u); + + if (*entry == ~0u) + *entry = unsigned(i); + + destination[i] = *entry; + } +} + void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) { using namespace meshopt; diff --git a/3rdparty/meshoptimizer/src/meshoptimizer.h b/3rdparty/meshoptimizer/src/meshoptimizer.h index 70d02fbc8..535853d80 100644 --- a/3rdparty/meshoptimizer/src/meshoptimizer.h +++ b/3rdparty/meshoptimizer/src/meshoptimizer.h @@ -1,5 +1,5 @@ /** - * meshoptimizer - version 0.23 + * meshoptimizer - version 0.25 * * Copyright (C) 2016-2025, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) * Report bugs and download new versions at https://github.com/zeux/meshoptimizer @@ -12,7 +12,7 @@ #include /* Version macro; major * 1000 + minor * 10 + patch */ -#define MESHOPTIMIZER_VERSION 230 /* 0.23 */ +#define MESHOPTIMIZER_VERSION 250 /* 0.25 */ /* If no API is defined, assume default */ #ifndef MESHOPTIMIZER_API @@ -75,7 +75,7 @@ MESHOPTIMIZER_API size_t meshopt_generateVertexRemap(unsigned int* destination, MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count); /** - * Experimental: Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices + * Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices * As a result, all vertices that are equivalent map to the same (new) location, with no gaps in the resulting sequence. * Equivalence is checked in two steps: vertex positions are compared for equality, and then the user-specified equality function is called (if provided). * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer. 
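As a usage sketch for the meshopt_generatePositionRemap implementation added above: the Vertex layout and function name below are hypothetical, only the float3-position-in-first-12-bytes convention and the API signature come from this patch.

#include <vector>
#include "meshoptimizer.h"

struct Vertex
{
	float px, py, pz; // position must occupy the first 12 bytes of each vertex
	float nx, ny, nz;
	float tu, tv;
};

void buildPositionConnectivity(const std::vector<Vertex>& vertices)
{
	if (vertices.empty())
		return;

	std::vector<unsigned int> remap(vertices.size());
	meshopt_generatePositionRemap(remap.data(), &vertices[0].px, vertices.size(), sizeof(Vertex));

	// remap[i] is the first (existing) vertex index with the same position as vertex i,
	// so vertices i and j are positionally welded iff remap[i] == remap[j]; this is the
	// position-only connectivity used by e.g. hierarchical simplification.
}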
@@ -85,7 +85,7 @@ MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destinat * vertex_positions should have float3 position in the first 12 bytes of each vertex * callback can be NULL if no additional equality check is needed; otherwise, it should return 1 if vertices with specified indices are equivalent and 0 if they are not */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context); +MESHOPTIMIZER_API size_t meshopt_generateVertexRemapCustom(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, int (*callback)(void*, unsigned int, unsigned int), void* context); /** * Generates vertex buffer from the source vertex buffer and remap table generated by meshopt_generateVertexRemap @@ -124,6 +124,16 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destinati */ MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count); +/** + * Experimental: Generates a remap table that maps all vertices with the same position to the same (existing) index. + * Similarly to meshopt_generateShadowIndexBuffer, this can be helpful to pre-process meshes for position-only rendering. + * This can also be used to implement algorithms that require positional-only connectivity, such as hierarchical simplification. + * + * destination must contain enough space for the resulting remap table (vertex_count elements) + * vertex_positions should have float3 position in the first 12 bytes of each vertex + */ +MESHOPTIMIZER_EXPERIMENTAL void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); + /** * Generate index buffer that can be used as a geometry shader input with triangle adjacency topology * Each triangle is converted into a 6-vertex patch with the following layout: @@ -154,8 +164,8 @@ MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destin MESHOPTIMIZER_API void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); /** - * Experimental: Generate index buffer that can be used for visibility buffer rendering and returns the size of the reorder table - * Each triangle's provoking vertex index is equal to primitive id; this allows passing it to the fragment shader using nointerpolate attribute. + * Generate index buffer that can be used for visibility buffer rendering and returns the size of the reorder table + * Each triangle's provoking vertex index is equal to primitive id; this allows passing it to the fragment shader using flat/nointerpolation attribute. * This is important for performance on hardware where primitive id can't be accessed efficiently in fragment shader. * The reorder table stores the original vertex id for each vertex in the new index buffer, and should be used in the vertex shader to load vertex data. 
* The provoking vertex is assumed to be the first vertex in the triangle; if this is not the case (OpenGL), rotate each triangle (abc -> bca) before rendering. @@ -164,7 +174,7 @@ MESHOPTIMIZER_API void meshopt_generateTessellationIndexBuffer(unsigned int* des * destination must contain enough space for the resulting index buffer (index_count elements) * reorder must contain enough space for the worst case reorder table (vertex_count + index_count/3 elements) */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_generateProvokingIndexBuffer(unsigned int* destination, unsigned int* reorder, const unsigned int* indices, size_t index_count, size_t vertex_count); +MESHOPTIMIZER_API size_t meshopt_generateProvokingIndexBuffer(unsigned int* destination, unsigned int* reorder, const unsigned int* indices, size_t index_count, size_t vertex_count); /** * Vertex transform cache optimizer @@ -241,7 +251,8 @@ MESHOPTIMIZER_API size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t MESHOPTIMIZER_API size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count); /** - * Set index encoder format version + * Set index encoder format version (defaults to 1) + * * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.14+) */ MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version); @@ -293,23 +304,27 @@ MESHOPTIMIZER_API int meshopt_decodeIndexSequence(void* destination, size_t inde * For maximum efficiency the vertex buffer being encoded has to be quantized and optimized for locality of reference (cache/fetch) first. * * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size) + * vertex_size must be a multiple of 4 (and <= 256) */ MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size); MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size); /** - * Experimental: Vertex buffer encoder + * Vertex buffer encoder * Encodes vertex data just like meshopt_encodeVertexBuffer, but allows to override compression level. - * For compression level to take effect, the vertex encoding version must be set to 1 via meshopt_encodeVertexVersion. + * For compression level to take effect, the vertex encoding version must be set to 1. * The default compression level implied by meshopt_encodeVertexBuffer is 2. * + * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size) + * vertex_size must be a multiple of 4 (and <= 256) * level should be in the range [0, 3] with 0 being the fastest and 3 being the slowest and producing the best compression ratio. * version should be -1 to use the default version (specified via meshopt_encodeVertexVersion), or 0/1 to override the version; per above, level won't take effect if version is 0. 
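+ *
+ * Usage sketch (hypothetical buffer names; 16-byte vertices, level 3 = best ratio, version -1 = library default):
+ *   size_t encoded = meshopt_encodeVertexBufferLevel(buffer, buffer_cap, vertices, vertex_count, 16, 3, -1);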
*/ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version); +MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version); /** - * Set vertex encoder format version + * Set vertex encoder format version (defaults to 1) + * * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.23+) */ MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version); @@ -321,6 +336,7 @@ MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version); * The decoder is safe to use for untrusted input, but it may produce garbage data. * * destination must contain enough space for the resulting vertex buffer (vertex_count * vertex_size bytes) + * vertex_size must be a multiple of 4 (and <= 256) */ MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size); @@ -343,10 +359,14 @@ MESHOPTIMIZER_API int meshopt_decodeVertexVersion(const unsigned char* buffer, s * * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M. * Each 32-bit component is decoded in isolation; stride must be divisible by 4. + * + * Experimental: meshopt_decodeFilterColor decodes YCoCg (+A) color encoding where RGB is converted to YCoCg space with variable bit quantization. + * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. */ MESHOPTIMIZER_API void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride); MESHOPTIMIZER_API void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride); MESHOPTIMIZER_API void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride); +MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride); /** * Vertex buffer filter encoders @@ -363,6 +383,10 @@ MESHOPTIMIZER_API void meshopt_decodeFilterExp(void* buffer, size_t count, size_ * meshopt_encodeFilterExp encodes arbitrary (finite) floating-point data with 8-bit exponent and K-bit integer mantissa (1 <= K <= 24). * Exponent can be shared between all components of a given vector as defined by stride or all values of a given component; stride must be divisible by 4. * Input data must contain stride/4 floats for every vector (count*stride/4 total). + * + * Experimental: meshopt_encodeFilterColor encodes RGBA color data by converting RGB to YCoCg color space with variable bit quantization. + * Each component is stored as an 8-bit or 16-bit integer; stride must be equal to 4 or 8. + * Input data must contain 4 floats for every color (count*4 total). 
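+ *
+ * Usage sketch for the color filter (hypothetical 'colors' array holding count*4 floats in [0..1]):
+ *   meshopt_encodeFilterColor(destination, count, 4, 8, colors); (4-byte stride, 8-bit YCoCg components)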
*/ enum meshopt_EncodeExpMode { @@ -379,6 +403,7 @@ enum meshopt_EncodeExpMode MESHOPTIMIZER_API void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data); MESHOPTIMIZER_API void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data); MESHOPTIMIZER_API void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode); +MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data); /** * Simplification options @@ -391,18 +416,34 @@ enum meshopt_SimplifySparse = 1 << 1, /* Treat error limit and resulting error as absolute instead of relative to mesh extents. */ meshopt_SimplifyErrorAbsolute = 1 << 2, - /* Experimental: remove disconnected parts of the mesh during simplification incrementally, regardless of the topological restrictions inside components. */ + /* Remove disconnected parts of the mesh during simplification incrementally, regardless of the topological restrictions inside components. */ meshopt_SimplifyPrune = 1 << 3, + /* Experimental: Produce more regular triangle sizes and shapes during simplification, at some cost to geometric quality. */ + meshopt_SimplifyRegularize = 1 << 4, + /* Experimental: Allow collapses across attribute discontinuities, except for vertices that are tagged with meshopt_SimplifyVertex_Protect in vertex_lock. */ + meshopt_SimplifyPermissive = 1 << 5, +}; + +/** + * Experimental: Simplification vertex flags/locks, for use in `vertex_lock` arrays in simplification APIs + */ +enum +{ + /* Do not move this vertex. */ + meshopt_SimplifyVertex_Lock = 1 << 0, + /* Protect attribute discontinuity at this vertex; must be used together with meshopt_SimplifyPermissive option. */ + meshopt_SimplifyVertex_Protect = 1 << 1, }; /** * Mesh simplifier * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error. - * If not all attributes from the input mesh are required, it's recommended to reindex the mesh without them prior to simplification. + * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification. * Returns the number of indices after simplification, with destination containing new index data + * * The resulting index buffer references vertices from the original vertex buffer. - * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. + * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. * * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)! * vertex_positions should have float3 position in the first 12 bytes of each vertex @@ -414,50 +455,86 @@ MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsig /** * Mesh simplifier with attribute metric - * The algorithm enhances meshopt_simplify by incorporating attribute values into the error metric used to prioritize simplification order; see meshopt_simplify documentation for details. 
- * Note that the number of attributes affects memory requirements and running time; this algorithm requires ~1.5x more memory and time compared to meshopt_simplify when using 4 scalar attributes. + * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible. + * Similar to meshopt_simplify, but incorporates attribute values into the error metric used to prioritize simplification order. + * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error. + * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification. + * Returns the number of indices after simplification, with destination containing new index data * + * The resulting index buffer references vertices from the original vertex buffer. + * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. + * Note that the number of attributes with non-zero weights affects memory requirements and running time. + * + * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)! + * vertex_positions should have float3 position in the first 12 bytes of each vertex * vertex_attributes should have attribute_count floats for each vertex * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position * attribute_count must be <= 32 * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved + * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1] + * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default + * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification */ MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error); +/** + * Experimental: Mesh simplifier with position/attribute update + * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible. + * Similar to meshopt_simplifyWithAttributes, but destructively updates positions and attribute values for optimal appearance. + * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error. + * If not all attributes from the input mesh are needed, it's recommended to reindex the mesh without them prior to simplification. + * Returns the number of indices after simplification, indices are destructively updated with new index data + * + * The updated index buffer references vertices from the original vertex buffer, however the vertex positions and attributes are updated in-place. + * Creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended; if the original vertex data is needed, it should be copied before simplification. 
+ * Note that the number of attributes with non-zero weights affects memory requirements and running time. Attributes with zero weights are not updated. + * + * vertex_positions should have float3 position in the first 12 bytes of each vertex + * vertex_attributes should have attribute_count floats for each vertex + * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position + * attribute_count must be <= 32 + * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved + * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1] + * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default + * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification + */ +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error); + /** * Experimental: Mesh simplifier (sloppy) * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance * The algorithm doesn't preserve mesh topology but can stop short of the target goal based on target error. * Returns the number of indices after simplification, with destination containing new index data * The resulting index buffer references vertices from the original vertex buffer. - * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. + * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. * * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)! * vertex_positions should have float3 position in the first 12 bytes of each vertex + * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; vertices that can't be moved should set 1 consistently for all indices with the same position * target_error represents the error relative to mesh extents that can be tolerated, e.g. 
0.01 = 1% deformation; value range [0..1] * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error); +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error); /** - * Experimental: Mesh simplifier (pruner) + * Mesh simplifier (pruner) * Reduces the number of triangles in the mesh by removing small isolated parts of the mesh * Returns the number of indices after simplification, with destination containing new index data * The resulting index buffer references vertices from the original vertex buffer. - * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. + * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. * * destination must contain enough space for the target index buffer, worst case is index_count elements * vertex_positions should have float3 position in the first 12 bytes of each vertex * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1] */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPrune(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error); +MESHOPTIMIZER_API size_t meshopt_simplifyPrune(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error); /** * Point cloud simplifier * Reduces the number of points in the cloud to reach the given target * Returns the number of points after simplification, with destination containing new index data * The resulting index buffer references vertices from the original vertex buffer. - * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. + * If the original vertex data isn't needed, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. 
* * destination must contain enough space for the target index buffer (target_vertex_count elements) * vertex_positions should have float3 position in the first 12 bytes of each vertex @@ -548,12 +625,12 @@ struct meshopt_CoverageStatistics }; /** - * Experimental: Coverage analyzer + * Coverage analyzer * Returns coverage statistics (ratio of viewport pixels covered from each axis) using a software rasterizer * * vertex_positions should have float3 position in the first 12 bytes of each vertex */ -MESHOPTIMIZER_EXPERIMENTAL struct meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +MESHOPTIMIZER_API struct meshopt_CoverageStatistics meshopt_analyzeCoverage(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); /** * Meshlet is a small mesh cluster (subset) that consists of: @@ -582,10 +659,10 @@ struct meshopt_Meshlet * When using buildMeshletsScan, for maximum efficiency the index buffer being converted has to be optimized for vertex cache first. * * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound - * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices - * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3 + * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!) + * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements * vertex_positions should have float3 position in the first 12 bytes of each vertex - * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; max_triangles must be divisible by 4) + * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512) * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency */ MESHOPTIMIZER_API size_t meshopt_buildMeshlets(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight); @@ -596,14 +673,13 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t m * Experimental: Meshlet builder with flexible cluster sizes * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but allows to specify minimum and maximum number of triangles per meshlet. * Clusters between min and max triangle counts are split when the cluster size would have exceeded the expected cluster size by more than split_factor. - * Additionally, allows to switch to axis aligned clusters by setting cone_weight to a negative value. * - * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (not max!) 
- * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices - * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3 + * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!) + * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!) + * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements * vertex_positions should have float3 position in the first 12 bytes of each vertex - * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles; both min_triangles and max_triangles must be divisible by 4) - * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency; additionally, cone_weight can be set to a negative value to prioritize axis aligned clusters (for raytracing) instead + * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles) + * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency * split_factor should be set to a non-negative value; when greater than 0, clusters that have large bounds may be split unless they are under the min_triangles threshold */ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor); @@ -612,14 +688,14 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsFlex(struct meshopt_Meshl * Experimental: Meshlet builder that produces clusters optimized for raytracing * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but optimizes cluster subdivision for raytracing and allows to specify minimum and maximum number of triangles per meshlet. * - * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (not max!) - * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices - * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3 + * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!) + * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!) 
+ * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements * vertex_positions should have float3 position in the first 12 bytes of each vertex - * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles; both min_triangles and max_triangles must be divisible by 4) + * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles) * fill_weight allows to prioritize clusters that are closer to maximum size at some cost to SAH quality; 0.5 is a safe default */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsSplit(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight); +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight); /** * Meshlet optimizer @@ -674,26 +750,26 @@ MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeClusterBounds(const unsig MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); /** - * Experimental: Sphere bounds generator + * Sphere bounds generator * Creates bounding sphere around a set of points or a set of spheres; returns the center and radius of the sphere, with other fields of the result set to 0. * * positions should have float3 position in the first 12 bytes of each element * radii can be NULL; when it's not NULL, it should have a non-negative float radius in the first 4 bytes of each element */ -MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride); +MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride); /** - * Experimental: Cluster partitioner + * Cluster partitioner * Partitions clusters into groups of similar size, prioritizing grouping clusters that share vertices or are close to each other. 
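// A small sketch of the now-stable sphere bounds API above (Point is an assumed struct with a
// float3 position in its first 12 bytes):
//
//   meshopt_Bounds sphere = meshopt_computeSphereBounds(&points[0].x, points.size(), sizeof(Point), NULL, 0);
//   // sphere.center and sphere.radius bound all points; pass a non-NULL radii array to bound a set of spheres instead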
* - * destination must contain enough space for the resulting partiotion data (cluster_count elements) + * destination must contain enough space for the resulting partition data (cluster_count elements) * destination[i] will contain the partition id for cluster i, with the total number of partitions returned by the function * cluster_indices should have the vertex indices referenced by each cluster, stored sequentially * cluster_index_counts should have the number of indices in each cluster; sum of all cluster_index_counts must be equal to total_index_count * vertex_positions should have float3 position in the first 12 bytes of each vertex (or can be NULL if not used) * target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size); +MESHOPTIMIZER_API size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size); /** * Spatial sorter @@ -706,13 +782,23 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_partitionClusters(unsigned int* destin MESHOPTIMIZER_API void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); /** - * Experimental: Spatial sorter + * Spatial sorter * Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache. * * destination must contain enough space for the resulting index buffer (index_count elements) * vertex_positions should have float3 position in the first 12 bytes of each vertex */ -MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +MESHOPTIMIZER_API void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); + +/** + * Spatial clusterizer + * Reorders points into clusters optimized for spatial locality, and generates a new index buffer. + * Ensures the output can be split into cluster_size chunks where each chunk has good positional locality. Only the last chunk will be smaller than cluster_size. 
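// A hedged sketch of the point clusterizer declared just below (Point is an assumed struct with
// a float3 position in its first 12 bytes):
//
//   std::vector<unsigned int> order(points.size());
//   meshopt_spatialClusterPoints(order.data(), &points[0].x, points.size(), sizeof(Point), 256);
//   // order now lists point indices; each consecutive run of 256 entries forms a spatially
//   // coherent cluster, and only the final run may be shorter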
+ *
+ * destination must contain enough space for the resulting index buffer (vertex_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ */
+MESHOPTIMIZER_API void meshopt_spatialClusterPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t cluster_size);

 /**
  * Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
@@ -813,13 +899,18 @@ template <typename T>
 inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count);
 template <typename T>
 inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
+inline size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level);
 template <typename T>
 inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
 template <typename T>
 inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
 template <typename T>
+inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = NULL);
+template <typename T>
 inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = NULL);
 template <typename T>
+inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error);
+template <typename T>
 inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index);
 template <typename T>
 inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index);
@@ -838,7 +929,7 @@ inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int*
 template <typename T>
 inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);
 template <typename T>
-inline size_t meshopt_buildMeshletsSplit(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t
min_triangles, size_t max_triangles, float fill_weight); +inline size_t meshopt_buildMeshletsSpatial(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight); template inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); template @@ -877,14 +968,21 @@ inline int meshopt_quantizeSnorm(float v, int N) class meshopt_Allocator { public: - template - struct StorageT + struct Storage { - static void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t); - static void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*); + void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t); + void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*); }; - typedef StorageT Storage; +#ifdef MESHOPTIMIZER_ALLOC_EXPORT + MESHOPTIMIZER_API static Storage& storage(); +#else + static Storage& storage() + { + static Storage s = {::operator new, ::operator delete }; + return s; + } +#endif meshopt_Allocator() : blocks() @@ -895,14 +993,14 @@ public: ~meshopt_Allocator() { for (size_t i = count; i > 0; --i) - Storage::deallocate(blocks[i - 1]); + storage().deallocate(blocks[i - 1]); } template T* allocate(size_t size) { assert(count < sizeof(blocks) / sizeof(blocks[0])); - T* result = static_cast(Storage::allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T))); + T* result = static_cast(storage().allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T))); blocks[count++] = result; return result; } @@ -910,7 +1008,7 @@ public: void deallocate(void* ptr) { assert(count > 0 && blocks[count - 1] == ptr); - Storage::deallocate(ptr); + storage().deallocate(ptr); count--; } @@ -918,12 +1016,6 @@ private: void* blocks[24]; size_t count; }; - -// This makes sure that allocate/deallocate are lazily generated in translation units that need them and are deduplicated by the linker -template -void* (MESHOPTIMIZER_ALLOC_CALLCONV* meshopt_Allocator::StorageT::allocate)(size_t) = operator new; -template -void (MESHOPTIMIZER_ALLOC_CALLCONV* meshopt_Allocator::StorageT::deallocate)(void*) = operator delete; #endif /* Inline implementation for C++ templated wrappers */ @@ -945,7 +1037,7 @@ struct meshopt_IndexAdapter { size_t size = count > size_t(-1) / sizeof(unsigned int) ? 
size_t(-1) : count * sizeof(unsigned int);

- data = static_cast<unsigned int*>(meshopt_Allocator::Storage::allocate(size));
+ data = static_cast<unsigned int*>(meshopt_Allocator::storage().allocate(size));

 if (input)
 {
@@ -962,7 +1054,7 @@ struct meshopt_IndexAdapter
 result[i] = T(data[i]);
 }

- meshopt_Allocator::Storage::deallocate(data);
+ meshopt_Allocator::storage().deallocate(data);
 }
};

@@ -1161,6 +1253,11 @@ inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const
 return meshopt_decodeIndexSequence(destination, index_count, sizeof(T), buffer, buffer_size);
}

+inline size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level)
+{
+ return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, level, -1);
+}
+
 template <typename T>
 inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error)
 {
@@ -1179,13 +1276,30 @@ inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, s
 return meshopt_simplifyWithAttributes(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, result_error);
}

+template <typename T>
+inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error)
+{
+ meshopt_IndexAdapter<unsigned int> inout(indices, indices, index_count);
+
+ return meshopt_simplifyWithUpdate(inout.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, result_error);
+}
+
 template <typename T>
 inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error)
 {
 meshopt_IndexAdapter<unsigned int> in(NULL, indices, index_count);
 meshopt_IndexAdapter<unsigned int> out(destination, NULL, index_count);

- return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, result_error);
+ return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, NULL, target_index_count, target_error, result_error);
+}
+
+template <typename T>
+inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error)
+{
+ meshopt_IndexAdapter<unsigned int> in(NULL, indices, index_count);
+ meshopt_IndexAdapter<unsigned int> out(destination, NULL, index_count);
+
+ return meshopt_simplifyPrune(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_error);
}

 template <typename T>
@@ -1263,11 +1377,11 @@ inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int*
}

template <typename T>
-inline size_t meshopt_buildMeshletsSplit(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight) +inline size_t meshopt_buildMeshletsSpatial(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight) { meshopt_IndexAdapter in(NULL, indices, index_count); - return meshopt_buildMeshletsSplit(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, fill_weight); + return meshopt_buildMeshletsSpatial(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, fill_weight); } template diff --git a/3rdparty/meshoptimizer/src/overdrawoptimizer.cpp b/3rdparty/meshoptimizer/src/overdrawoptimizer.cpp index cc22dbcff..682b924a9 100644 --- a/3rdparty/meshoptimizer/src/overdrawoptimizer.cpp +++ b/3rdparty/meshoptimizer/src/overdrawoptimizer.cpp @@ -10,24 +10,24 @@ namespace meshopt { -static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count) +static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count) { size_t vertex_stride_float = vertex_positions_stride / sizeof(float); float mesh_centroid[3] = {}; - for (size_t i = 0; i < index_count; ++i) + for (size_t i = 0; i < vertex_count; ++i) { - const float* p = vertex_positions + vertex_stride_float * indices[i]; + const float* p = vertex_positions + vertex_stride_float * i; mesh_centroid[0] += p[0]; mesh_centroid[1] += p[1]; mesh_centroid[2] += p[2]; } - mesh_centroid[0] /= index_count; - mesh_centroid[1] /= index_count; - mesh_centroid[2] /= index_count; + mesh_centroid[0] /= float(vertex_count); + mesh_centroid[1] /= float(vertex_count); + mesh_centroid[2] /= float(vertex_count); for (size_t cluster = 0; cluster < cluster_count; ++cluster) { @@ -306,7 +306,7 @@ void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* ind // fill sort data float* sort_data = allocator.allocate(cluster_count); - calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count); + calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, clusters, cluster_count); // sort clusters using sort data unsigned short* sort_keys = allocator.allocate(cluster_count); diff --git a/3rdparty/meshoptimizer/src/partition.cpp b/3rdparty/meshoptimizer/src/partition.cpp index 867211d66..3edc86442 100644 --- a/3rdparty/meshoptimizer/src/partition.cpp +++ b/3rdparty/meshoptimizer/src/partition.cpp @@ -5,6 +5,8 @@ #include #include +// This work is based on: +// Takio Kurita. An efficient agglomerative clustering algorithm using a heap. 
1991 namespace meshopt { diff --git a/3rdparty/meshoptimizer/src/simplifier.cpp b/3rdparty/meshoptimizer/src/simplifier.cpp index 1abcd51f5..f1effc38e 100644 --- a/3rdparty/meshoptimizer/src/simplifier.cpp +++ b/3rdparty/meshoptimizer/src/simplifier.cpp @@ -27,6 +27,7 @@ // Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003 // Peter Van Sandt, Yannis Chronis, Jignesh M. Patel. Efficiently Searching In-Memory Sorted Arrays: Revenge of the Interpolation Search? 2019 // Hugues Hoppe. New Quadric Metric for Simplifying Meshes with Appearance Attributes. 1999 +// Hugues Hoppe, Steve Marschner. Efficient Minimization of New Quadric Metric for Simplifying Meshes with Appearance Attributes. 2000 namespace meshopt { @@ -316,11 +317,13 @@ const unsigned char kCanCollapse[Kind_Count][Kind_Count] = { // if a vertex is manifold or seam, adjoining edges are guaranteed to have an opposite edge // note that for seam edges, the opposite edge isn't present in the attribute-based topology // but is present if you consider a position-only mesh variant +// while many complex collapses have the opposite edge, since complex vertices collapse to the +// same wedge, keeping opposite edges separate improves the quality by considering both targets const unsigned char kHasOpposite[Kind_Count][Kind_Count] = { - {1, 1, 1, 0, 1}, + {1, 1, 1, 1, 1}, {1, 0, 1, 0, 0}, {1, 1, 1, 0, 1}, - {0, 0, 0, 0, 0}, + {1, 0, 0, 0, 0}, {1, 0, 1, 0, 0}, }; @@ -336,6 +339,25 @@ static bool hasEdge(const EdgeAdjacency& adjacency, unsigned int a, unsigned int return false; } +static bool hasEdge(const EdgeAdjacency& adjacency, unsigned int a, unsigned int b, const unsigned int* remap, const unsigned int* wedge) +{ + unsigned int v = a; + + do + { + unsigned int count = adjacency.offsets[v + 1] - adjacency.offsets[v]; + const EdgeAdjacency::Edge* edges = adjacency.data + adjacency.offsets[v]; + + for (size_t i = 0; i < count; ++i) + if (remap[edges[i].next] == remap[b]) + return true; + + v = wedge[v]; + } while (v != a); + + return false; +} + static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned int* loopback, size_t vertex_count, const EdgeAdjacency& adjacency, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_lock, const unsigned int* sparse_remap, unsigned int options) { memset(loop, -1, vertex_count * sizeof(unsigned int)); @@ -394,6 +416,13 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned { result[i] = Kind_Manifold; } + else if (openi != ~0u && openo != ~0u && remap[openi] == remap[openo] && openi != i) + { + // classify half-seams as seams (the branch below would mis-classify them as borders) + // half-seam is a single vertex that connects to both vertices of a potential seam + // treating these as seams allows collapsing the "full" seam vertex onto them + result[i] = Kind_Seam; + } else if (openi != i && openo != i) { result[i] = Kind_Border; @@ -446,15 +475,50 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned } } + if (options & meshopt_SimplifyPermissive) + for (size_t i = 0; i < vertex_count; ++i) + if (result[i] == Kind_Seam || result[i] == Kind_Locked) + { + if (remap[i] != i) + { + // only process primary vertices; wedges will be updated to match the primary vertex + result[i] = result[remap[i]]; + continue; + } + + bool protect = false; + + // vertex_lock may protect any wedge, not 
just the primary vertex, so we switch to complex only if no wedges are protected + unsigned int v = unsigned(i); + do + { + unsigned int rv = sparse_remap ? sparse_remap[v] : v; + protect |= vertex_lock && (vertex_lock[rv] & meshopt_SimplifyVertex_Protect) != 0; + v = wedge[v]; + } while (v != i); + + // protect if any adjoining edge doesn't have an opposite edge (indicating vertex is on the border) + do + { + const EdgeAdjacency::Edge* edges = &adjacency.data[adjacency.offsets[v]]; + size_t count = adjacency.offsets[v + 1] - adjacency.offsets[v]; + + for (size_t j = 0; j < count; ++j) + protect |= !hasEdge(adjacency, edges[j].next, v, remap, wedge); + v = wedge[v]; + } while (v != i); + + result[i] = protect ? result[i] : int(Kind_Complex); + } + if (vertex_lock) { // vertex_lock may lock any wedge, not just the primary vertex, so we need to lock the primary vertex and relock any wedges for (size_t i = 0; i < vertex_count; ++i) { unsigned int ri = sparse_remap ? sparse_remap[i] : unsigned(i); - assert(vertex_lock[ri] <= 1); // values other than 0/1 are reserved for future use - if (vertex_lock[ri]) + if (vertex_lock[ri] & meshopt_SimplifyVertex_Lock) result[remap[i]] = Kind_Locked; } @@ -479,7 +543,7 @@ struct Vector3 float x, y, z; }; -static float rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* sparse_remap = NULL) +static float rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const unsigned int* sparse_remap = NULL, float* out_offset = NULL) { size_t vertex_stride_float = vertex_positions_stride / sizeof(float); @@ -525,6 +589,13 @@ static float rescalePositions(Vector3* result, const float* vertex_positions_dat } } + if (out_offset) + { + out_offset[0] = minv[0]; + out_offset[1] = minv[1]; + out_offset[2] = minv[2]; + } + return extent; } @@ -546,11 +617,45 @@ static void rescaleAttributes(float* result, const float* vertex_attributes_data } } +static void finalizeVertices(float* vertex_positions_data, size_t vertex_positions_stride, float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t vertex_count, const Vector3* vertex_positions, const float* vertex_attributes, const unsigned int* sparse_remap, const unsigned int* attribute_remap, float vertex_scale, const float* vertex_offset, const unsigned char* vertex_update) +{ + size_t vertex_positions_stride_float = vertex_positions_stride / sizeof(float); + size_t vertex_attributes_stride_float = vertex_attributes_stride / sizeof(float); + + for (size_t i = 0; i < vertex_count; ++i) + { + if (!vertex_update[i]) + continue; + + unsigned int ri = sparse_remap ? 
sparse_remap[i] : unsigned(i); + + const Vector3& p = vertex_positions[i]; + float* v = vertex_positions_data + ri * vertex_positions_stride_float; + + v[0] = p.x * vertex_scale + vertex_offset[0]; + v[1] = p.y * vertex_scale + vertex_offset[1]; + v[2] = p.z * vertex_scale + vertex_offset[2]; + + if (attribute_count) + { + const float* sa = vertex_attributes + i * attribute_count; + float* va = vertex_attributes_data + ri * vertex_attributes_stride_float; + + for (size_t k = 0; k < attribute_count; ++k) + { + unsigned int rk = attribute_remap[k]; + + va[rk] = sa[k] / attribute_weights[rk]; + } + } + } +} + static const size_t kMaxAttributes = 32; struct Quadric { - // a00*x^2 + a11*y^2 + a22*z^2 + 2*(a10*xy + a20*xz + a21*yz) + b0*x + b1*y + b2*z + c + // a00*x^2 + a11*y^2 + a22*z^2 + 2*a10*xy + 2*a20*xz + 2*a21*yz + 2*b0*x + 2*b1*y + 2*b2*z + c float a00, a11, a22; float a10, a20, a21; float b0, b1, b2, c; @@ -612,6 +717,14 @@ static void quadricAdd(Quadric& Q, const Quadric& R) Q.w += R.w; } +static void quadricAdd(QuadricGrad& G, const QuadricGrad& R) +{ + G.gx += R.gx; + G.gy += R.gy; + G.gz += R.gz; + G.gw += R.gw; +} + static void quadricAdd(QuadricGrad* G, const QuadricGrad* R, size_t attribute_count) { for (size_t k = 0; k < attribute_count; ++k) @@ -694,6 +807,17 @@ static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, flo Q.w = w; } +static void quadricFromPoint(Quadric& Q, float x, float y, float z, float w) +{ + Q.a00 = Q.a11 = Q.a22 = w; + Q.a10 = Q.a20 = Q.a21 = 0; + Q.b0 = -x * w; + Q.b1 = -y * w; + Q.b2 = -z * w; + Q.c = (x * x + y * y + z * z) * w; + Q.w = w; +} + static void quadricFromTriangle(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight) { Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z}; @@ -814,7 +938,112 @@ static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0, } } -static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap) +static void quadricVolumeGradient(QuadricGrad& G, const Vector3& p0, const Vector3& p1, const Vector3& p2) +{ + Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z}; + Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z}; + + // normal = cross(p1 - p0, p2 - p0) + Vector3 normal = {p10.y * p20.z - p10.z * p20.y, p10.z * p20.x - p10.x * p20.z, p10.x * p20.y - p10.y * p20.x}; + float area = normalize(normal) * 0.5f; + + G.gx = normal.x * area; + G.gy = normal.y * area; + G.gz = normal.z * area; + G.gw = (-p0.x * normal.x - p0.y * normal.y - p0.z * normal.z) * area; +} + +static bool quadricSolve(Vector3& p, const Quadric& Q, const QuadricGrad& GV) +{ + // solve A*p = -b where A is the quadric matrix and b is the linear term + float a00 = Q.a00, a11 = Q.a11, a22 = Q.a22; + float a10 = Q.a10, a20 = Q.a20, a21 = Q.a21; + float x0 = -Q.b0, x1 = -Q.b1, x2 = -Q.b2; + + float eps = 1e-6f * Q.w; + + // LDL decomposition: A = LDL^T + float d0 = a00; + float l10 = a10 / d0; + float l20 = a20 / d0; + + float d1 = a11 - a10 * l10; + float dl21 = a21 - a20 * l10; + float l21 = dl21 / d1; + + float d2 = a22 - a20 * l20 - dl21 * l21; + + // solve L*y = x + float y0 = x0; + float y1 = x1 - l10 * y0; + float y2 = x2 - l20 * y0 - l21 * y1; + + // solve D*z = y + float z0 = y0 / d0; + float z1 = y1 / d1; + float z2 = y2 / d2; + + // augment system with linear constraint GV using Lagrange multiplier + float a30 = GV.gx, a31 = GV.gy, a32 = GV.gz; + float x3 = -GV.gw; + + 
float l30 = a30 / d0; + float dl31 = a31 - a30 * l10; + float l31 = dl31 / d1; + float dl32 = a32 - a30 * l20 - dl31 * l21; + float l32 = dl32 / d2; + float d3 = 0.f - a30 * l30 - dl31 * l31 - dl32 * l32; + + float y3 = x3 - l30 * y0 - l31 * y1 - l32 * y2; + float z3 = fabsf(d3) > eps ? y3 / d3 : 0.f; // if d3 is zero, we can ignore the constraint + + // substitute L^T*p = z + float lambda = z3; + float pz = z2 - l32 * lambda; + float py = z1 - l21 * pz - l31 * lambda; + float px = z0 - l10 * py - l20 * pz - l30 * lambda; + + p.x = px; + p.y = py; + p.z = pz; + + return fabsf(d0) > eps && fabsf(d1) > eps && fabsf(d2) > eps; +} + +static void quadricReduceAttributes(Quadric& Q, const Quadric& A, const QuadricGrad* G, size_t attribute_count) +{ + // update vertex quadric with attribute quadric; multiply by vertex weight to minimize normalized error + Q.a00 += A.a00 * Q.w; + Q.a11 += A.a11 * Q.w; + Q.a22 += A.a22 * Q.w; + Q.a10 += A.a10 * Q.w; + Q.a20 += A.a20 * Q.w; + Q.a21 += A.a21 * Q.w; + Q.b0 += A.b0 * Q.w; + Q.b1 += A.b1 * Q.w; + Q.b2 += A.b2 * Q.w; + + float iaw = A.w == 0 ? 0.f : Q.w / A.w; + + // update linear system based on attribute gradients (BB^T/a) + for (size_t k = 0; k < attribute_count; ++k) + { + const QuadricGrad& g = G[k]; + + Q.a00 -= (g.gx * g.gx) * iaw; + Q.a11 -= (g.gy * g.gy) * iaw; + Q.a22 -= (g.gz * g.gz) * iaw; + Q.a10 -= (g.gx * g.gy) * iaw; + Q.a20 -= (g.gx * g.gz) * iaw; + Q.a21 -= (g.gy * g.gz) * iaw; + + Q.b0 -= (g.gx * g.gw) * iaw; + Q.b1 -= (g.gy * g.gw) * iaw; + Q.b2 -= (g.gz * g.gw) * iaw; + } +} + +static void fillFaceQuadrics(Quadric* vertex_quadrics, QuadricGrad* volume_gradients, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap) { for (size_t i = 0; i < index_count; i += 3) { @@ -828,6 +1057,36 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic quadricAdd(vertex_quadrics[remap[i0]], Q); quadricAdd(vertex_quadrics[remap[i1]], Q); quadricAdd(vertex_quadrics[remap[i2]], Q); + + if (volume_gradients) + { + QuadricGrad GV; + quadricVolumeGradient(GV, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2]); + + quadricAdd(volume_gradients[remap[i0]], GV); + quadricAdd(volume_gradients[remap[i1]], GV); + quadricAdd(volume_gradients[remap[i2]], GV); + } + } +} + +static void fillVertexQuadrics(Quadric* vertex_quadrics, const Vector3* vertex_positions, size_t vertex_count, const unsigned int* remap, unsigned int options) +{ + // by default, we use a very small weight to improve triangulation and numerical stability without affecting the shape or error + float factor = (options & meshopt_SimplifyRegularize) ? 
1e-1f : 1e-7f; + + for (size_t i = 0; i < vertex_count; ++i) + { + if (remap[i] != i) + continue; + + const Vector3& p = vertex_positions[i]; + float w = vertex_quadrics[i].w * factor; + + Quadric Q; + quadricFromPoint(Q, p.x, p.y, p.z, w); + + quadricAdd(vertex_quadrics[i], Q); } } @@ -857,15 +1116,11 @@ static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indic if ((k1 == Kind_Border || k1 == Kind_Seam) && loopback[i1] != i0) continue; - // seam edges should occur twice (i0->i1 and i1->i0) - skip redundant edges - if (kHasOpposite[k0][k1] && remap[i1] > remap[i0]) - continue; - unsigned int i2 = indices[i + next[e + 1]]; // we try hard to maintain border edge geometry; seam edges can move more freely // due to topological restrictions on collapses, seam quadrics slightly improves collapse structure but aren't critical - const float kEdgeWeightSeam = 1.f; + const float kEdgeWeightSeam = 0.5f; // applied twice due to opposite edges const float kEdgeWeightBorder = 10.f; float edgeWeight = (k0 == Kind_Border || k1 == Kind_Border) ? kEdgeWeightBorder : kEdgeWeightSeam; @@ -873,6 +1128,13 @@ static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indic Quadric Q; quadricFromTriangleEdge(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], edgeWeight); + Quadric QT; + quadricFromTriangle(QT, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], edgeWeight); + + // mix edge quadric with triangle quadric to stabilize collapses in both directions; both quadrics inherit edge weight so that their error is added + QT.w = 0; + quadricAdd(Q, QT); + quadricAdd(vertex_quadrics[remap[i0]], Q); quadricAdd(vertex_quadrics[remap[i1]], Q); } @@ -954,6 +1216,50 @@ static bool hasTriangleFlips(const EdgeAdjacency& adjacency, const Vector3* vert return false; } +static bool hasTriangleFlips(const EdgeAdjacency& adjacency, const Vector3* vertex_positions, unsigned int i0, const Vector3& v1) +{ + const Vector3& v0 = vertex_positions[i0]; + + const EdgeAdjacency::Edge* edges = &adjacency.data[adjacency.offsets[i0]]; + size_t count = adjacency.offsets[i0 + 1] - adjacency.offsets[i0]; + + for (size_t i = 0; i < count; ++i) + { + unsigned int a = edges[i].next, b = edges[i].prev; + + if (hasTriangleFlip(vertex_positions[a], vertex_positions[b], v0, v1)) + return true; + } + + return false; +} + +static float getNeighborhoodRadius(const EdgeAdjacency& adjacency, const Vector3* vertex_positions, unsigned int i0) +{ + const Vector3& v0 = vertex_positions[i0]; + + const EdgeAdjacency::Edge* edges = &adjacency.data[adjacency.offsets[i0]]; + size_t count = adjacency.offsets[i0 + 1] - adjacency.offsets[i0]; + + float result = 0.f; + + for (size_t i = 0; i < count; ++i) + { + unsigned int a = edges[i].next, b = edges[i].prev; + + const Vector3& va = vertex_positions[a]; + const Vector3& vb = vertex_positions[b]; + + float da = (va.x - v0.x) * (va.x - v0.x) + (va.y - v0.y) * (va.y - v0.y) + (va.z - v0.z) * (va.z - v0.z); + float db = (vb.x - v0.x) * (vb.x - v0.x) + (vb.y - v0.y) * (vb.y - v0.y) + (vb.z - v0.z) * (vb.z - v0.z); + + result = result < da ? da : result; + result = result < db ? 
db : result; + } + + return sqrtf(result); +} + static size_t boundEdgeCollapses(const EdgeAdjacency& adjacency, size_t vertex_count, size_t index_count, unsigned char* vertex_kind) { size_t dual_count = 0; @@ -1008,19 +1314,11 @@ static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, c // two vertices are on a border or a seam, but there's no direct edge between them // this indicates that they belong to two different edge loops and we should not collapse this edge - // loop[] tracks half edges so we only need to check i0->i1 - if (k0 == k1 && (k0 == Kind_Border || k0 == Kind_Seam) && loop[i0] != i1) + // loop[] and loopback[] track half edges so we only need to check one of them + if ((k0 == Kind_Border || k0 == Kind_Seam) && k1 != Kind_Manifold && loop[i0] != i1) + continue; + if ((k1 == Kind_Border || k1 == Kind_Seam) && k0 != Kind_Manifold && loopback[i1] != i0) continue; - - if (k0 == Kind_Locked || k1 == Kind_Locked) - { - // the same check as above, but for border/seam -> locked collapses - // loop[] and loopback[] track half edges so we only need to check one of them - if ((k0 == Kind_Border || k0 == Kind_Seam) && loop[i0] != i1) - continue; - if ((k1 == Kind_Border || k1 == Kind_Seam) && loopback[i1] != i0) - continue; - } // edge can be collapsed in either direction - we will pick the one with minimum error // note: we evaluate error later during collapse ranking, here we just tag the edge as bidirectional @@ -1052,14 +1350,10 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const unsigned int i0 = c.v0; unsigned int i1 = c.v1; - - // most edges are bidirectional which means we need to evaluate errors for two collapses - // to keep this code branchless we just use the same edge for unidirectional edges - unsigned int j0 = c.bidi ? i1 : i0; - unsigned int j1 = c.bidi ? i0 : i1; + bool bidi = c.bidi; float ei = quadricError(vertex_quadrics[remap[i0]], vertex_positions[i1]); - float ej = c.bidi ? quadricError(vertex_quadrics[remap[j0]], vertex_positions[j1]) : FLT_MAX; + float ej = bidi ? quadricError(vertex_quadrics[remap[i1]], vertex_positions[i0]) : FLT_MAX; #if TRACE >= 3 float di = ei, dj = ej; @@ -1068,39 +1362,53 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const if (attribute_count) { ei += quadricError(attribute_quadrics[i0], &attribute_gradients[i0 * attribute_count], attribute_count, vertex_positions[i1], &vertex_attributes[i1 * attribute_count]); - ej += c.bidi ? quadricError(attribute_quadrics[j0], &attribute_gradients[j0 * attribute_count], attribute_count, vertex_positions[j1], &vertex_attributes[j1 * attribute_count]) : 0; + ej += bidi ? quadricError(attribute_quadrics[i1], &attribute_gradients[i1 * attribute_count], attribute_count, vertex_positions[i0], &vertex_attributes[i0 * attribute_count]) : 0; - // note: seam edges need to aggregate attribute errors between primary and secondary edges, as attribute quadrics are separate + // seam edges need to aggregate attribute errors between primary and secondary edges, as attribute quadrics are separate if (vertex_kind[i0] == Kind_Seam) { // for seam collapses we need to find the seam pair; this is a bit tricky since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges) unsigned int s0 = wedge[i0]; unsigned int s1 = loop[i0] == i1 ? 
loopback[s0] : loop[s0]; - assert(s0 != i0 && wedge[s0] == i0); + assert(wedge[s0] == i0); // s0 may be equal to i0 for half-seams assert(s1 != ~0u && remap[s1] == remap[i1]); // note: this should never happen due to the assertion above, but when disabled if we ever hit this case we'll get a memory safety issue; for now play it safe s1 = (s1 != ~0u) ? s1 : wedge[i1]; ei += quadricError(attribute_quadrics[s0], &attribute_gradients[s0 * attribute_count], attribute_count, vertex_positions[s1], &vertex_attributes[s1 * attribute_count]); - ej += c.bidi ? quadricError(attribute_quadrics[s1], &attribute_gradients[s1 * attribute_count], attribute_count, vertex_positions[s0], &vertex_attributes[s0 * attribute_count]) : 0; + ej += bidi ? quadricError(attribute_quadrics[s1], &attribute_gradients[s1 * attribute_count], attribute_count, vertex_positions[s0], &vertex_attributes[s0 * attribute_count]) : 0; + } + else + { + // complex edges can have multiple wedges, so we need to aggregate errors for all wedges + // this is different from seams (where we aggregate pairwise) because all wedges collapse onto the same target + if (vertex_kind[i0] == Kind_Complex) + for (unsigned int v = wedge[i0]; v != i0; v = wedge[v]) + ei += quadricError(attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count, vertex_positions[i1], &vertex_attributes[i1 * attribute_count]); + + if (vertex_kind[i1] == Kind_Complex && bidi) + for (unsigned int v = wedge[i1]; v != i1; v = wedge[v]) + ej += quadricError(attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count, vertex_positions[i0], &vertex_attributes[i0 * attribute_count]); } } - // pick edge direction with minimal error - c.v0 = ei <= ej ? i0 : j0; - c.v1 = ei <= ej ? i1 : j1; - c.error = ei <= ej ? ei : ej; + // pick edge direction with minimal error (branchless) + bool rev = bidi & (ej < ei); + + c.v0 = rev ? i1 : i0; + c.v1 = rev ? i0 : i1; + c.error = ej < ei ? ej : ei; #if TRACE >= 3 - if (i0 == j0) // c.bidi has been overwritten - printf("edge eval %d -> %d: error %f (pos %f, attr %f)\n", c.v0, c.v1, - sqrtf(c.error), sqrtf(ei <= ej ? di : dj), sqrtf(ei <= ej ? ei - di : ej - dj)); + if (bidi) + printf("edge eval %d -> %d: error %f (pos %f, attr %f); reverse %f (pos %f, attr %f)\n", + rev ? i1 : i0, rev ? i0 : i1, + sqrtf(rev ? ej : ei), sqrtf(rev ? dj : di), sqrtf(rev ? ej - dj : ei - di), + sqrtf(rev ? ei : ej), sqrtf(rev ? di : dj), sqrtf(rev ? ei - di : ej - dj)); else - printf("edge eval %d -> %d: error %f (pos %f, attr %f); reverse %f (pos %f, attr %f)\n", c.v0, c.v1, - sqrtf(ei <= ej ? ei : ej), sqrtf(ei <= ej ? di : dj), sqrtf(ei <= ej ? ei - di : ej - dj), - sqrtf(ei <= ej ? ej : ei), sqrtf(ei <= ej ? dj : di), sqrtf(ei <= ej ? ej - dj : ei - di)); + printf("edge eval %d -> %d: error %f (pos %f, attr %f)\n", i0, i1, sqrtf(c.error), sqrtf(di), sqrtf(ei - di)); #endif } } @@ -1243,7 +1551,7 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* // for seam collapses we need to move the seam pair together; this is a bit tricky since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges) unsigned int s0 = wedge[i0]; unsigned int s1 = loop[i0] == i1 ? 
loopback[s0] : loop[s0]; - assert(s0 != i0 && wedge[s0] == i0); + assert(wedge[s0] == i0); // s0 may be equal to i0 for half-seams assert(s1 != ~0u && remap[s1] == r1); // additional asserts to verify that the seam pair is consistent @@ -1289,7 +1597,7 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* return edge_collapses; } -static void updateQuadrics(const unsigned int* collapse_remap, size_t vertex_count, Quadric* vertex_quadrics, Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, size_t attribute_count, const Vector3* vertex_positions, const unsigned int* remap, float& vertex_error) +static void updateQuadrics(const unsigned int* collapse_remap, size_t vertex_count, Quadric* vertex_quadrics, QuadricGrad* volume_gradients, Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, size_t attribute_count, const Vector3* vertex_positions, const unsigned int* remap, float& vertex_error) { for (size_t i = 0; i < vertex_count; ++i) { @@ -1304,8 +1612,13 @@ static void updateQuadrics(const unsigned int* collapse_remap, size_t vertex_cou // ensure we only update vertex_quadrics once: primary vertex must be moved if any wedge is moved if (i0 == r0) + { quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]); + if (volume_gradients) + quadricAdd(volume_gradients[r1], volume_gradients[r0]); + } + if (attribute_count) { quadricAdd(attribute_quadrics[i1], attribute_quadrics[i0]); @@ -1321,7 +1634,116 @@ static void updateQuadrics(const unsigned int* collapse_remap, size_t vertex_cou } } -static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const unsigned int* collapse_remap) +static void solveQuadrics(Vector3* vertex_positions, float* vertex_attributes, size_t vertex_count, const Quadric* vertex_quadrics, const QuadricGrad* volume_gradients, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const EdgeAdjacency& adjacency, const unsigned char* vertex_kind, const unsigned char* vertex_update) +{ +#if TRACE + size_t stats[5] = {}; +#endif + + for (size_t i = 0; i < vertex_count; ++i) + { + if (!vertex_update[i]) + continue; + + // moving externally locked vertices is prohibited + // moving vertices on an attribute discontinuity may result in extrapolating UV outside of the chart bounds + // moving vertices on a border requires a stronger edge quadric to preserve the border geometry + if (vertex_kind[i] == Kind_Locked || vertex_kind[i] == Kind_Seam || vertex_kind[i] == Kind_Border) + continue; + + if (remap[i] != i) + { + vertex_positions[i] = vertex_positions[remap[i]]; + continue; + } + + TRACESTATS(0); + + const Vector3& vp = vertex_positions[i]; + + Quadric Q = vertex_quadrics[i]; + QuadricGrad GV = {}; + + // add a point quadric for regularization to stabilize the solution + Quadric R; + quadricFromPoint(R, vp.x, vp.y, vp.z, Q.w * 1e-4f); + quadricAdd(Q, R); + + if (attribute_count) + { + // optimal point simultaneously minimizes attribute quadrics for all wedges + unsigned int v = unsigned(i); + do + { + quadricReduceAttributes(Q, attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count); + v = wedge[v]; + } while (v != i); + + // minimizing attribute quadrics results in volume loss so we incorporate volume gradient as a constraint + if (volume_gradients) + GV = volume_gradients[i]; + } + + Vector3 p; + if (!quadricSolve(p, Q, GV)) + { + TRACESTATS(2); + continue; + } + + // reject updates that move the 
vertex too far from its neighborhood + // this detects and fixes most cases when the quadric is not well-defined + float nr = getNeighborhoodRadius(adjacency, vertex_positions, unsigned(i)); + float dp = (p.x - vp.x) * (p.x - vp.x) + (p.y - vp.y) * (p.y - vp.y) + (p.z - vp.z) * (p.z - vp.z); + + if (dp > nr * nr) + { + TRACESTATS(3); + continue; + } + + // reject updates that would flip a neighboring triangle, as we do for edge collapse + if (hasTriangleFlips(adjacency, vertex_positions, unsigned(i), p)) + { + TRACESTATS(4); + continue; + } + + TRACESTATS(1); + vertex_positions[i] = p; + } + +#if TRACE + printf("updated %d/%d positions; failed solve %d bounds %d flip %d\n", int(stats[1]), int(stats[0]), int(stats[2]), int(stats[3]), int(stats[4])); +#endif + + if (attribute_count == 0) + return; + + for (size_t i = 0; i < vertex_count; ++i) + { + if (!vertex_update[i]) + continue; + + // updating externally locked vertices is prohibited + if (vertex_kind[i] == Kind_Locked) + continue; + + const Vector3& p = vertex_positions[remap[i]]; + const Quadric& A = attribute_quadrics[i]; + + float iw = A.w == 0 ? 0.f : 1.f / A.w; + + for (size_t k = 0; k < attribute_count; ++k) + { + const QuadricGrad& G = attribute_gradients[i * attribute_count + k]; + + vertex_attributes[i * attribute_count + k] = (G.gx * p.x + G.gy * p.y + G.gz * p.z + G.gw) * iw; + } + } +} + +static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const unsigned int* collapse_remap, const unsigned int* remap) { size_t write = 0; @@ -1336,7 +1758,14 @@ static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const assert(collapse_remap[v1] == v1); assert(collapse_remap[v2] == v2); - if (v0 != v1 && v0 != v2 && v1 != v2) + // collapse zero area triangles even if they are not topologically degenerate + // this is required to cleanup manifold->seam collapses when a vertex is collapsed onto a seam pair + // as well as complex collapses and some other cases where cross wedge collapses are performed + unsigned int r0 = remap[v0]; + unsigned int r1 = remap[v1]; + unsigned int r2 = remap[v2]; + + if (r0 != r1 && r0 != r2 && r1 != r2) { indices[write + 0] = v0; indices[write + 1] = v1; @@ -1494,18 +1923,24 @@ static void measureComponents(float* component_errors, size_t component_count, c static size_t pruneComponents(unsigned int* indices, size_t index_count, const unsigned int* components, const float* component_errors, size_t component_count, float error_cutoff, float& nexterror) { + (void)component_count; + size_t write = 0; + float min_error = FLT_MAX; for (size_t i = 0; i < index_count; i += 3) { - unsigned int c = components[indices[i]]; - assert(c == components[indices[i + 1]] && c == components[indices[i + 2]]); + unsigned int v0 = indices[i + 0], v1 = indices[i + 1], v2 = indices[i + 2]; + unsigned int c = components[v0]; + assert(c == components[v1] && c == components[v2]); if (component_errors[c] > error_cutoff) { - indices[write + 0] = indices[i + 0]; - indices[write + 1] = indices[i + 1]; - indices[write + 2] = indices[i + 2]; + min_error = min_error > component_errors[c] ? 
component_errors[c] : min_error; + + indices[write + 0] = v0; + indices[write + 1] = v1; + indices[write + 2] = v2; write += 3; } } @@ -1515,15 +1950,11 @@ static size_t pruneComponents(unsigned int* indices, size_t index_count, const u for (size_t i = 0; i < component_count; ++i) pruned_components += (component_errors[i] >= nexterror && component_errors[i] <= error_cutoff); - printf("pruned %d triangles in %d components (goal %e)\n", int((index_count - write) / 3), int(pruned_components), sqrtf(error_cutoff)); + printf("pruned %d triangles in %d components (goal %e); next %e\n", int((index_count - write) / 3), int(pruned_components), sqrtf(error_cutoff), min_error < FLT_MAX ? sqrtf(min_error) : min_error * 2); #endif - // update next error with the smallest error of the remaining components for future pruning - nexterror = FLT_MAX; - for (size_t i = 0; i < component_count; ++i) - if (component_errors[i] > error_cutoff) - nexterror = nexterror > component_errors[i] ? component_errors[i] : nexterror; - + // update next error with the smallest error of the remaining components + nexterror = min_error; return write; } @@ -1588,7 +2019,7 @@ struct TriangleHasher } }; -static void computeVertexIds(unsigned int* vertex_ids, const Vector3* vertex_positions, size_t vertex_count, int grid_size) +static void computeVertexIds(unsigned int* vertex_ids, const Vector3* vertex_positions, const unsigned char* vertex_lock, size_t vertex_count, int grid_size) { assert(grid_size >= 1 && grid_size <= 1024); float cell_scale = float(grid_size - 1); @@ -1601,7 +2032,10 @@ static void computeVertexIds(unsigned int* vertex_ids, const Vector3* vertex_pos int yi = int(v.y * cell_scale + 0.5f); int zi = int(v.z * cell_scale + 0.5f); - vertex_ids[i] = (xi << 20) | (yi << 10) | zi; + if (vertex_lock && (vertex_lock[i] & meshopt_SimplifyVertex_Lock)) + vertex_ids[i] = (1 << 30) | unsigned(i); + else + vertex_ids[i] = (xi << 20) | (yi << 10) | zi; } } @@ -1835,9 +2269,10 @@ static float interpolate(float y, float x0, float y0, float x1, float y1, float } // namespace meshopt -// Note: this is only exposed for debug visualization purposes; do *not* use +// Note: this is only exposed for development purposes; do *not* use enum { + meshopt_SimplifyInternalSolve = 1 << 29, meshopt_SimplifyInternalDebug = 1 << 30 }; @@ -1850,7 +2285,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic assert(vertex_positions_stride % sizeof(float) == 0); assert(target_index_count <= index_count); assert(target_error >= 0); - assert((options & ~(meshopt_SimplifyLockBorder | meshopt_SimplifySparse | meshopt_SimplifyErrorAbsolute | meshopt_SimplifyPrune | meshopt_SimplifyInternalDebug)) == 0); + assert((options & ~(meshopt_SimplifyLockBorder | meshopt_SimplifySparse | meshopt_SimplifyErrorAbsolute | meshopt_SimplifyPrune | meshopt_SimplifyRegularize | meshopt_SimplifyPermissive | meshopt_SimplifyInternalSolve | meshopt_SimplifyInternalDebug)) == 0); assert(vertex_attributes_stride >= attribute_count * sizeof(float) && vertex_attributes_stride <= 256); assert(vertex_attributes_stride % sizeof(float) == 0); assert(attribute_count <= kMaxAttributes); @@ -1902,14 +2337,14 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic #endif Vector3* vertex_positions = allocator.allocate(vertex_count); - float vertex_scale = rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride, sparse_remap); + float vertex_offset[3] = {}; + float vertex_scale = 
rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride, sparse_remap, vertex_offset); float* vertex_attributes = NULL; + unsigned int attribute_remap[kMaxAttributes]; if (attribute_count) { - unsigned int attribute_remap[kMaxAttributes]; - // remap attributes to only include ones with weight > 0 to minimize memory/compute overhead for quadrics size_t attributes_used = 0; for (size_t i = 0; i < attribute_count; ++i) @@ -1926,6 +2361,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic Quadric* attribute_quadrics = NULL; QuadricGrad* attribute_gradients = NULL; + QuadricGrad* volume_gradients = NULL; if (attribute_count) { @@ -1934,9 +2370,16 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic attribute_gradients = allocator.allocate(vertex_count * attribute_count); memset(attribute_gradients, 0, vertex_count * attribute_count * sizeof(QuadricGrad)); + + if (options & meshopt_SimplifyInternalSolve) + { + volume_gradients = allocator.allocate(vertex_count); + memset(volume_gradients, 0, vertex_count * sizeof(QuadricGrad)); + } } - fillFaceQuadrics(vertex_quadrics, result, index_count, vertex_positions, remap); + fillFaceQuadrics(vertex_quadrics, volume_gradients, result, index_count, vertex_positions, remap); + fillVertexQuadrics(vertex_quadrics, vertex_positions, vertex_count, remap, options); fillEdgeQuadrics(vertex_quadrics, result, index_count, vertex_positions, remap, vertex_kind, loop, loopback); if (attribute_count) @@ -2016,23 +2459,26 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic if (collapses == 0) break; - updateQuadrics(collapse_remap, vertex_count, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, vertex_positions, remap, vertex_error); + updateQuadrics(collapse_remap, vertex_count, vertex_quadrics, volume_gradients, attribute_quadrics, attribute_gradients, attribute_count, vertex_positions, remap, vertex_error); // updateQuadrics will update vertex error if we use attributes, but if we don't then result_error and vertex_error are equivalent vertex_error = attribute_count == 0 ? 
result_error : vertex_error; + // note: we update loops following edge collapses, but after this we might still have stale loop data + // this can happen when a triangle with a loop edge gets collapsed along a non-loop edge + // that works since a loop that points to a vertex that is no longer connected is not affecting collapse logic remapEdgeLoops(loop, vertex_count, collapse_remap); remapEdgeLoops(loopback, vertex_count, collapse_remap); - size_t new_count = remapIndexBuffer(result, result_count, collapse_remap); - assert(new_count < result_count); - - result_count = new_count; + result_count = remapIndexBuffer(result, result_count, collapse_remap, remap); if ((options & meshopt_SimplifyPrune) && result_count > target_index_count && component_nexterror <= vertex_error) result_count = pruneComponents(result, result_count, components, component_errors, component_count, vertex_error, component_nexterror); } + // at this point, component_nexterror might be stale: component it references may have been removed through a series of edge collapses + bool component_nextstale = true; + // we're done with the regular simplification but we're still short of the target; try pruning more aggressively towards error_limit while ((options & meshopt_SimplifyPrune) && result_count > target_index_count && component_nexterror <= error_limit) { @@ -2049,18 +2495,42 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic component_maxerror = component_errors[i]; size_t new_count = pruneComponents(result, result_count, components, component_errors, component_count, component_cutoff, component_nexterror); - if (new_count == result_count) + if (new_count == result_count && !component_nextstale) break; + component_nextstale = false; // pruneComponents guarantees next error is up to date result_count = new_count; result_error = result_error < component_maxerror ? component_maxerror : result_error; vertex_error = vertex_error < component_maxerror ? 
component_maxerror : vertex_error; } #if TRACE - printf("result: %d triangles, error: %e; total %d passes\n", int(result_count / 3), sqrtf(result_error), int(pass_count)); + printf("result: %d triangles, error: %e (pos %.3e); total %d passes\n", int(result_count / 3), sqrtf(result_error), sqrtf(vertex_error), int(pass_count)); #endif + // if solve is requested, update input buffers destructively from internal data + if (options & meshopt_SimplifyInternalSolve) + { + unsigned char* vertex_update = collapse_locked; // reuse as scratch space + memset(vertex_update, 0, vertex_count); + + // limit quadric solve to vertices that are still used in the result + for (size_t i = 0; i < result_count; ++i) + { + unsigned int v = result[i]; + + // recomputing externally locked vertices may result in floating point drift + vertex_update[v] = vertex_kind[v] != Kind_Locked; + } + + // edge adjacency may be stale as we haven't updated it after last series of edge collapses + updateEdgeAdjacency(adjacency, result, result_count, vertex_count, remap); + + solveQuadrics(vertex_positions, vertex_attributes, vertex_count, vertex_quadrics, volume_gradients, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, adjacency, vertex_kind, vertex_update); + + finalizeVertices(const_cast(vertex_positions_data), vertex_positions_stride, const_cast(vertex_attributes_data), vertex_attributes_stride, attribute_weights, attribute_count, vertex_count, vertex_positions, vertex_attributes, sparse_remap, attribute_remap, vertex_scale, vertex_offset, vertex_update); + } + // if debug visualization data is requested, fill it instead of index data; for simplicity, this doesn't work with sparsity if ((options & meshopt_SimplifyInternalDebug) && !sparse_remap) { @@ -2090,15 +2560,24 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* out_result_error) { + assert((options & meshopt_SimplifyInternalSolve) == 0); // use meshopt_simplifyWithUpdate instead + return meshopt_simplifyEdge(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, NULL, 0, NULL, 0, NULL, target_index_count, target_error, options, out_result_error); } size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* out_result_error) { + assert((options & meshopt_SimplifyInternalSolve) == 0); // use meshopt_simplifyWithUpdate instead + return meshopt_simplifyEdge(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, vertex_attributes_data, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options, out_result_error); } -size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float 
target_error, float* out_result_error) +size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* out_result_error) +{ + return meshopt_simplifyEdge(indices, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, vertex_attributes_data, vertex_attributes_stride, attribute_weights, attribute_count, vertex_lock, target_index_count, target_error, options | meshopt_SimplifyInternalSolve, out_result_error); +}
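The wrapper above is the intended entry point for the destructive solve path. A minimal usage sketch (illustrative only; `indices`, `positions`, `uvs`, `vertex_count`, and `target_index_count` are assumed caller-owned std::vector buffers and sizes, not names from this patch):

// simplify in place: indices is both input and output, and surviving vertices
// have their positions/UVs re-solved from the accumulated quadrics
float uv_weights[2] = {0.5f, 0.5f};
float result_error = 0.f;

size_t new_index_count = meshopt_simplifyWithUpdate(
    indices.data(), indices.size(),
    positions.data(), vertex_count, sizeof(float) * 3,
    uvs.data(), sizeof(float) * 2, uv_weights, 2,
    /* vertex_lock= */ NULL,
    target_index_count, /* target_error= */ 1e-2f, /* options= */ 0,
    &result_error);

indices.resize(new_index_count);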
+ +size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* out_result_error) { using namespace meshopt; @@ -2126,15 +2605,15 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind const int kInterpolationPasses = 5; // invariant: # of triangles in min_grid <= target_count - int min_grid = int(1.f / (target_error < 1e-3f ? 1e-3f : target_error)); + int min_grid = int(1.f / (target_error < 1e-3f ? 1e-3f : (target_error < 1.f ? target_error : 1.f))); int max_grid = 1025; size_t min_triangles = 0; size_t max_triangles = index_count / 3; // when we're error-limited, we compute the triangle count for the min. size; this accelerates convergence and provides the correct answer when we can't use a larger grid - if (min_grid > 1) + if (min_grid > 1 || vertex_lock) { - computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid); + computeVertexIds(vertex_ids, vertex_positions, vertex_lock, vertex_count, min_grid); min_triangles = countTriangles(vertex_ids, indices, index_count); } @@ -2150,7 +2629,7 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind int grid_size = next_grid_size; grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid ? max_grid - 1 : grid_size); - computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size); + computeVertexIds(vertex_ids, vertex_positions, vertex_lock, vertex_count, grid_size); size_t triangles = countTriangles(vertex_ids, indices, index_count); #if TRACE @@ -2192,7 +2671,7 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind unsigned int* vertex_cells = allocator.allocate<unsigned int>(vertex_count); - computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid); + computeVertexIds(vertex_ids, vertex_positions, vertex_lock, vertex_count, min_grid); size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count); // build a quadric for each target cell @@ -2213,15 +2692,15 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind for (size_t i = 0; i < cell_count; ++i) result_error = result_error < cell_errors[i] ? cell_errors[i] : result_error; - // collapse triangles! - // note that we need to filter out triangles that we've already output because we very frequently generate redundant triangles between cells :( + // vertex collapses often result in duplicate triangles; we need a table to filter them out size_t tritable_size = hashBuckets2(min_triangles); unsigned int* tritable = allocator.allocate<unsigned int>(tritable_size); + // note: this is the first and last write to destination, which allows aliasing destination with indices size_t write = filterTriangles(destination, tritable, tritable_size, indices, index_count, vertex_cells, cell_remap); #if TRACE - printf("result: %d cells, %d triangles (%d unfiltered), error %e\n", int(cell_count), int(write / 3), int(min_triangles), sqrtf(result_error)); + printf("result: grid size %d, %d cells, %d triangles (%d unfiltered), error %e\n", min_grid, int(cell_count), int(write / 3), int(min_triangles), sqrtf(result_error)); #endif if (out_result_error) @@ -2316,7 +2795,7 @@ size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_pos int grid_size = next_grid_size; grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid ? max_grid - 1 : grid_size); - computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size); + computeVertexIds(vertex_ids, vertex_positions, NULL, vertex_count, grid_size); size_t vertices = countVertexCells(table, table_size, vertex_ids, vertex_count); #if TRACE @@ -2353,7 +2832,7 @@ size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_pos // build vertex->cell association by mapping all vertices with the same quantized position to the same cell unsigned int* vertex_cells = allocator.allocate<unsigned int>(vertex_count); - computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid); + computeVertexIds(vertex_ids, vertex_positions, NULL, vertex_count, min_grid); size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count); // accumulate points into a reservoir for each target cell diff --git a/3rdparty/meshoptimizer/src/spatialorder.cpp b/3rdparty/meshoptimizer/src/spatialorder.cpp index 43aa7bcaf..b65627900 100644 --- a/3rdparty/meshoptimizer/src/spatialorder.cpp +++ b/3rdparty/meshoptimizer/src/spatialorder.cpp @@ -22,7 +22,7 @@ inline unsigned long long part1By2(unsigned long long x) return x; } -static void computeOrder(unsigned long long* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride) +static void computeOrder(unsigned long long* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, bool morton) { size_t vertex_stride_float = vertex_positions_stride / sizeof(float); @@ -60,61 +60,158 @@ static void computeOrder(unsigned long long* result, const float* vertex_positio int y = int((v[1] - minv[1]) * scale + 0.5f); int z = int((v[2] - minv[2]) * scale + 0.5f); - result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2); + if (morton) + result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2); + else + result[i] = ((unsigned long long)x << 0) | ((unsigned long long)y << 20) | ((unsigned long long)z << 40); } } -static void computeHistogram(unsigned int (&hist)[1024][5], const unsigned long long* data, size_t count) +static void radixSort10(unsigned int* destination, const unsigned int* source, const unsigned short* keys, size_t count) { + unsigned int hist[1024]; memset(hist, 0, sizeof(hist)); - // compute 5 10-bit histograms in parallel + // compute histogram (assume keys are 10-bit) for
(size_t i = 0; i < count; ++i) - { - unsigned long long id = data[i]; + hist[keys[i]]++; - hist[(id >> 0) & 1023][0]++; - hist[(id >> 10) & 1023][1]++; - hist[(id >> 20) & 1023][2]++; - hist[(id >> 30) & 1023][3]++; - hist[(id >> 40) & 1023][4]++; - } - - unsigned int sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0, sum4 = 0; + unsigned int sum = 0; // replace histogram data with prefix histogram sums in-place for (int i = 0; i < 1024; ++i) { - unsigned int h0 = hist[i][0], h1 = hist[i][1], h2 = hist[i][2], h3 = hist[i][3], h4 = hist[i][4]; + unsigned int h = hist[i]; + hist[i] = sum; + sum += h; + } + + assert(sum == count); + + // reorder values + for (size_t i = 0; i < count; ++i) + { + unsigned int id = keys[source[i]]; + + destination[hist[id]++] = source[i]; + } +} + +static void computeHistogram(unsigned int (&hist)[256][2], const unsigned short* data, size_t count) +{ + memset(hist, 0, sizeof(hist)); + + // compute 2 8-bit histograms in parallel + for (size_t i = 0; i < count; ++i) + { + unsigned long long id = data[i]; + + hist[(id >> 0) & 255][0]++; + hist[(id >> 8) & 255][1]++; + } + + unsigned int sum0 = 0, sum1 = 0; + + // replace histogram data with prefix histogram sums in-place + for (int i = 0; i < 256; ++i) + { + unsigned int h0 = hist[i][0], h1 = hist[i][1]; hist[i][0] = sum0; hist[i][1] = sum1; - hist[i][2] = sum2; - hist[i][3] = sum3; - hist[i][4] = sum4; sum0 += h0; sum1 += h1; - sum2 += h2; - sum3 += h3; - sum4 += h4; } - assert(sum0 == count && sum1 == count && sum2 == count && sum3 == count && sum4 == count); + assert(sum0 == count && sum1 == count); } -static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned long long* keys, size_t count, unsigned int (&hist)[1024][5], int pass) +static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned short* keys, size_t count, unsigned int (&hist)[256][2], int pass) { - int bitoff = pass * 10; + int bitoff = pass * 8; for (size_t i = 0; i < count; ++i) { - unsigned int id = unsigned(keys[source[i]] >> bitoff) & 1023; + unsigned int id = unsigned(keys[source[i]] >> bitoff) & 255; destination[hist[id][pass]++] = source[i]; } } +static void partitionPoints(unsigned int* target, const unsigned int* order, const unsigned char* sides, size_t split, size_t count) +{ + size_t l = 0, r = split; + + for (size_t i = 0; i < count; ++i) + { + unsigned char side = sides[order[i]]; + target[side ? 
r : l] = order[i]; + l += 1; + l -= side; + r += side; + } + + assert(l == split && r == count); +} + +static void splitPoints(unsigned int* destination, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, const unsigned long long* keys, size_t count, void* scratch, size_t cluster_size) +{ + if (count <= cluster_size) + { + memcpy(destination, orderx, count * sizeof(unsigned int)); + return; + } + + unsigned int* axes[3] = {orderx, ordery, orderz}; + + int bestk = -1; + unsigned int bestdim = 0; + + for (int k = 0; k < 3; ++k) + { + const unsigned int mask = (1 << 20) - 1; + unsigned int dim = (unsigned(keys[axes[k][count - 1]] >> (k * 20)) & mask) - (unsigned(keys[axes[k][0]] >> (k * 20)) & mask); + + if (dim >= bestdim) + { + bestk = k; + bestdim = dim; + } + } + + assert(bestk >= 0); + + // split roughly in half, with the left split always being aligned to cluster size + size_t split = ((count / 2) + cluster_size - 1) / cluster_size * cluster_size; + assert(split > 0 && split < count); + + // mark sides of split for partitioning + unsigned char* sides = static_cast<unsigned char*>(scratch) + count * sizeof(unsigned int); + + for (size_t i = 0; i < split; ++i) + sides[axes[bestk][i]] = 0; + + for (size_t i = split; i < count; ++i) + sides[axes[bestk][i]] = 1; + + // partition all axes into two sides, maintaining order + unsigned int* temp = static_cast<unsigned int*>(scratch); + + for (int k = 0; k < 3; ++k) + { + if (k == bestk) + continue; + + unsigned int* axis = axes[k]; + memcpy(temp, axis, sizeof(unsigned int) * count); + partitionPoints(axis, temp, sides, split, count); + } + + splitPoints(destination, orderx, ordery, orderz, keys, split, scratch, cluster_size); + splitPoints(destination + split, orderx + split, ordery + split, orderz + split, keys, count - split, scratch, cluster_size); +} + } // namespace meshopt void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) @@ -127,22 +224,25 @@ void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_pos meshopt_Allocator allocator; unsigned long long* keys = allocator.allocate<unsigned long long>(vertex_count); - computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride); + computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride, /* morton= */ true); - unsigned int hist[1024][5]; - computeHistogram(hist, keys, vertex_count); - - unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count); + unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count * 2); // 4b for order + 2b for keys + unsigned short* keyk = (unsigned short*)(scratch + vertex_count); for (size_t i = 0; i < vertex_count; ++i) destination[i] = unsigned(i); + unsigned int* order[] = {scratch, destination}; + // 5-pass radix sort computes the resulting order into scratch - radixPass(scratch, destination, keys, vertex_count, hist, 0); - radixPass(destination, scratch, keys, vertex_count, hist, 1); - radixPass(scratch, destination, keys, vertex_count, hist, 2); - radixPass(destination, scratch, keys, vertex_count, hist, 3); - radixPass(scratch, destination, keys, vertex_count, hist, 4); + for (int k = 0; k < 5; ++k) + { + // copy 10-bit key segments into keyk to reduce cache pressure during radix pass + for (size_t i = 0; i < vertex_count; ++i) + keyk[i] = (unsigned short)((keys[i] >> (k * 10)) & 1023); + + radixSort10(order[k % 2], order[(k + 1) % 2], keyk, vertex_count); + }
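The sort above relies on each digit pass being a stable counting sort; a self-contained sketch of the same least-significant-digit scheme, written here for 16-bit keys in two 8-bit passes (illustrative only, not patch code):

static void radixSortPass8(unsigned int* dst, const unsigned int* src, const unsigned short* keys, size_t count, int shift)
{
    // counting sort by one 8-bit digit; stability is what lets the later
    // (higher-digit) pass preserve the order established by the earlier pass
    unsigned int hist[256] = {};

    for (size_t i = 0; i < count; ++i)
        hist[(keys[src[i]] >> shift) & 255]++;

    unsigned int sum = 0;
    for (int j = 0; j < 256; ++j)
    {
        unsigned int h = hist[j];
        hist[j] = sum;
        sum += h;
    }

    for (size_t i = 0; i < count; ++i)
        dst[hist[(keys[src[i]] >> shift) & 255]++] = src[i];
}

// usage: low digit first, ping-ponging between buffers; the sorted order ends up in `order`
// radixSortPass8(temp, order, keys, count, 0);
// radixSortPass8(order, temp, keys, count, 8);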
// since our remap table is mapping old=>new, we need to reverse it for (size_t i = 0; i < vertex_count; ++i) @@ -202,3 +302,39 @@ void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* destination[r * 3 + 2] = c; } } + +void meshopt_spatialClusterPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t cluster_size) +{ + using namespace meshopt; + + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + assert(cluster_size > 0); + + meshopt_Allocator allocator; + + unsigned long long* keys = allocator.allocate<unsigned long long>(vertex_count); + computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride, /* morton= */ false); + + unsigned int* order = allocator.allocate<unsigned int>(vertex_count * 3); + unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count * 2); // 4b for order + 1b for side or 2b for keys + unsigned short* keyk = reinterpret_cast<unsigned short*>(scratch + vertex_count); + + for (int k = 0; k < 3; ++k) + { + // copy 16-bit key segments into keyk to reduce cache pressure during radix pass + for (size_t i = 0; i < vertex_count; ++i) + keyk[i] = (unsigned short)(keys[i] >> (k * 20)); + + unsigned int hist[256][2]; + computeHistogram(hist, keyk, vertex_count); + + for (size_t i = 0; i < vertex_count; ++i) + order[k * vertex_count + i] = unsigned(i); + + radixPass(scratch, order + k * vertex_count, keyk, vertex_count, hist, 0); + radixPass(order + k * vertex_count, scratch, keyk, vertex_count, hist, 1); + } + + splitPoints(destination, order, order + vertex_count, order + 2 * vertex_count, keys, vertex_count, scratch, cluster_size); +}
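A usage sketch for the new clustering entry point (illustrative only; `points` and `point_count` are assumed caller-owned data, not names from this patch). Because left splits are aligned to cluster size, every cluster except possibly the last should be full:

const size_t kClusterSize = 128;

std::vector<unsigned int> order(point_count);
meshopt_spatialClusterPoints(order.data(), points, point_count, sizeof(float) * 3, kClusterSize);

for (size_t i = 0; i < point_count; i += kClusterSize)
{
    size_t count = point_count - i < kClusterSize ? point_count - i : kClusterSize;
    // points[order[i]] .. points[order[i + count - 1]] form one spatially coherent cluster
}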
diff --git a/3rdparty/meshoptimizer/src/vertexcodec.cpp b/3rdparty/meshoptimizer/src/vertexcodec.cpp index 847589b3d..7085cce32 100644 --- a/3rdparty/meshoptimizer/src/vertexcodec.cpp +++ b/3rdparty/meshoptimizer/src/vertexcodec.cpp @@ -122,7 +122,7 @@ namespace meshopt const unsigned char kVertexHeader = 0xa0; -static int gEncodeVertexVersion = 0; +static int gEncodeVertexVersion = 1; const int kDecodeVertexVersion = 1; const size_t kVertexBlockSizeBytes = 8192; diff --git a/3rdparty/meshoptimizer/src/vertexfilter.cpp b/3rdparty/meshoptimizer/src/vertexfilter.cpp index 6be6b723f..af15d59c6 100644 --- a/3rdparty/meshoptimizer/src/vertexfilter.cpp +++ b/3rdparty/meshoptimizer/src/vertexfilter.cpp @@ -165,6 +165,47 @@ static void decodeFilterExp(unsigned int* data, size_t count) data[i] = u.ui; } } + +template <typename T, typename ST> +static void decodeFilterColor(T* data, size_t count) +{ + const float max = float((1 << (sizeof(T) * 8)) - 1); + + for (size_t i = 0; i < count; ++i) + { + // recover scale from alpha high bit + int as = data[i * 4 + 3]; + as |= as >> 1; + as |= as >> 2; + as |= as >> 4; + as |= as >> 8; // noop for 8-bit + + // convert to RGB in fixed point (co/cg are sign extended) + int y = data[i * 4 + 0], co = ST(data[i * 4 + 1]), cg = ST(data[i * 4 + 2]); + + int r = y + co - cg; + int g = y + cg; + int b = y - co - cg; + + // expand alpha by one bit to match other components + int a = data[i * 4 + 3]; + a = ((a << 1) & as) | (a & 1); + + // compute scaling factor + float ss = max / float(as); + + // rounded float->int + int rf = int(float(r) * ss + 0.5f); + int gf = int(float(g) * ss + 0.5f); + int bf = int(float(b) * ss + 0.5f); + int af = int(float(a) * ss + 0.5f); + + data[i * 4 + 0] = T(rf); + data[i * 4 + 1] = T(gf); + data[i * 4 + 2] = T(bf); + data[i * 4 + 3] = T(af); + } +} #endif #if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM) @@ -386,6 +427,105 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count) _mm_storeu_ps(reinterpret_cast<float*>(&data[i]), r); } } + +static void decodeFilterColorSimd8(unsigned char* data, size_t count) +{ + for (size_t i = 0; i < count; i += 4) + { + __m128i c4 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i * 4])); + + // unpack y/co/cg/a (co/cg are sign extended with arithmetic shifts) + __m128i yf = _mm_and_si128(c4, _mm_set1_epi32(0xff)); + __m128i cof = _mm_srai_epi32(_mm_slli_epi32(c4, 16), 24); + __m128i cgf = _mm_srai_epi32(_mm_slli_epi32(c4, 8), 24); + __m128i af = _mm_srli_epi32(c4, 24); + + // recover scale from alpha high bit + __m128i as = af; + as = _mm_or_si128(as, _mm_srli_epi32(as, 1)); + as = _mm_or_si128(as, _mm_srli_epi32(as, 2)); + as = _mm_or_si128(as, _mm_srli_epi32(as, 4)); + + // expand alpha by one bit to match other components + af = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(af, 1), as), _mm_and_si128(af, _mm_set1_epi32(1))); + + // compute scaling factor + __m128 ss = _mm_mul_ps(_mm_set1_ps(255.f), _mm_rcp_ps(_mm_cvtepi32_ps(as))); + + // convert to RGB in fixed point + __m128i rf = _mm_add_epi32(yf, _mm_sub_epi32(cof, cgf)); + __m128i gf = _mm_add_epi32(yf, cgf); + __m128i bf = _mm_sub_epi32(yf, _mm_add_epi32(cof, cgf)); + + // rounded signed float->int + __m128i rr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(rf), ss)); + __m128i gr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(gf), ss)); + __m128i br = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(bf), ss)); + __m128i ar = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(af), ss)); + + // repack rgba into final value + __m128i res = rr; + res = _mm_or_si128(res, _mm_slli_epi32(gr, 8)); + res = _mm_or_si128(res, _mm_slli_epi32(br, 16)); + res = _mm_or_si128(res, _mm_slli_epi32(ar, 24)); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(&data[i * 4]), res); + } +}
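Note that the decoders never receive the encoder's `bits` setting: they rely on the encoder always setting the top bit of the stored alpha, and smear that bit right to reconstruct the full component mask. A scalar illustration, assuming the data was encoded with bits = 6 (values hypothetical):

int a = 0x2c; // example stored alpha; bit 5 is guaranteed set when bits == 6

int as = a;
as |= as >> 1;
as |= as >> 2;
as |= as >> 4; // as == 0x3f, the full 6-bit mask, regardless of the low bits of a

float ss = 255.f / float(as); // rescales decoded 6-bit components to the full 8-bit range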
+ +static void decodeFilterColorSimd16(unsigned short* data, size_t count) +{ + for (size_t i = 0; i < count; i += 4) + { + __m128i c4_0 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4])); + __m128i c4_1 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4])); + + // gather both y/co 16-bit pairs in each 32-bit lane + __m128i c4_yco = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(c4_0), _mm_castsi128_ps(c4_1), _MM_SHUFFLE(2, 0, 2, 0))); + __m128i c4_cga = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(c4_0), _mm_castsi128_ps(c4_1), _MM_SHUFFLE(3, 1, 3, 1))); + + // unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts) + __m128i yf = _mm_and_si128(c4_yco, _mm_set1_epi32(0xffff)); + __m128i cof = _mm_srai_epi32(c4_yco, 16); + __m128i cgf = _mm_srai_epi32(_mm_slli_epi32(c4_cga, 16), 16); + __m128i af = _mm_srli_epi32(c4_cga, 16); + + // recover scale from alpha high bit + __m128i as = af; + as = _mm_or_si128(as, _mm_srli_epi32(as, 1)); + as = _mm_or_si128(as, _mm_srli_epi32(as, 2)); + as = _mm_or_si128(as, _mm_srli_epi32(as, 4)); + as = _mm_or_si128(as, _mm_srli_epi32(as, 8)); + + // expand alpha by one bit to match other components + af = _mm_or_si128(_mm_and_si128(_mm_slli_epi32(af, 1), as), _mm_and_si128(af, _mm_set1_epi32(1))); + + // compute scaling factor + __m128 ss = _mm_div_ps(_mm_set1_ps(65535.f), _mm_cvtepi32_ps(as)); + + // convert to RGB in fixed point + __m128i rf = _mm_add_epi32(yf, _mm_sub_epi32(cof, cgf)); + __m128i gf = _mm_add_epi32(yf, cgf); + __m128i bf = _mm_sub_epi32(yf, _mm_add_epi32(cof, cgf)); + + // rounded signed float->int + __m128i rr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(rf), ss)); + __m128i gr = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(gf), ss)); + __m128i br = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(bf), ss)); + __m128i ar = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(af), ss)); + + // mix r/b and g/a to make 16-bit unpack easier + __m128i rbr = _mm_or_si128(_mm_and_si128(rr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(br, 16)); + __m128i gar = _mm_or_si128(_mm_and_si128(gr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(ar, 16)); + + // pack r/g/b/a using 16-bit unpacks + __m128i res_0 = _mm_unpacklo_epi16(rbr, gar); + __m128i res_1 = _mm_unpackhi_epi16(rbr, gar); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0); + _mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1); + } +} #endif #if defined(SIMD_NEON) && !defined(__aarch64__) && !defined(_M_ARM64) @@ -596,6 +736,111 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count) vst1q_f32(reinterpret_cast<float*>(&data[i]), r); } } + +static void decodeFilterColorSimd8(unsigned char* data, size_t count) +{ + for (size_t i = 0; i < count; i += 4) + { + int32x4_t c4 = vld1q_s32(reinterpret_cast<int32_t*>(&data[i * 4])); + + // unpack y/co/cg/a (co/cg are sign extended with arithmetic shifts) + int32x4_t yf = vandq_s32(c4, vdupq_n_s32(0xff)); + int32x4_t cof = vshrq_n_s32(vshlq_n_s32(c4, 16), 24); + int32x4_t cgf = vshrq_n_s32(vshlq_n_s32(c4, 8), 24); + int32x4_t af = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(c4), 24)); + + // recover scale from alpha high bit + int32x4_t as = af; + as = vorrq_s32(as, vshrq_n_s32(as, 1)); + as = vorrq_s32(as, vshrq_n_s32(as, 2)); + as = vorrq_s32(as, vshrq_n_s32(as, 4)); + + // expand alpha by one bit to match other components + af = vorrq_s32(vandq_s32(vshlq_n_s32(af, 1), as), vandq_s32(af, vdupq_n_s32(1))); + + // compute scaling factor + float32x4_t ss = vmulq_f32(vdupq_n_f32(255.f), vrecpeq_f32(vcvtq_f32_s32(as))); + + // convert to RGB in fixed point + int32x4_t rf = vaddq_s32(yf, vsubq_s32(cof, cgf)); + int32x4_t gf = vaddq_s32(yf, cgf); + int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf)); + + // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value + // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction + const float32x4_t fsnap = vdupq_n_f32(3 << 22); + + int32x4_t rr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(rf), ss), fsnap)); + int32x4_t gr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(gf), ss), fsnap)); + int32x4_t br = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(bf), ss), fsnap)); + int32x4_t ar = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(af), ss), fsnap)); + + // repack rgba into final value + int32x4_t res = vandq_s32(rr, vdupq_n_s32(0xff)); + res = vorrq_s32(res, vshlq_n_s32(vandq_s32(gr, vdupq_n_s32(0xff)), 8)); + res = vorrq_s32(res, vshlq_n_s32(vandq_s32(br, vdupq_n_s32(0xff)), 16)); + res = vorrq_s32(res, vshlq_n_s32(ar, 24)); + + vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res); + } +}
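A scalar sketch of the float-snapping trick used in the NEON path above, assuming IEEE-754 single precision (illustrative only):

#include <string.h>

// adding 3 * 2^22 pushes any value in (-2^22, 2^22) into a range where the
// float's unit in the last place is exactly 1.0, so the addition rounds to the
// nearest integer and that integer lands in the low mantissa bits
float f = 100.7f;
float snapped = f + 12582912.f; // 3 << 22; the bit pattern of 12582912.f is 0x4B400000

unsigned int u;
memcpy(&u, &snapped, sizeof(u)); // u == 0x4B400000 + 101

int low16 = int(u & 0xffff); // 101; the 0x4B40_0000 offset only affects higher bits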
+ +static void decodeFilterColorSimd16(unsigned short* data, size_t count) +{ + for (size_t i = 0; i < count; i += 4) + { + int32x4_t c4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4])); + int32x4_t c4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4])); + + // gather both y/co 16-bit pairs in each 32-bit lane + int32x4_t c4_yco = vuzpq_s32(c4_0, c4_1).val[0]; + int32x4_t c4_cga = vuzpq_s32(c4_0, c4_1).val[1]; + + // unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts) + int32x4_t yf = vandq_s32(c4_yco, vdupq_n_s32(0xffff)); + int32x4_t cof = vshrq_n_s32(c4_yco, 16); + int32x4_t cgf = vshrq_n_s32(vshlq_n_s32(c4_cga, 16), 16); + int32x4_t af = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(c4_cga), 16)); + + // recover scale from alpha high bit + int32x4_t as = af; + as = vorrq_s32(as, vshrq_n_s32(as, 1)); + as = vorrq_s32(as, vshrq_n_s32(as, 2)); + as = vorrq_s32(as, vshrq_n_s32(as, 4)); + as = vorrq_s32(as, vshrq_n_s32(as, 8)); + + // expand alpha by one bit to match other components + af = vorrq_s32(vandq_s32(vshlq_n_s32(af, 1), as), vandq_s32(af, vdupq_n_s32(1))); + + // compute scaling factor + float32x4_t ss = vdivq_f32(vdupq_n_f32(65535.f), vcvtq_f32_s32(as)); + + // convert to RGB in fixed point + int32x4_t rf = vaddq_s32(yf, vsubq_s32(cof, cgf)); + int32x4_t gf = vaddq_s32(yf, cgf); + int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf)); + + // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value + // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction + const float32x4_t fsnap = vdupq_n_f32(3 << 22); + + int32x4_t rr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(rf), ss), fsnap)); + int32x4_t gr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(gf), ss), fsnap)); + int32x4_t br = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(bf), ss), fsnap)); + int32x4_t ar = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(af), ss), fsnap)); + + // mix r/b and g/a to make 16-bit unpack easier + int32x4_t rbr = vorrq_s32(vandq_s32(rr, vdupq_n_s32(0xffff)), vshlq_n_s32(br, 16)); + int32x4_t gar = vorrq_s32(vandq_s32(gr, vdupq_n_s32(0xffff)), vshlq_n_s32(ar, 16)); + + // pack r/g/b/a using 16-bit unpacks + int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(rbr), vreinterpretq_s16_s32(gar)).val[0]); + int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(rbr), vreinterpretq_s16_s32(gar)).val[1]); + + vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]), res_0); + vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]), res_1); + } +} #endif #ifdef SIMD_WASM @@ -651,7 +896,8 @@ static void decodeFilterOctSimd8(signed char* data, size_t count) static void decodeFilterOctSimd16(short* data, size_t count) { const v128_t sign = wasm_f32x4_splat(-0.f); - const v128_t zmask = wasm_i32x4_splat(0x7fff); + // TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457 + volatile v128_t zmask = wasm_i32x4_splat(0x7fff); for (size_t i = 0; i < count; i += 4) { @@ -763,8 +1009,7 @@ static void decodeFilterQuatSimd(short* data, size_t count) v128_t res_1 = wasmx_unpackhi_v16x8(wyr, xzr); // compute component index shifted left by 4 (and moved into i32x4 slot) - // TODO: volatile here works around LLVM mis-optimizing code; https://github.com/emscripten-core/emscripten/issues/11449 - volatile v128_t cm = wasm_i32x4_shl(cf, 4); + v128_t cm = wasm_i32x4_shl(cf, 4); // rotate and store uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]); @@ -795,6 +1040,117 @@ static void decodeFilterExpSimd(unsigned int* data, size_t count) wasm_v128_store(&data[i], r); } } + +static void decodeFilterColorSimd8(unsigned char* data, size_t count) +{ + // TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457 + volatile v128_t zero = wasm_i32x4_splat(0); + + for (size_t i = 0; i < count; i += 4) + { + v128_t c4 =
wasm_v128_load(&data[i * 4]); + + // unpack y/co/cg/a (co/cg are sign extended with arithmetic shifts) + v128_t yf = wasm_v128_and(c4, wasm_i32x4_splat(0xff)); + v128_t cof = wasm_i32x4_shr(wasm_i32x4_shl(c4, 16), 24); + v128_t cgf = wasm_i32x4_shr(wasm_i32x4_shl(c4, 8), 24); + v128_t af = wasm_v128_or(zero, wasm_u32x4_shr(c4, 24)); + + // recover scale from alpha high bit + v128_t as = af; + as = wasm_v128_or(as, wasm_i32x4_shr(as, 1)); + as = wasm_v128_or(as, wasm_i32x4_shr(as, 2)); + as = wasm_v128_or(as, wasm_i32x4_shr(as, 4)); + + // expand alpha by one bit to match other components + af = wasm_v128_or(wasm_v128_and(wasm_i32x4_shl(af, 1), as), wasm_v128_and(af, wasm_i32x4_splat(1))); + + // compute scaling factor + v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(255.f), wasm_f32x4_convert_i32x4(as)); + + // convert to RGB in fixed point + v128_t rf = wasm_i32x4_add(yf, wasm_i32x4_sub(cof, cgf)); + v128_t gf = wasm_i32x4_add(yf, cgf); + v128_t bf = wasm_i32x4_sub(yf, wasm_i32x4_add(cof, cgf)); + + // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value + // note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction + const v128_t fsnap = wasm_f32x4_splat(3 << 22); + + v128_t rr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(rf), ss), fsnap); + v128_t gr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(gf), ss), fsnap); + v128_t br = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(bf), ss), fsnap); + v128_t ar = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(af), ss), fsnap); + + // repack rgba into final value + v128_t res = wasm_v128_and(rr, wasm_i32x4_splat(0xff)); + res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(gr, wasm_i32x4_splat(0xff)), 8)); + res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(br, wasm_i32x4_splat(0xff)), 16)); + res = wasm_v128_or(res, wasm_i32x4_shl(ar, 24)); + + wasm_v128_store(&data[i * 4], res); + } +} + +static void decodeFilterColorSimd16(unsigned short* data, size_t count) +{ + // TODO: volatile here works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/149457 + volatile v128_t zero = wasm_i32x4_splat(0); + + for (size_t i = 0; i < count; i += 4) + { + v128_t c4_0 = wasm_v128_load(&data[(i + 0) * 4]); + v128_t c4_1 = wasm_v128_load(&data[(i + 2) * 4]); + + // gather both y/co 16-bit pairs in each 32-bit lane + v128_t c4_yco = wasmx_unziplo_v32x4(c4_0, c4_1); + v128_t c4_cga = wasmx_unziphi_v32x4(c4_0, c4_1); + + // unpack y/co/cg/a components (co/cg are sign extended with arithmetic shifts) + v128_t yf = wasm_v128_and(c4_yco, wasm_i32x4_splat(0xffff)); + v128_t cof = wasm_i32x4_shr(c4_yco, 16); + v128_t cgf = wasm_i32x4_shr(wasm_i32x4_shl(c4_cga, 16), 16); + v128_t af = wasm_v128_or(zero, wasm_u32x4_shr(c4_cga, 16)); + + // recover scale from alpha high bit + v128_t as = af; + as = wasm_v128_or(as, wasm_i32x4_shr(as, 1)); + as = wasm_v128_or(as, wasm_i32x4_shr(as, 2)); + as = wasm_v128_or(as, wasm_i32x4_shr(as, 4)); + as = wasm_v128_or(as, wasm_i32x4_shr(as, 8)); + + // expand alpha by one bit to match other components + af = wasm_v128_or(wasm_v128_and(wasm_i32x4_shl(af, 1), as), wasm_v128_and(af, wasm_i32x4_splat(1))); + + // compute scaling factor + v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(65535.f), wasm_f32x4_convert_i32x4(as)); + + // convert to RGB in fixed point + v128_t rf = wasm_i32x4_add(yf, wasm_i32x4_sub(cof, cgf)); + v128_t gf = wasm_i32x4_add(yf, cgf); + v128_t bf = 
wasm_i32x4_sub(yf, wasm_i32x4_add(cof, cgf)); + + // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value + // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction + const v128_t fsnap = wasm_f32x4_splat(3 << 22); + + v128_t rr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(rf), ss), fsnap); + v128_t gr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(gf), ss), fsnap); + v128_t br = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(bf), ss), fsnap); + v128_t ar = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(af), ss), fsnap); + + // mix r/b and g/a to make 16-bit unpack easier + v128_t rbr = wasm_v128_or(wasm_v128_and(rr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(br, 16)); + v128_t gar = wasm_v128_or(wasm_v128_and(gr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(ar, 16)); + + // pack r/g/b/a using 16-bit unpacks + v128_t res_0 = wasmx_unpacklo_v16x8(rbr, gar); + v128_t res_1 = wasmx_unpackhi_v16x8(rbr, gar); + + wasm_v128_store(&data[(i + 0) * 4], res_0); + wasm_v128_store(&data[(i + 2) * 4], res_1); + } +} #endif // optimized variant of frexp @@ -872,6 +1228,25 @@ void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride) #endif } +void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride) +{ + using namespace meshopt; + + assert(stride == 4 || stride == 8); + +#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM) + if (stride == 4) + dispatchSimd(decodeFilterColorSimd8, static_cast<unsigned char*>(buffer), count, 4); + else + dispatchSimd(decodeFilterColorSimd16, static_cast<unsigned short*>(buffer), count, 4); +#else + if (stride == 4) + decodeFilterColor<unsigned char, signed char>(static_cast<unsigned char*>(buffer), count); + else + decodeFilterColor<unsigned short, short>(static_cast<unsigned short*>(buffer), count); +#endif +} + void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data) { assert(stride == 4 || stride == 8); @@ -1042,6 +1417,51 @@ void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, in } } +void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data) +{ + assert(stride == 4 || stride == 8); + assert(bits >= 2 && bits <= 16); + + unsigned char* d8 = static_cast<unsigned char*>(destination); + unsigned short* d16 = static_cast<unsigned short*>(destination); + + for (size_t i = 0; i < count; ++i) + { + const float* c = &data[i * 4]; + + int fr = meshopt_quantizeUnorm(c[0], bits); + int fg = meshopt_quantizeUnorm(c[1], bits); + int fb = meshopt_quantizeUnorm(c[2], bits); + + // YCoCg-R encoding with truncated Co/Cg ensures that decoding can be done using integers + int fco = (fr - fb) / 2; + int tmp = fb + fco; + int fcg = (fg - tmp) / 2; + int fy = tmp + fcg; + + // validate that R/G/B can be reconstructed with K-bit integers + assert(unsigned((fy + fco - fcg) | (fy + fcg) | (fy - fco - fcg)) < (1u << bits)); + + // alpha: K-1-bit encoding with high bit set to 1 + int fa = meshopt_quantizeUnorm(c[3], bits - 1) | (1 << (bits - 1)); + + if (stride == 4) + { + d8[i * 4 + 0] = (unsigned char)(fy); + d8[i * 4 + 1] = (unsigned char)(fco); + d8[i * 4 + 2] = (unsigned char)(fcg); + d8[i * 4 + 3] = (unsigned char)(fa); + } + else + { + d16[i * 4 + 0] = (unsigned short)(fy); + d16[i * 4 + 1] = (unsigned short)(fco); + d16[i * 4 + 2] = (unsigned short)(fcg); + d16[i * 4 + 3] = (unsigned short)(fa); + } + } +} + #undef SIMD_SSE #undef SIMD_NEON #undef SIMD_WASM
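A round-trip sketch for the new filter pair (illustrative only, not patch code): with bits = 8 and stride 4, each color is stored as one byte each of Y/Co/Cg/A, which is intended to compress better through the vertex codec; decoding runs in place and reproduces RGBA8 values up to the chosen quantization (the truncated Co/Cg can cost a low bit of precision):

float colors[2][4] = {
    {1.f, 0.5f, 0.25f, 1.f},
    {0.f, 0.f, 0.f, 0.5f},
};

unsigned char encoded[2][4];
meshopt_encodeFilterColor(encoded, 2, 4, 8, colors[0]);

// ... encoded data would typically pass through meshopt_encodeVertexBuffer here ...

meshopt_decodeFilterColor(encoded, 2, 4); // in place: encoded now holds RGBA8 colors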