Updated meshoptimizer.

Бранимир Караџић
2025-02-08 09:35:58 -08:00
parent b109f36cbf
commit c8206d6956
4 changed files with 178 additions and 78 deletions

LICENSE.md

@@ -1,6 +1,6 @@
MIT License
Copyright (c) 2016-2024 Arseny Kapoulkine
Copyright (c) 2016-2025 Arseny Kapoulkine
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

src/clusterizer.cpp

@@ -70,11 +70,77 @@ static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned
for (size_t i = 0; i < vertex_count; ++i)
{
assert(adjacency.offsets[i] >= adjacency.counts[i]);
adjacency.offsets[i] -= adjacency.counts[i];
}
}
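Reviewer note: this is the usual counting-sort rewind. After the fill pass, offsets[v] points one past the end of vertex v's triangle list, so subtracting counts[v] restores the list start. A sketch of consuming the result afterwards (use() is a hypothetical callback, not library code):

for (unsigned int j = 0; j < adjacency.counts[v]; ++j)
	use(adjacency.data[adjacency.offsets[v] + j]); // triangles incident to vertex v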
static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
{
size_t face_count = index_count / 3;
// sparse mode can build adjacency more quickly by ignoring unused vertices, using a bit to mark visited vertices
const unsigned int sparse_seen = 1u << 31;
assert(index_count < sparse_seen);
// allocate arrays
adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
adjacency.data = allocator.allocate<unsigned int>(index_count);
// fill triangle counts
for (size_t i = 0; i < index_count; ++i)
assert(indices[i] < vertex_count);
for (size_t i = 0; i < index_count; ++i)
adjacency.counts[indices[i]] = 0;
for (size_t i = 0; i < index_count; ++i)
adjacency.counts[indices[i]]++;
// fill offset table
unsigned int offset = 0;
// when using sparse mode this pass uses the sparse_seen bit to tag visited vertices
for (size_t i = 0; i < index_count; ++i)
{
unsigned int v = indices[i];
if ((adjacency.counts[v] & sparse_seen) == 0)
{
adjacency.offsets[v] = offset;
offset += adjacency.counts[v];
adjacency.counts[v] |= sparse_seen;
}
}
assert(offset == index_count);
// fill triangle data
for (size_t i = 0; i < face_count; ++i)
{
unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
adjacency.data[adjacency.offsets[a]++] = unsigned(i);
adjacency.data[adjacency.offsets[b]++] = unsigned(i);
adjacency.data[adjacency.offsets[c]++] = unsigned(i);
}
// fix offsets that have been disturbed by the previous pass
// when using sparse mode this pass also fixes counts (that were marked with sparse_seen)
for (size_t i = 0; i < index_count; ++i)
{
unsigned int v = indices[i];
if (adjacency.counts[v] & sparse_seen)
{
adjacency.counts[v] &= ~sparse_seen;
assert(adjacency.offsets[v] >= adjacency.counts[v]);
adjacency.offsets[v] -= adjacency.counts[v];
}
}
}
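Reviewer note: a minimal standalone sketch of the sparse bookkeeping above (not part of the diff; std::vector stands in for meshopt_Allocator, which returns uninitialized memory). Only vertices actually referenced by the index buffer are written, so the cost is O(index_count) rather than O(vertex_count), and the high bit lets counts double as a "slice already assigned" flag without a second O(vertex_count) clear:

#include <assert.h>
#include <vector>

int main()
{
	// hypothetical tiny mesh: two triangles touching 4 of 1000000 vertices
	const unsigned int indices[] = {10, 20, 30, 30, 20, 40};
	const size_t index_count = 6, vertex_count = 1000000;

	const unsigned int sparse_seen = 1u << 31;
	std::vector<unsigned int> counts(vertex_count), offsets(vertex_count);

	// zero only the entries the index buffer references, then count triangles
	for (size_t i = 0; i < index_count; ++i)
		counts[indices[i]] = 0;
	for (size_t i = 0; i < index_count; ++i)
		counts[indices[i]]++;

	// assign each used vertex its slice start once, tagging it with the high bit
	unsigned int offset = 0;
	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned int v = indices[i];
		if ((counts[v] & sparse_seen) == 0)
		{
			offsets[v] = offset;
			offset += counts[v];
			counts[v] |= sparse_seen;
		}
	}

	assert(offset == index_count);            // every index got a slot
	assert((counts[20] & ~sparse_seen) == 2); // vertex 20 is in both triangles
}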
static void computeBoundingSphere(float result[4], const float points[][3], size_t count)
{
assert(count > 0);
@@ -552,10 +618,13 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
meshopt_Allocator allocator;
TriangleAdjacency2 adjacency = {};
buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
if (vertex_count > index_count && index_count < (1u << 31))
buildTriangleAdjacencySparse(adjacency, indices, index_count, vertex_count, allocator);
else
buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
// live triangle counts; note, we alias adjacency.counts as we remove triangles after emitting them so the counts always match
unsigned int* live_triangles = adjacency.counts;
size_t face_count = index_count / 3;
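Reviewer note: the aliasing is safe because of the invariant maintained in the next hunk; emitting a triangle removes it from each of its vertices' adjacency lists and decrements the count in the same step, so counts[v] always equals the number of live triangles at v. A sketch of that removal (hypothetical standalone helper; the library does this inline):

static void removeTriangle(unsigned int v, unsigned int tri,
    unsigned int* counts, unsigned int* offsets, unsigned int* data)
{
	unsigned int* list = data + offsets[v];
	unsigned int size = counts[v];

	for (unsigned int j = 0; j < size; ++j)
		if (list[j] == tri)
		{
			list[j] = list[size - 1]; // swap-with-last removal
			counts[v]--;              // live triangle count updated as a byproduct
			break;
		}
}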
@@ -625,12 +694,9 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc));
}
live_triangles[a]--;
live_triangles[b]--;
live_triangles[c]--;
// remove emitted triangle from adjacency data
// this makes sure that we spend less time traversing these lists on subsequent iterations
// live triangle counts are updated as a byproduct of these adjustments
for (size_t k = 0; k < 3; ++k)
{
unsigned int index = indices[best_triangle * 3 + k];

src/meshoptimizer.h

@@ -1,7 +1,7 @@
/**
* meshoptimizer - version 0.22
*
* Copyright (C) 2016-2024, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
* Copyright (C) 2016-2025, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
* Report bugs and download new versions at https://github.com/zeux/meshoptimizer
*
* This library is distributed under the MIT License. See notice at the end of this file.
@@ -289,7 +289,7 @@ MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer,
/**
* Set vertex encoder format version
* version must specify the data format version to encode; valid values are 0 (decodable by all library versions)
* version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.23+)
*/
MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);
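Reviewer note: a usage sketch (Vertex and vertices are assumed caller-side names). Opting into version 1 trades compatibility for the improved encoding, so version 0 remains the right default for assets that must decode on pre-0.23 libraries:

#include "meshoptimizer.h"
#include <vector>

std::vector<unsigned char> encodeV1(const Vertex* vertices, size_t vertex_count)
{
	meshopt_encodeVertexVersion(1); // subsequent encodes emit format version 1

	std::vector<unsigned char> vbuf(meshopt_encodeVertexBufferBound(vertex_count, sizeof(Vertex)));
	vbuf.resize(meshopt_encodeVertexBuffer(vbuf.data(), vbuf.size(), vertices, vertex_count, sizeof(Vertex)));
	return vbuf;
}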
@@ -608,34 +608,6 @@ MESHOPTIMIZER_API void meshopt_spatialSortRemap(unsigned int* destination, const
*/
MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
/**
* Set allocation callbacks
* These callbacks will be used instead of the default operator new/operator delete for all temporary allocations in the library.
* Note that all algorithms only allocate memory for temporary use.
* allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
*/
MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*));
#ifdef __cplusplus
} /* extern "C" */
#endif
/* Quantization into commonly supported data formats */
#ifdef __cplusplus
/**
* Quantize a float in [0..1] range into an N-bit fixed point unorm value
* Assumes reconstruction function (q / (2^N-1)), which is the case for fixed-function normalized fixed point conversion
* Maximum reconstruction error: 1/2^(N+1)
*/
inline int meshopt_quantizeUnorm(float v, int N);
/**
* Quantize a float in [-1..1] range into an N-bit fixed point snorm value
* Assumes reconstruction function (q / (2^(N-1)-1)), which is the case for fixed-function normalized fixed point conversion (except early OpenGL versions)
* Maximum reconstruction error: 1/2^N
*/
inline int meshopt_quantizeSnorm(float v, int N);
/**
* Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
* Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
@@ -656,6 +628,34 @@ MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
* Preserves Inf/NaN, flushes denormals to zero
*/
MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h);
/**
* Set allocation callbacks
* These callbacks will be used instead of the default operator new/operator delete for all temporary allocations in the library.
* Note that all algorithms only allocate memory for temporary use.
* allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
*/
MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*));
#ifdef __cplusplus
} /* extern "C" */
#endif
/* Quantization into fixed point normalized formats; these are only available as inline C++ functions */
#ifdef __cplusplus
/**
* Quantize a float in [0..1] range into an N-bit fixed point unorm value
* Assumes reconstruction function (q / (2^N-1)), which is the case for fixed-function normalized fixed point conversion
* Maximum reconstruction error: 1/2^(N+1)
*/
inline int meshopt_quantizeUnorm(float v, int N);
/**
* Quantize a float in [-1..1] range into an N-bit fixed point snorm value
* Assumes reconstruction function (q / (2^(N-1)-1)), which is the case for fixed-function normalized fixed point conversion (except early OpenGL versions)
* Maximum reconstruction error: 1/2^N
*/
inline int meshopt_quantizeSnorm(float v, int N);
#endif
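Reviewer note: a worked instance of the unorm bound documented above, assuming the reference behavior the comment describes (clamp to [0..1], scale by 2^N-1, round to nearest):

#include <assert.h>
#include <math.h>

// mirrors the documented behavior of meshopt_quantizeUnorm
static int quantizeUnormRef(float v, int N)
{
	const float scale = float((1 << N) - 1);
	v = (v >= 0) ? v : 0;
	v = (v <= 1) ? v : 1;
	return int(v * scale + 0.5f);
}

int main()
{
	int q = quantizeUnormRef(0.5f, 8); // 128; 0.5 has no exact 8-bit unorm
	float r = float(q) / 255.f;        // reconstructs to 0.50196...

	assert(fabsf(r - 0.5f) <= 0.5f / 255.f); // within half a quantization step
}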
/**
@@ -1123,7 +1123,7 @@ inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_
#endif
/**
* Copyright (c) 2016-2024 Arseny Kapoulkine
* Copyright (c) 2016-2025 Arseny Kapoulkine
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation

src/vertexcodec.cpp

@@ -138,12 +138,9 @@ const int kEncodeDefaultLevel = 2;
static size_t getVertexBlockSize(size_t vertex_size)
{
// make sure the entire block fits into the scratch buffer
size_t result = kVertexBlockSizeBytes / vertex_size;
// align to byte group size; we encode each byte as a byte group
// if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
result &= ~(kByteGroupSize - 1);
// make sure the entire block fits into the scratch buffer and is aligned to byte group size
// note: the block size is implicitly part of the format, so we can't change it without breaking compatibility
size_t result = (kVertexBlockSizeBytes / vertex_size) & ~(kByteGroupSize - 1);
return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
}
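Reviewer note: worked numbers for the expression above, assuming the constants used upstream (kVertexBlockSizeBytes = 8192, kByteGroupSize = 16, kVertexBlockMaxSize = 256):

// vertex_size = 48: 8192 / 48 = 170 -> 170 & ~15 = 160 -> min(160, 256) = 160
// vertex_size = 16: 8192 / 16 = 512 -> 512 & ~15 = 512 -> min(512, 256) = 256 (capped)
size_t block = getVertexBlockSize(48); // 160 vertices per block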
@@ -179,13 +176,14 @@ static Stats* bytestats = NULL;
static Stats vertexstats[256];
#endif
static bool canEncodeZero(const unsigned char* buffer, size_t buffer_size)
static bool encodeBytesGroupZero(const unsigned char* buffer)
{
for (size_t i = 0; i < buffer_size; ++i)
if (buffer[i])
return false;
assert(kByteGroupSize == sizeof(unsigned long long) * 2);
return true;
unsigned long long v[2];
memcpy(v, buffer, sizeof(v));
return (v[0] | v[1]) == 0;
}
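Reviewer note: for reference, the byte-wise loop this replaces, as a sketch. memcpy keeps the two 8-byte loads well-defined for any buffer alignment, and the single OR lets the compiler emit two loads and one branch instead of up to sixteen:

#include <stddef.h>

static bool encodeBytesGroupZeroScalar(const unsigned char* buffer)
{
	for (size_t i = 0; i < 16; ++i) // kByteGroupSize
		if (buffer[i])
			return false;

	return true;
}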
static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
@@ -193,7 +191,7 @@ static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
assert(bits >= 0 && bits <= 8);
if (bits == 0)
return canEncodeZero(buffer, kByteGroupSize) ? 0 : size_t(-1);
return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
if (bits == 8)
return kByteGroupSize;
@@ -389,6 +387,11 @@ static int estimateRotate(const unsigned char* vertex_data, size_t vertex_count,
vertex += vertex_size;
}
#if TRACE
for (int j = 0; j < 32; ++j)
vertexstats[k + (j / 8)].bitc[j % 8] += (i + group_size < vertex_count ? group_size : vertex_count - i) * (1 - ((bitg >> j) & 1));
#endif
for (int j = 0; j < 8; ++j)
{
unsigned int bitr = rotate(bitg, j);
@@ -455,9 +458,18 @@ static int estimateChannel(const unsigned char* vertex_data, size_t vertex_count
return best_channel == 2 ? best_channel | (xor_rot << 4) : best_channel;
}
static bool estimateControlZero(const unsigned char* buffer, size_t vertex_count_aligned)
{
for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize)
if (!encodeBytesGroupZero(buffer + i))
return false;
return true;
}
static int estimateControl(const unsigned char* buffer, size_t vertex_count, size_t vertex_count_aligned, int level)
{
if (canEncodeZero(buffer, vertex_count))
if (estimateControlZero(buffer, vertex_count_aligned))
return 2; // zero encoding
if (level == 0)
@@ -522,18 +534,6 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data
#if TRACE
const unsigned char* olddata = data;
bytestats = &vertexstats[k];
for (size_t ig = 0; ig < vertex_count; ig += kByteGroupSize)
{
unsigned char last = (ig == 0) ? last_vertex[k] : vertex_data[vertex_size * (ig - 1) + k];
unsigned char delta = 0xff;
for (size_t i = ig; i < ig + kByteGroupSize && i < vertex_count; ++i)
delta &= ~(vertex_data[vertex_size * i + k] ^ last);
for (int j = 0; j < 8; ++j)
bytestats->bitc[j] += (vertex_count - ig < kByteGroupSize ? vertex_count - ig : kByteGroupSize) * ((delta >> j) & 1);
}
#endif
int ctrl = 0;
@@ -1349,6 +1349,34 @@ inline uint8x16_t rotate32(uint8x16_t v, int r)
uint32x4_t v32 = vreinterpretq_u32_u8(v);
return vreinterpretq_u8_u32(vorrq_u32(vshlq_u32(v32, vdupq_n_s32(r)), vshlq_u32(v32, vdupq_n_s32(r - 32))));
}
template <int Channel>
SIMD_TARGET inline uint8x8_t rebase(uint8x8_t npi, uint8x16_t r0, uint8x16_t r1, uint8x16_t r2, uint8x16_t r3)
{
switch (Channel)
{
case 0:
{
uint8x16_t rsum = vaddq_u8(vaddq_u8(r0, r1), vaddq_u8(r2, r3));
uint8x8_t rsumx = vadd_u8(vget_low_u8(rsum), vget_high_u8(rsum));
return vadd_u8(vadd_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4));
}
case 1:
{
uint16x8_t rsum = vaddq_u16(vaddq_u16(vreinterpretq_u16_u8(r0), vreinterpretq_u16_u8(r1)), vaddq_u16(vreinterpretq_u16_u8(r2), vreinterpretq_u16_u8(r3)));
uint16x4_t rsumx = vadd_u16(vget_low_u16(rsum), vget_high_u16(rsum));
return vreinterpret_u8_u16(vadd_u16(vadd_u16(vreinterpret_u16_u8(npi), rsumx), vext_u16(rsumx, rsumx, 2)));
}
case 2:
{
uint8x16_t rsum = veorq_u8(veorq_u8(r0, r1), veorq_u8(r2, r3));
uint8x8_t rsumx = veor_u8(vget_low_u8(rsum), vget_high_u8(rsum));
return veor_u8(veor_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4));
}
default:
return npi;
}
}
#endif
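Reviewer note: a scalar model of what rebase<0> computes (a sketch, not the NEON code). The prefix after a 16-vertex group equals the old prefix plus the byte-wise sum of all 16 deltas, so the next prefix can be rebuilt directly from the unzigzagged r0..r3 instead of waiting on the serial FIXD chain:

static unsigned char nextPrefixByte(unsigned char prev, const unsigned char deltas[16])
{
	unsigned char sum = 0;
	for (int i = 0; i < 16; ++i)
		sum = (unsigned char)(sum + deltas[i]); // wraps mod 256, matching vadd_u8

	return (unsigned char)(prev + sum); // Channel 2 is the same with ^ instead of +
}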
#ifdef SIMD_WASM
@@ -1443,7 +1471,7 @@ decodeDeltas4Simd(const unsigned char* buffer, unsigned char* transposed, size_t
#define TEMP __m128i
#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
#endif
@@ -1461,9 +1489,9 @@ decodeDeltas4Simd(const unsigned char* buffer, unsigned char* transposed, size_t
#define TEMP v128_t
#define PREP() v128_t pi = wasm_v128_load(last_vertex)
#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i))
#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size
#endif
#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
@@ -1482,6 +1510,7 @@ decodeDeltas4Simd(const unsigned char* buffer, unsigned char* transposed, size_t
transpose8(r0, r1, r2, r3);
TEMP t0, t1, t2, t3;
TEMP npi = pi;
UNZR(0);
GRP4(0);
@@ -1503,6 +1532,13 @@ decodeDeltas4Simd(const unsigned char* buffer, unsigned char* transposed, size_t
FIXD(0), FIXD(1), FIXD(2), FIXD(3);
SAVE(0), SAVE(1), SAVE(2), SAVE(3);
#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32))
// instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens the dependency chain between loop iterations
pi = rebase<Channel>(npi, r0, r1, r2, r3);
#else
(void)npi;
#endif
#undef UNZR
#undef TEMP
#undef PREP
@@ -1724,13 +1760,12 @@ size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size
double(vsk.ctrl[2]) / double(total_ctrl) * 100, double(vsk.ctrl[3]) / double(total_ctrl) * 100);
}
#if TRACE > 1
printf(" | bitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);
#endif
if (level >= 3)
printf(" | bitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);
printf("\n");
}
@@ -1768,8 +1803,7 @@ size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
void meshopt_encodeVertexVersion(int version)
{
// note: this version is experimental and the binary format is not finalized; this should not be used in production!
assert(unsigned(version) <= 0 || version == 0xe);
assert(unsigned(version) <= 1);
meshopt::gEncodeVertexVersion = version;
}
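Reviewer note: decode-side counterpart as a sketch. Decoders accept versions 0 and 1 and reject anything newer, so an old library fails cleanly on v1 data it predates:

#include "meshoptimizer.h"

static bool decodeChecked(void* dst, size_t vertex_count, size_t vertex_size,
    const unsigned char* encoded, size_t encoded_size)
{
	// nonzero return: malformed data or a format version this build doesn't know
	return meshopt_decodeVertexBuffer(dst, vertex_count, vertex_size, encoded, encoded_size) == 0;
}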
@@ -1810,7 +1844,7 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
return -1;
int version = data_header & 0x0f;
if (version > 0 && version != 0xe)
if (version > 1)
return -1;
size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);